python自动化之web抓取

'''

从web抓取数据:

webbrowser:是python自带的,打开浏览器获取指定页面.

requests:从因特网上下载文件和网页.

Beautiful Soup:解析HTML,即网页编写的格式.

selenium:启动并控制一个Web浏览器.selenium能够填写表单,并模拟鼠标在这个浏览器中点击

'''

import webbrowser

webbrowser.open('http://inventwithpython.com/')

'''

利用requests模块从Web下载文件:

requests模块让你很容易从Web下载文件,不必担心一些复杂的问题,

诸如网络错误、连接问题和数据压缩

'''

###################################用requests.get()下载一个网页####################################

import requests

res=requests.get('http://www.gutenberg.org/cache/epub/1112/pg1112.txt')

type(res)

res.status_code=requests.codes.ok

'''

检查Response对象的status_code属性,等于requests.codes.ok时表示一切都好

(HTTP协议中"OK"的状态码是200,404状态码表示"没找到")

'''

len(res.text)

print(res.text[:250])

###################################检查下载错误####################################

import requests

res=requests.get('http://inventwithpython.com/page_that_does_not_exist')

res.raise_for_status() ####下载成功了,就什么也不做;下载出错,就抛出异常

##############################################

import requests

res=requests.get('http://inventwithpython.com/page_that_does_not_exist')

try:

res.raise_for_status()

except Exception as exc:

print('There was a problem:%s'%(exc))

###################################将下载的文件保存到硬盘####################################

import requests

res=requests.get('http://www.gutenberg.org/cache/epub/1112/pg1112.txt')

res.raise_for_status()

playFile=open(r'C:UsersAdministratorDesktopRomeoAndJuliet.txt','wb')

for chunk in res.iter_content(100000): ###每次循环迭代的字节数

print len(chunk)

playFile.write(chunk)

playFile.close()

###################################学习HTML的资源####################################

'''

HTML初学者指南:

http://htmldog.com/guides/html/beginner/

http://www.codecademy.com/tracks/web/

https://developer.mozilla.org/en-US/learn/html

'''

######HTML快速复习

<strong>Hello</strong>world! #####<strong>表明：标签包围的文本将使用粗体

Al''s free <a href="http://inventwithpython.com">Python books</a>

###################################查看网页的HTML源代码####################################

'''

在网页任意位置点击右键:选择View Source或View page source

查看该页的HTML文本

'''

在Windows版的Chrome和IE中,开发者工具已经安装了,可以按下F12,出现;

再次按下F12,可以让开发者工具消失

'''

不要用正则表达式来解析HTML:

尝试用正则表达式来捕捉HTML格式的变化,非常繁琐,容易出错

专门用于解析HTML的模块,诸如Beautiful Soup,将更不容易导致缺陷

http://stackoverflow.com/a/1732454/1893164/

'''

###################################使用开发者工具来寻找HTML元素####################################

'''

http://weather.gov/

邮政编码为94105

通过开发者工具,找到对应代码

'''

###################################从HTML创建一个BeautifulSoup对象####################################

import requests,bs4

res=requests.get('http://forecast.weather.gov/MapClick.php?lat=37.78833550000007&lon=-122.39552170000002#.WXazEmP9c_0')

res.raise_for_status()

noStarchSoup=bs4.BeautifulSoup(res.text)

type(noStarchSoup)

playFile=open(r'C:UsersAdministratorDesktop est.html','wb')

for chunk in res.iter_content(100000): ###每次循环迭代的字节数

print len(chunk)

playFile.write(chunk)

playFile.close()

###################################用select()方法寻找元素####################################

'''

传递给select()方法的选择器将匹配...

soup.select('div') 所有名为<div>的元素

soup.select('#author') 带有id属性为author的元素

soup.select('.notice') 所有使用CSS class属性名为notice的元素

soup.select('div span') 所有在<div>元素之内的<span>元素

soup.select('div > span') 所有直接在<div>元素之内的<span>元素,中间没有其他元素

soup.select('input[name]') 所有名为<input>,并有一个name属性,其值无所谓的元素

soup.select('input[type="button"]') 所有名为<input>,并有一个type属性,其值为button的元素

'''

</div>

</div>

import bs4

exampleFile=open(r'C:UsersAdministratorDesktop est.html')

exampleSoup=bs4.BeautifulSoup(exampleFile.read())

elems=exampleSoup.select('#current_conditions-summary')

type(elems)

len(elems)

type(elems[0])

elems[0].getText()

>>> elems[0].getText()

u' NA 59xb0F 15xb0C '

>>> str(elems[0])

'<div class="pull-left" id="current_conditions-summary"> <p class="myforecast-current">NA</p> <p class="myforecast-current-lrg">59xc2xb0F</p> <p class="myforecast-current-sm">15xc2xb0C</p> </div>'

>>> >>> elems[0].attrs

{'id': 'current_conditions-summary', 'class': ['pull-left']}

#########################

pElems=exampleSoup.select('p')

>>> pElems[1]

<p>Your local forecast office is</p>

>>> pElems[2]

<p>

Severe thunderstorms will be possible over portions of the upper Midwest and Great Lakes Tuesday, Wednesday, and Thursday. Damaging winds, large hail, and heavy rainfall possible. Over the Desert Southwest and portions of the Rockies, Monsoonal moisture will lead to locally heavy rainfall and the threat for flash flooding into midweek.

</p>

>>> pElems[1].getText()

u'Your local forecast office is'

###################################通过元素的属性获取数据####################################

import bs4

soup=bs4.BeautifulSoup(open(r'C:UsersAdministratorDesktop est.html'))

spanElem=soup.select('span')[0]

>>> str(spanElem)

'<span class="sr-only">Toggle navigation</span>'

>>> spanElem.get('class')

['sr-only']

>>> spanElem.attrs

{'class': ['sr-only']}

>>> spanElem.get('id')==None

True

###################################用selenium模块控制浏览器####################################

###################################启动selenium控制的浏览器####################################

#####下载:http://getfirefox.com/

from selenium import webdriver

browser=webdriver.Firefox()

type(browser)

browser.get('http://inventwithpython.com')

###################################maplt.py####################################

import webbrowser,sys,pyperclip

if len(sys.argv)>1:

###Get address from command line:

address=' '.join(sys.argv[1:])

else:

address=pyperclip.paste()

webbrowser.open('https://www.google.com/maps/place/'+address)