初见网络爬虫

 1 #beautifulSoup
 2 from urllib.request import urlopen
 3 from bs4 import BeautifulSoup
 4 html = urlopen("http://www.pythonscraping.com/pages/page1.html")
 5 bsObj = BeautifulSoup(html,'html.parser')
 6 print(bsObj.h1)
 7 
 8 #处理异常
 9 html = urlopen("http://www.pythonscraping.com/pages/page1.html")
10 
11 #可能会发生两种异常
12 #1.网页在服务器上不存在
13 #2.服务器不存在
14 
15 #可以用下方式处理处理这种异常
16 
17 try:
18     html = urlopen("http://www.pythonscraping.com/pages/page1.html")
19     if html is None:
20         print("URL is not found")
21     else:
22         #程序继续
23 except HTTPError as e:
24     print(e)
25     #返回空值,中断程序,或者执行另一个方案
26 else:
27     #程序继续。注意:如果你已经在上面异常捕捉那一段代码里返回或中断
28     #那么就不需要使用else语句了,这段代码也不会执行
29 
30 
32 from urllib.request import urlopen
33 from urllib.error import HTTPError
34 from bs4 import BeautifulSoup
def getTitle(url):
    """Fetch *url* and return its <body><h1> tag, or None on any failure.

    Failure cases handled (returning None instead of raising):
      - the page does not exist on the server (HTTPError)
      - the server does not exist / cannot be reached (URLError)
      - the fetched document has no <body> or no <h1> (AttributeError)
    """
    try:
        html = urlopen(url)
    except OSError:
        # HTTPError and URLError are both OSError subclasses, so this
        # covers "page missing" as well as "server unreachable" — the
        # two failure modes the tutorial text describes.  The original
        # only caught HTTPError, so an unreachable server crashed here.
        return None
    try:
        bsObj = BeautifulSoup(html, 'html.parser')
        title = bsObj.body.h1
    except AttributeError:
        # bsObj.body is None (no <body>), so .h1 access fails.
        return None
    return title
# Top-level driver: fetch the demo page and report its title.
title = getTitle("http://www.pythonscraping.com/pages/page1.html")
if title is None:  # compare to None with "is", never "==" (PEP 8)
    print("Title could not be found")
else:
    print(title)
原文地址:https://www.cnblogs.com/geeker-xjl/p/11078057.html