python3.5爬虫基础urllib结合beautifulsoup实例

BeautifulSoup模块可以解析HTML文档，在提取页面内容时可以代替re模块的正则表达式匹配

小例子1：用BeautifulSoup爬取淘宝首页所有链接（a标签）中的文字

 1 from bs4 import BeautifulSoup
 2 def tecent(url):
 3     response=urllib.request.urlopen(url)
 4     html=response.read()
 5     data=html.decode("utf-8")  #转换编码，默认转换为utf-8
 6     soup=BeautifulSoup(data,"html5lib")
 7     for list in soup.find_all("a"):
 8         if list.string==None:
 9             continue
10         else:
11             print(type(list.string))
12             print(list.string)   #暂时无法将NavigableString类型进行转换，此例子暂时在控制台输出
13             # with open("taobao1.txt","ab") as f:
14             #     f.write(list.string)
15 
16 if __name__=="__main__":
17     url="https://www.taobao.com/"
18     tecent(url)

小例子2：用Beautiful soup编写一个抓取妹子图页面图片的代码

 1 from bs4 import BeautifulSoup
 2 def taonvlang(url):
 3     res=urllib.request.urlopen(url).read()
 4     data=res.decode()
 5     soup=BeautifulSoup(data,"html5lib")   #将html代码用Bs进行处理
 6     path="G:/taonvlang/"
 7     if not os.path.isdir(path):    #如果不存在该路径，则创建路径
 8         os.makedirs(path)
 9     count=1   #用于给图片编号
10     for list in soup.find_all("img"):      #获取img的所有内容
11         print(list)   #img标签的所有内容
12         dict=list.attrs    #将该字段转换为字典
13         print(dict)
14         if "src" in dict:
15             image=dict["src"]    #取图片地址
16             # print(image)
17             img=image[image.rfind(".")::]    #取出文件扩展名
18             # print(img)
19             image_path=str(count).zfill(5)+img
20             filepath=os.path.join(path,image_path)
21             with open(filepath,"wb") as f:
22                 image_data=urllib.request.urlopen(dict["src"]).read()
23                 f.write(image_data)
24             count+=1
25 
26 if __name__=="__main__":
27     url="http://www.mzitu.com/all"
28     taonvlang(url)