初识python 之 爬虫:爬取豆瓣电影最热评论

主要用到 requests 获取网页,lxml 的 etree 解析网页代码,再用 XPath 定位 HTML 标签并提取内容。

代码如下:

 1 #!/usr/bin/env python
 2 # author:Simple-Sir
 3 # time:2019/7/17 22:08
 4 # 获取豆瓣网正在上映电影最热评论
 5 import requests
 6 from lxml import etree
 7 
 8 # 伪装浏览器
 9 headers ={
10     'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
11     'Referer':'https://movie.douban.com/'
12 }
13 # 获取首页网页信息并解析
14 url = 'https://movie.douban.com/cinema/nowplaying/chengdu/'
15 
def getUrlText(url, timeout=10):
    """Download *url* and return the page parsed by lxml.

    The request is sent with the module-level ``headers``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled connection would block the crawl forever.

    Returns:
        The result of ``etree.HTML`` applied to the response text, ready
        for XPath queries.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    respons = requests.get(url, headers=headers, timeout=timeout)
    # The HTTP status is deliberately not checked here: callers treat a page
    # without the expected elements as "nothing found".
    return etree.HTML(respons.text)
21 
# Extract the movie titles and their detail-page links.
def getWallUrl(url):
    """Collect title and detail link for each movie on a listing page.

    Only the first ``<ul class="lists">`` element of the page is scanned.

    Args:
        url: Address of the listing page.

    Returns:
        A list of one-entry dicts ``{title: detail_url}``, one per ``<li>``
        that has both a ``data-title`` attribute and at least one ``href``.
        The list is empty when the page contains no such ``<ul>``.
    """
    page = getUrlText(url)
    movieLists = page.xpath('//ul[@class="lists"]')
    if not movieLists:
        # Page layout differs from what is expected; nothing to report.
        return []

    liHrefs = []
    for li in movieLists[0].xpath('./li'):
        hrefs = li.xpath('.//@href')
        titles = li.xpath('@data-title')
        if not hrefs or not titles:
            # Skip items that are not complete movie entries instead of
            # failing on the missing attribute.
            continue
        liHrefs.append({titles[0]: hrefs[0]})
    return liHrefs
36 
# Visit each movie's detail page and save its hottest comment.
def downPL(url):
    """Append the hottest comment of every listed movie to ``豆瓣评论.txt``.

    For each movie returned by ``getWallUrl(url)`` the detail page is
    downloaded and the first text node of the hot-comments section is
    written to the file. The expanded ("full") text is preferred, with the
    short text as fallback. Movies without any hot comment get a
    placeholder line.

    Args:
        url: Address of the listing page to crawl.

    Returns:
        None. Progress and a final summary are printed to stdout.
    """
    moveUrl = getWallUrl(url)
    n = 0
    for n, murl in enumerate(moveUrl, start=1):
        # Each entry is a one-item dict {title: detail_url}.
        for d, detailUrl in murl.items():
            plHtml = getUrlText(detailUrl)
            plTextFull = plHtml.xpath('//div[@id="hot-comments"]//span[@class="hide-item full"]//text()')
            plTextShort = plHtml.xpath('//div[@id="hot-comments"]//span[@class="short"]//text()')

            if plTextFull:
                # Use the full text when present. Writing plTextShort[0]
                # here would save the wrong text, or raise IndexError when
                # the short span is absent.
                entry = '{}、《{}》的最热评论是:\n{}\n\n'.format(n, d, plTextFull[0])
            elif plTextShort:
                entry = '{}、《{}》的最热评论是:\n{}\n\n'.format(n, d, plTextShort[0])
            else:
                entry = '{}、《{}》暂无评论!\n\n'.format(n, d)

            print('正在写入《{}》的评论。'.format(d))
            with open('豆瓣评论.txt', 'a+', encoding='utf-8') as fp:
                fp.write(entry)
    print('{}部电影的所有评论已全部写入“豆瓣评论.txt”,请查看。'.format(n))
60 
if __name__ == '__main__':
    # Run the crawl only when executed as a script, not on import.
    downPL(url)
获取豆瓣网正在上映电影最热评论

执行效果:

 文件详情:

世风之狡诈多端,到底忠厚人颠扑不破; 末俗以繁华相尚,终觉冷淡处趣味弥长。
原文地址:https://www.cnblogs.com/simple-li/p/11219586.html