13 爬取豆瓣电影网电影信息

 1 """豆瓣电影爬虫"""
 2 
 3 
 4 import requests
 5 from lxml import etree
 6 
 7 # 1、将目标网站上的页面爬取出来
 8 headers = {
 9     'User-Agent': 'Mozilla/5.0',
10 }
11 
12 url = 'https://movie.douban.com/cinema/nowplaying/shangrao/'
13 #url = 'https://movie.douban.com/'
14 
15 response = requests.get(url, headers=headers)
16 text = response.text
17 #print(response.text)
18 with open('douban.html', 'w', encoding='utf-8') as fp:
19     fp.write(response.content.decode('utf-8'))
20 # response.text返回的是一个经过解码的字符串,是str(unicode)类型
21 # response.content返回的是一个原生的字符串是bytes类型,没有经过解码,respose.content.decode('utf-8')解码
22 
23 # 2、将数据根据一定的规则进行提取
24 movies = []
25 html = etree.HTML(text)
26 ul = html.xpath("//ul[@class='lists']")[0]
27 lis = ul.xpath("./li")
28 for li in lis:
29     title = li.xpath("@data-title")[0]
30     region = li.xpath("@data-region")[0]
31     director = li.xpath("@data-director")[0]
32     actors = li.xpath("@data-actors")[0]
33     duration = li.xpath("@data-duration")[0]
34     img = li.xpath(".//img/@src")[0]
35     release_date = li.xpath(".//li[@class='release-date']/text()")[0].strip()       # strip()去除两边空格
36     #print(release_date)
37 
38     movie = {
39         'title': title,
40         'region': region,
41         'director': director,
42         'actors': actors,
43         'duration': duration,
44         'img': img,
45         'date': release_date
46     }
47     movies.append(movie)
48 
49 for movie in movies:
50     print(movie)
原文地址:https://www.cnblogs.com/sruzzg/p/13082435.html