第十三节 电影天堂项目实战

 1 from lxml import etree
 2 import requests
 3 
 4 
 5 baseurl = 'https://www.dytt8.net'
 6 headers = {
 7     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
 8     'Referer': 'https://www.dytt8.net/html/gndy/dyzz/index.html'
 9 }
def agent(ur):
    """Download a page and return it parsed as an lxml HTML tree.

    The request is sent with the module-level ``headers``.

    Args:
        ur: URL of the page to fetch.

    Returns:
        The root element produced by ``etree.HTML`` from the response text.

    Raises:
        requests.RequestException: if the connection fails, the request
            times out, or the server answers with a 4xx/5xx status.
    """
    # Without a timeout, requests waits indefinitely on a stalled connection.
    resp = requests.get(ur, headers=headers, timeout=10)
    # Fail loudly instead of parsing an error page as if it were a listing.
    resp.raise_for_status()
    return etree.HTML(resp.text)
17 
def movie_url_list(html):
    """Return the ``href`` of every link found inside a ``tbspan`` table.

    Args:
        html: Parsed document exposing an ``xpath`` method (the caller
            passes the tree returned by ``agent``).

    Returns:
        Whatever the XPath query yields: the ``href`` attribute values of
        all ``<a>`` elements nested under ``<table class="tbspan">``.
    """
    link_query = "//table[@class='tbspan']//a/@href"
    hrefs = html.xpath(link_query)
    return hrefs
21 
def parse_info(info, rule):
    """Remove every occurrence of ``rule`` from ``info`` and trim the result.

    Args:
        info: Raw text, e.g. a labelled line taken from a detail page.
        rule: Substring to delete (typically the label itself).

    Returns:
        ``info`` with all occurrences of ``rule`` removed and leading and
        trailing whitespace stripped.
    """
    without_rule = info.replace(rule, '')
    cleaned = without_rule.strip()
    return cleaned
24 
# (label as it appears on the page, key used in the result dict)
# NOTE(review): the padding inside labels such as "◎片  名" must match the
# page text exactly; confirm whether the site uses ASCII or full-width
# (U+3000) spaces there.
_FIELD_LABELS = (
    ("◎片  名", 'pianming'),
    ("◎年  代", 'niandai'),
    ("◎产  地", 'chandi'),
    ("◎类  别", 'leixing'),
    ("◎上映日期", 'shangyingshijian'),
    ("◎豆瓣评分", 'doubanpingfen'),
    ("◎片  长", 'pianchang'),
    ("◎标  签", 'biaoqian'),
)


def _extract_fields(text_nodes):
    """Build the movie dict from text nodes that start with a known label."""
    movie = {}
    for node in text_nodes:
        for label, key in _FIELD_LABELS:
            if node.startswith(label):
                movie[key] = parse_info(node, label)
                # Stop at the first matching label so the stripped value is
                # never re-tested against the remaining labels.
                break
    return movie


def xiangqingye(url):
    """Fetch one movie detail page and extract its labelled metadata.

    The page is requested with the module-level ``headers``, decoded as GBK,
    and every text node under ``<div id="Zoom">`` is compared against the
    known "◎..." labels.

    Args:
        url: URL of the detail page.

    Returns:
        dict mapping field keys (``pianming``, ``niandai``, ``chandi``,
        ``leixing``, ``shangyingshijian``, ``doubanpingfen``, ``pianchang``,
        ``biaoqian``) to the text that follows the matching label, with the
        label removed and surrounding whitespace stripped. Fields whose label
        does not occur on the page are omitted; if a label occurs more than
        once, the last occurrence wins.

    Raises:
        requests.RequestException: if the connection fails or times out.
    """
    # Without a timeout, requests waits indefinitely on a stalled connection.
    resp = requests.get(url, headers=headers, timeout=10)
    # errors='replace' keeps one invalid byte from aborting the whole page
    # with UnicodeDecodeError; bad bytes show up as U+FFFD instead.
    text = resp.content.decode('gbk', errors='replace')
    html = etree.HTML(text)
    text_nodes = html.xpath('//div[@id="Zoom"]//text()')
    return _extract_fields(text_nodes)
57 
def alldata(first_page=1, last_page=1):
    """Crawl listing pages and return the metadata of every linked movie.

    For each listing page in the inclusive range ``first_page`` to
    ``last_page``, the page is fetched with ``agent``, its movie links are
    collected with ``movie_url_list``, and each link is resolved against
    ``baseurl`` and scraped with ``xiangqingye``.

    Args:
        first_page: Number of the first listing page to crawl (default 1).
        last_page: Number of the last listing page to crawl, inclusive
            (default 1, i.e. only the first page is crawled).

    Returns:
        list of dicts, one per detail page, in the order they were visited.
    """
    # Local import so this block brings its own dependency into scope.
    from urllib.parse import urljoin

    list_url_prefix = 'https://www.dytt8.net/html/gndy/dyzz/list_23_'
    list_url_suffix = '.html'
    movies = []
    for page in range(first_page, last_page + 1):
        listing = agent(list_url_prefix + str(page) + list_url_suffix)
        for href in movie_url_list(listing):
            # urljoin yields the same URL as concatenation for "/path"
            # hrefs, and also copes with absolute or slash-less hrefs.
            movies.append(xiangqingye(urljoin(baseurl, href)))
    return movies
if __name__ == '__main__':
    # Script entry point: run the crawl and print the collected records.
    scraped_movies = alldata()
    print(scraped_movies)
原文地址:https://www.cnblogs.com/kogmaw/p/12506974.html