Python 爬虫:豆瓣电影Top250,包括电影导演、类型、年份、主演

结果输出到文本文件中。

 1 import codecs
 2 import requests
 3 from bs4 import BeautifulSoup
 4 
 5 headers={'User-Agent': 'Mozilla/5.0'}
 6 index_url = 'https://movie.douban.com/top250'
 7 
 8 def get_html(url):
 9     html = requests.get(url, headers=headers).text
10     return html
11 
def create_list(html):
    """Extract movie names, info text and the next-page URL from one page.

    Args:
        html: HTML source of a listing page.

    Returns:
        A ``(movie_names, movie_info, next_url)`` tuple. ``movie_names``
        holds the text of the first ``span.title`` inside each ``div.hd``;
        ``movie_info`` holds the text of the first ``<p>`` inside each
        ``div.info`` with ASCII spaces removed. ``next_url`` is
        ``index_url`` plus the ``href`` of the link inside ``span.next``,
        or ``None`` when the page has no such link.
    """
    soup = BeautifulSoup(html, 'lxml')

    movie_names = [
        block.find('span', 'title').get_text()
        for block in soup.find_all('div', 'hd')
    ]
    movie_info = [
        block.find('p').get_text().replace(' ', '')
        for block in soup.find_all('div', 'info')
    ]

    # find() returns None when nothing matches, so check the span before
    # looking for the anchor inside it; a page without pagination markup
    # then ends the crawl instead of raising AttributeError.
    next_span = soup.find('span', 'next')
    next_link = next_span.find('a') if next_span is not None else None
    if next_link is None:
        return movie_names, movie_info, None
    return movie_names, movie_info, index_url + next_link['href']
27 
def main():
    """Crawl the listing pages and write the ranked movies to ``top250.txt``.

    Starting at ``index_url``, each page is downloaded and parsed, and the
    next-page URL returned by ``create_list`` is followed until it is
    ``None``. For every movie two lines are written: ``Top <rank> <name>``
    and the movie's info text. The output file is UTF-8 encoded.
    """
    order = 1
    url = index_url
    with codecs.open('top250.txt', 'wb', encoding='utf-8') as f:
        while url:
            html = get_html(url)
            names, info, url = create_list(html)
            # Pair the two lists instead of assuming exactly 25 entries per
            # page, so a shorter page cannot raise IndexError.
            for name, detail in zip(names, info):
                f.write('Top ' + str(order) + ' ' + name + '\n')
                f.write(detail + '\n')
                order += 1
39 
# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
原文地址:https://www.cnblogs.com/deepcho/p/douban-movie-top250-spider.html