Scraping Baidu Tieba with requests + XPath + map

# Scraping Baidu Tieba with requests + XPath + map
# Target fields: reply username, reply content, reply time
# Breakdown:
# 1. requests fetches the page
# 2. XPath extracts the fields
# 3. map drives the multithreaded crawl
import requests
from requests.exceptions import RequestException
from lxml import etree
import json
import threading
from multiprocessing.dummy import Pool as ThreadPool

def get_html(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        else:
            return None
    except RequestException:
        return None

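# Note: Tieba may throttle clients that send no browser headers. A hardened
# fetch might look like the sketch below; the function name, User-Agent
# string, and timeout value are illustrative assumptions, not part of the
# original post.
def get_html_safe(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    try:
        # A timeout keeps a stalled connection from blocking a worker thread
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
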
def parse_html(html):
    selector = etree.HTML(html)
    # The two trailing spaces in the class value appear in Tieba's markup,
    # so the attribute string must match exactly
    data = selector.xpath('//div[@class="l_post j_l_post l_post_bright  "]')
    for each in data:
        # Each post div carries its metadata as JSON in the data-field attribute
        rs = json.loads(each.xpath('@data-field')[0])
        author = rs.get('author').get('user_name')
        post_id = rs.get('content').get('post_id')
        nodes = each.xpath('div/div/cc/div[@id="post_content_%s"]/text()' % post_id)
        # Image-only posts yield no text nodes, so guard against an empty list
        content = nodes[0].strip() if nodes else ''
        date = rs.get('content').get('date')
        yield {
            'author': author,
            'content': content,
            'date': date
        }

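# For reference, the data-field JSON is roughly of this shape. The keys
# below are the ones read in parse_html; the values are hypothetical, and
# the real attribute carries more fields:
#
#     {"author": {"user_name": "some_user"},
#      "content": {"post_id": 12345, "date": "2015-03-14 12:34"}}
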
# A single lock serializes file appends so that records written by
# different worker threads do not interleave
write_lock = threading.Lock()

def save_to_txt(result):
    print('Saving:', result)
    with write_lock:
        with open('tieba.txt', 'a', encoding='utf-8') as f:
            f.write('Reply author: ' + result['author'] + '\n')
            f.write('Reply content: ' + result['content'] + '\n')
            f.write('Reply time: ' + result['date'] + '\n')
            f.write('\n')

def main(url):
    html = get_html(url)
    if html:
        for result in parse_html(html):
            save_to_txt(result)

if __name__ == '__main__':
    pool = ThreadPool(4)  # 4 worker threads via multiprocessing.dummy
    urls = []
    base_url = 'http://tieba.baidu.com/p/3522395718?pn='
    for page_num in range(1, 21):
        urls.append(base_url + str(page_num))
    # map spreads the 20 page URLs across the worker threads and blocks
    # until every page has been fetched, parsed, and saved
    pool.map(main, urls)
    pool.close()
    pool.join()
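multiprocessing.dummy exposes the process-pool API backed by threads, which suits this I/O-bound crawl. The same fan-out can also be written with the standard-library concurrent.futures module; a minimal sketch under the same assumptions (4 workers, the urls list built above):

from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=4) as executor:
    # map submits one task per URL; leaving the block waits for them all
    executor.map(main, urls)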
Original post: https://www.cnblogs.com/themost/p/7081713.html