Python Web Scraping (Part 2)

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# author:Momo time:2018/6/30

"""
    Target site:    http://tieba.baidu.com/p/3522395718
    Target content: reply author, reply content, reply time
    Techniques involved:
                urllib.request to fetch the pages
                XPath to extract the content
                map to run the crawler on multiple threads
    Skills to master: extracting page content with XPath
                      building a multithreaded crawler with map
"""

from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import urllib.request
import json
# from imp import reload

# # Python 2 only: "save the markup copied from Tieba as UTF-8";
# # has no effect in Python 3, so it stays commented out.
# import sys
# reload(sys)
# sys.setdefaultencoding('utf-8')

def towrite(contentdict):
    # Append one reply record (time, content, author) to the shared output file.
    f.write(u'回帖时间:' + str(contentdict['topic_reply_time']) + '\n')
    f.write(u'回帖内容:' + contentdict['topic_reply_content'] + '\n')
    f.write(u'回帖人:' + str(contentdict['user_name']) + '\n\n')

def spider(url):
    # Download one page of the thread and extract every reply on it.
    html_page = urllib.request.urlopen(url)
    html_code = html_page.read().decode('utf-8')
    selector = etree.HTML(html_code)
    content_field = selector.xpath('//div[@class="l_post j_l_post l_post_bright  "]')
    item = {}
    for each in content_field:
        # The data-field attribute carries a JSON blob with author and date info.
        reply_info = json.loads(each.xpath('@data-field')[0].replace('&quot', ''))
        author = reply_info['author']['user_name']
        content = each.xpath('div[@class="d_post_content_main"]/div/cc/div[@class="d_post_content j_d_post_content  clearfix"]/text()')[0]
        reply_time = reply_info['content']['date']
        print(content)
        print(reply_time)
        print(author)
        item['user_name'] = author
        item['topic_reply_content'] = content
        item['topic_reply_time'] = reply_time
        towrite(item)

if __name__ == '__main__':
    pool = ThreadPool(4)                     # four worker threads
    f = open('content.txt', 'a', encoding='utf-8')
    page = []
    for i in range(1, 21):                   # pages 1 through 20 of the thread
        newpage = 'http://tieba.baidu.com/p/3522395718?pn=' + str(i)
        page.append(newpage)

    results = pool.map(spider, page)         # map the crawl across the pool
    pool.close()
    pool.join()
    f.close()
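
The XPath step above is the fragile part: it depends on Tieba's exact class names, which change over time. The underlying lxml technique is general, though: parse the page with etree.HTML, select nodes by class, read the JSON blob out of the data-field attribute, and take nested text nodes. A minimal, self-contained sketch of that pattern on an invented HTML snippet (the snippet and every value in it are made up for illustration):

from lxml import etree
import json

# Invented HTML standing in for a single Tieba reply, only to
# demonstrate the attribute-JSON + nested-text extraction pattern.
snippet = '''
<div class="l_post" data-field='{"author": {"user_name": "demo_user"},
                                 "content": {"date": "2018-06-30 12:00"}}'>
    <div class="d_post_content_main">
        <div class="d_post_content">hello from the thread</div>
    </div>
</div>
'''

root = etree.HTML(snippet)
for post in root.xpath('//div[@class="l_post"]'):
    info = json.loads(post.xpath('@data-field')[0])                  # attribute -> dict
    text = post.xpath('.//div[@class="d_post_content"]/text()')[0]   # nested text node
    print(info['author']['user_name'], info['content']['date'], text)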
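
The concurrency comes from multiprocessing.dummy, which exposes the process-Pool API backed by threads, so pool.map(spider, page) fans the page URLs out over four threads and blocks until all of them finish. One caveat in the script above: all worker threads write to the shared handle f, and towrite() issues three separate writes per reply, so records from different pages can interleave in content.txt. A minimal sketch of the same map pattern with a threading.Lock guarding the writes (the crawl helper and the content_demo.txt filename are placeholders, not part of the original script):

import threading
from multiprocessing.dummy import Pool as ThreadPool

lock = threading.Lock()
f = open('content_demo.txt', 'a', encoding='utf-8')

def crawl(url):
    # Stand-in for the real fetch/parse work done in spider().
    record = 'fetched: ' + url + '\n'
    with lock:                      # serialize writers so records never interleave
        f.write(record)

urls = ['http://tieba.baidu.com/p/3522395718?pn=' + str(i) for i in range(1, 5)]
pool = ThreadPool(4)
pool.map(crawl, urls)
pool.close()
pool.join()
f.close()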
Original article: https://www.cnblogs.com/momo072994MLIA/p/9249637.html