Python Web Scraping (Part 2)

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# author:Momo time:2018/6/30

"""
    Target site:    http://tieba.baidu.com/p/3522395718
    Target content: reply author, reply content, reply time
    Techniques involved:
                urllib.request to fetch the pages
                XPath to extract the content
                map to run the crawler on multiple threads
    Skills to master: extracting page content with XPath
                      building a multithreaded crawler with map
"""

from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import urllib.request
import json
# from imp import reload

# # Python 2 only: "save the markup copied from Tieba as UTF-8";
# # has no effect in Python 3, so it stays commented out.
# import sys
# reload(sys)
# sys.setdefaultencoding('utf-8')

def towrite(contentdict):
    # Append one reply record (time, content, author) to the shared output file.
    f.write(u'回帖时间:' + str(contentdict['topic_reply_time']) + '\n')
    f.write(u'回帖内容:' + contentdict['topic_reply_content'] + '\n')
    f.write(u'回帖人:' + str(contentdict['user_name']) + '\n\n')

def spider(url):
    # Download one page of the thread and extract every reply on it.
    html_page = urllib.request.urlopen(url)
    html_code = html_page.read().decode('utf-8')
    selector = etree.HTML(html_code)
    content_field = selector.xpath('//div[@class="l_post j_l_post l_post_bright  "]')
    item = {}
    for each in content_field:
        # The data-field attribute carries a JSON blob with author and date info.
        reply_info = json.loads(each.xpath('@data-field')[0].replace('&quot', ''))
        author = reply_info['author']['user_name']
        content = each.xpath('div[@class="d_post_content_main"]/div/cc/div[@class="d_post_content j_d_post_content  clearfix"]/text()')[0]
        reply_time = reply_info['content']['date']
        print(content)
        print(reply_time)
        print(author)
        item['user_name'] = author
        item['topic_reply_content'] = content
        item['topic_reply_time'] = reply_time
        towrite(item)

if __name__ == '__main__':
    pool = ThreadPool(4)                     # four worker threads
    f = open('content.txt', 'a', encoding='utf-8')
    page = []
    for i in range(1, 21):                   # pages 1 through 20 of the thread
        newpage = 'http://tieba.baidu.com/p/3522395718?pn=' + str(i)
        page.append(newpage)

    results = pool.map(spider, page)         # map the crawl across the pool
    pool.close()
    pool.join()
    f.close()
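
The XPath step above is the fragile part: it depends on Tieba's exact class names, which change over time. The underlying lxml technique is general, though: parse the page with etree.HTML, select nodes by class, read the JSON blob out of the data-field attribute, and take nested text nodes. A minimal, self-contained sketch of that pattern on an invented HTML snippet (the snippet and every value in it are made up for illustration):

from lxml import etree
import json

# Invented HTML standing in for a single Tieba reply, only to
# demonstrate the attribute-JSON + nested-text extraction pattern.
snippet = '''
<div class="l_post" data-field='{"author": {"user_name": "demo_user"},
                                 "content": {"date": "2018-06-30 12:00"}}'>
    <div class="d_post_content_main">
        <div class="d_post_content">hello from the thread</div>
    </div>
</div>
'''

root = etree.HTML(snippet)
for post in root.xpath('//div[@class="l_post"]'):
    info = json.loads(post.xpath('@data-field')[0])                  # attribute -> dict
    text = post.xpath('.//div[@class="d_post_content"]/text()')[0]   # nested text node
    print(info['author']['user_name'], info['content']['date'], text)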
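
The concurrency comes from multiprocessing.dummy, which exposes the process-Pool API backed by threads, so pool.map(spider, page) fans the page URLs out over four threads and blocks until all of them finish. One caveat in the script above: all worker threads write to the shared handle f, and towrite() issues three separate writes per reply, so records from different pages can interleave in content.txt. A minimal sketch of the same map pattern with a threading.Lock guarding the writes (the crawl helper and the content_demo.txt filename are placeholders, not part of the original script):

import threading
from multiprocessing.dummy import Pool as ThreadPool

lock = threading.Lock()
f = open('content_demo.txt', 'a', encoding='utf-8')

def crawl(url):
    # Stand-in for the real fetch/parse work done in spider().
    record = 'fetched: ' + url + '\n'
    with lock:                      # serialize writers so records never interleave
        f.write(record)

urls = ['http://tieba.baidu.com/p/3522395718?pn=' + str(i) for i in range(1, 5)]
pool = ThreadPool(4)
pool.map(crawl, urls)
pool.close()
pool.join()
f.close()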
Original article: https://www.cnblogs.com/momo072994MLIA/p/9249637.html