多进程实例——爬取百度贴吧

上次介绍了多进程并发相关内容,本次以爬取百度贴吧为例,进行实战演示。

爬取的网址:http://tieba.baidu.com/p/3522395718

本次爬取每层楼的发帖人、发帖内容和发帖时间。

闲话不说,直接上代码:

 1 # -*- coding: utf-8 -*-
 2 import requests
 3 from bs4 import BeautifulSoup
 4 import json
 5 from multiprocessing import Pool
 6 import time
 7 
 8 urls = ['http://tieba.baidu.com/p/3522395718?pn={}'.format(i) for i in range(1,20)]  # one URL per thread page, pn=1..19
 9 
def return_infos(url):
    """Fetch one thread page and print the author, date and text of each post.

    Args:
        url: Address of a single thread page to download and parse.

    Returns:
        None. One line per post is written to stdout as ``name date content``.

    Raises:
        requests.RequestException: If the page cannot be downloaded or the
            request times out.
    """
    # Without a timeout a stalled connection would block this worker forever.
    html = requests.get(url, timeout=10)
    soup = BeautifulSoup(html.content, 'lxml')
    items = soup.select('div.p_postlist div.l_post.j_l_post.l_post_bright')
    contents = soup.select('div.p_postlist div.d_post_content.j_d_post_content.clearfix')
    names = soup.select('div.p_postlist li.d_name a')
    # zip() stops at the shortest of the three lists, so only positions
    # present in all of them are reported.
    for i, j, k in zip(items, contents, names):
        raw_field = i.get('data-field')
        if not raw_field:
            # .get() returns None when the attribute is absent; json.loads
            # would raise TypeError on it, so skip such elements.
            continue
        item = json.loads(raw_field)
        date = item['content']['date']
        content = j.text.strip()
        name = k.text.strip()
        print(name, date, content)
22 
if __name__ == '__main__':
    # Baseline: process every page one after another in this process.
    t0 = time.time()
    for ix in urls:
        return_infos(ix)
    t1 = time.time()

    # Same work distributed over 4 worker processes. The with-block makes
    # sure the workers are torn down even if map() raises.
    with Pool(4) as pool:
        pool.map(return_infos, urls)
        pool.close()  # stop accepting new tasks
        pool.join()  # block until all worker processes have exited
    t2 = time.time()
    print("正常执行的时间:", (t1 - t0))
    print("并行执行时间:", (t2 - t1))
 1 # -*- coding: utf-8 -*-
 2 import requests
 3 from bs4 import BeautifulSoup
 4 import json
 5 from multiprocessing import Pool
 6 import time
 7 
 8 urls = ['http://tieba.baidu.com/p/3522395718?pn={}'.format(i) for i in range(1,20)]  # one URL per thread page, pn=1..19
 9 
def return_infos(url):
    """Fetch one thread page and print the author, date and text of each post.

    Args:
        url: Address of a single thread page to download and parse.

    Returns:
        None. One line per post is written to stdout as ``name date content``.

    Raises:
        requests.RequestException: If the page cannot be downloaded or the
            request times out.
    """
    # Without a timeout a stalled connection would block this worker forever.
    html = requests.get(url, timeout=10)
    soup = BeautifulSoup(html.content, 'lxml')
    items = soup.select('div.p_postlist div.l_post.j_l_post.l_post_bright')
    contents = soup.select('div.p_postlist div.d_post_content.j_d_post_content.clearfix')
    names = soup.select('div.p_postlist li.d_name a')
    # zip() stops at the shortest of the three lists, so only positions
    # present in all of them are reported.
    for i, j, k in zip(items, contents, names):
        raw_field = i.get('data-field')
        if not raw_field:
            # .get() returns None when the attribute is absent; json.loads
            # would raise TypeError on it, so skip such elements.
            continue
        item = json.loads(raw_field)
        date = item['content']['date']
        content = j.text.strip()
        name = k.text.strip()
        print(name, date, content)
22 
if __name__ == '__main__':
    # Baseline: process every page one after another in this process.
    t0 = time.time()
    for ix in urls:
        return_infos(ix)
    t1 = time.time()

    # Same work distributed over 4 worker processes. The with-block makes
    # sure the workers are torn down even if map() raises.
    with Pool(4) as pool:
        pool.map(return_infos, urls)
        pool.close()  # stop accepting new tasks
        pool.join()  # block until all worker processes have exited
    t2 = time.time()
    print("正常执行的时间:", (t1 - t0))
    print("并行执行时间:", (t2 - t1))

爬取结果:

1 正常执行的时间: 16.037917375564575
2 并行执行时间: 6.655380487442017
原文地址:https://www.cnblogs.com/freeman818/p/7197600.html