# Asynchronous crawling with a thread pool
import requests
from lxml import etree
# Template for the listing pages; pages 1..10 are crawled.
url = 'https://www.qiushibaike.com/text/page/%d/'
# Build all ten page URLs up front so they can be handed to the pool at once.
urls = [url % page for page in range(1, 11)]

# Browser-like UA header so the site serves the normal HTML page.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
}
# NOTE: pool.map passes exactly one argument, so this function must take
# a single parameter.
def get_request(url):
    """Fetch one listing page and collect the author names found on it.

    :param url: URL of a single listing page
    :return: list of xpath result lists (one list per post div); the caller
        flattens this nesting itself
    """
    page_text = requests.get(url, headers=headers).text
    tree = etree.HTML(page_text)
    posts = tree.xpath('//*[@id="content-left"]/div')
    # Each per-post xpath call returns a list of text nodes; collect the
    # lists as-is (the caller performs the flattening).
    return [
        post.xpath('./div[1]/a[2]/h2/text() | ./div[1]/span[2]/h2/text()')
        for post in posts
    ]
# multiprocessing.dummy gives a thread pool with the Pool API — suitable
# here because the work is blocking network I/O, not CPU-bound.
from multiprocessing.dummy import Pool

pool = Pool(10)
# map takes a one-argument callable plus an iterable of inputs and returns
# the per-page results in input order.
response_test_list = pool.map(get_request, urls)
# Flatten two levels (pages -> posts -> author text nodes) and print the
# resulting flat list of author names.
print([name for page in response_test_list for names in page for name in names])