Speeding up a crawler with a multiprocessing Pool (a process pool, despite the common "thread pool" mislabel)

# -*- coding: utf-8 -*-

from multiprocessing import Pool
from lxml import etree
import requests
import os
import time

url = 'https://bing.ioliu.cn/'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}

# One requests Session object, so TCP connections are reused across requests
requestsSession = requests.Session()
# The gallery has 145 pages; Pool.map consumes this generator of page numbers
imgPage = (x for x in range(1, 146))

def getImg(imgPage):
    page = imgPage
    # The page number goes into the query string: https://bing.ioliu.cn/?p=<page>
    params = {
        "p": page
    }
    # Request the listing page
    responseHtml = requestsSession.get(url=url, headers=headers, params=params, timeout=3).text
    # Build an etree instance and pull the img src attributes with an XPath expression
    xpathObj = etree.HTML(responseHtml)
    imgSrc = xpathObj.xpath('//div[@class="container"]//div[@class="card progressive"]/img/@src')
    page = str(page)
    # One directory per page; makedirs also creates ./Img itself if it is missing
    if not os.path.exists('./Img/' + page):
        os.makedirs('./Img/' + page)
    # Fetch and store every image on the page
    for i in imgSrc:
        # The last path segment already carries its .jpg extension once the
        # query string is stripped, so no extra extension is appended
        imgName = i.split('/')[-1].split('?')[0]
        imgData = requestsSession.get(url=i, headers=headers).content
        imgPath = './Img/' + page + '/' + imgName
        with open(imgPath, 'wb') as dp:
            dp.write(imgData)
            print(imgName, 'downloaded successfully!')
    # Wait 3s between pages to avoid getting banned
    time.sleep(3)
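
# Note: an exception raised in a worker (e.g. a requests timeout) is re-raised
# by Pool.map in the parent and aborts the whole batch. A minimal wrapper that
# logs and skips failed pages instead (a sketch reusing the getImg above):
def getImgSafe(imgPage):
    try:
        getImg(imgPage)
    except requests.RequestException as e:
        print('page', imgPage, 'failed:', e)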

if __name__ == "__main__":
    # Hand map a function and an iterable: each of the 10 worker processes
    # pulls one page number at a time and runs getImg on it
    with Pool(10) as p:
        p.map(getImg, imgPage)
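
Since the work here is I/O-bound (HTTP requests and disk writes), a thread pool serves just as well as a process pool while skipping the cost of spawning interpreters and pickling arguments. multiprocessing.dummy exposes the same Pool API backed by threads, so it is a drop-in swap; a minimal sketch, assuming the getImg defined above:

from multiprocessing.dummy import Pool as ThreadPool

if __name__ == "__main__":
    # Same map interface, but the workers are threads in a single process,
    # so they genuinely share the one requestsSession defined above
    with ThreadPool(10) as p:
        p.map(getImg, range(1, 146))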
 
Original post: https://www.cnblogs.com/zy09/p/14102978.html