多进程爬取页面图片

详细实现见下面的代码,代码中有相应的注释:

import os
import random
import re
from multiprocessing import Pool

import requests


class dImg():
    """Download the images found on Pixabay search-result pages.

    The search term and the number of result pages are read from stdin by
    ``run`` (sequential download) or ``processRun`` (one worker process per
    page). Every image is written to ``IMAGE_DIR`` under a random numeric
    file name, and ``count`` holds the number of images saved so far.
    """

    # Directory the downloaded images are written to.
    IMAGE_DIR = './image'
    # Seconds to wait on a single HTTP request before giving up, so that a
    # stalled connection cannot block a download forever.
    REQUEST_TIMEOUT = 30
    # Captures (image URL starting with "http", title) for each result item.
    IMAGE_PATTERN = re.compile(
        '<div class="item".*?<img.*?src="(http.*?)".*?title="(.*?)">', re.S)

    def __init__(self):
        self.count = 0

    def run(self):
        """Prompt for the search parameters and download page by page."""
        self._askUser()
        self.download()

    def processRun(self):
        """Prompt for the search parameters and download with a process pool."""
        self._askUser()
        self.multiDownPage()

    def _askUser(self):
        """Read the search term and the page count from stdin."""
        self.userSearch = input('请输入你想要下载的图片:')
        self.userNum = int(input('你想要下载多少页面图片:'))

    def download(self):
        """Download ``userNum`` result pages one after another."""
        for page in range(self.userNum):
            self.downloadPage(page)

    def downloadPage(self, item):
        """Download every image listed on one search-result page.

        Args:
            item: Zero-based page index; the site is queried for page
                ``item + 1``.

        Returns:
            The number of images saved from this page. ``count`` is
            increased by the same amount on this instance.
        """
        pageNo = item + 1
        try:
            # Passing the query through ``params`` lets requests escape the
            # search term instead of pasting it raw into the URL.
            res = requests.get(
                'https://pixabay.com/zh/photos/',
                params={'q': self.userSearch, 'pagi': pageNo},
                timeout=self.REQUEST_TIMEOUT,
            )
        except requests.RequestException as exc:
            print('第%s页请求失败:%s' % (pageNo, exc))
            return 0

        # The target directory must exist before any file can be opened in it.
        os.makedirs(self.IMAGE_DIR, exist_ok=True)

        saved = 0
        for imgUrl, _title in self.IMAGE_PATTERN.findall(res.text):
            try:
                imgContent = requests.get(imgUrl, timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException as exc:
                print('图片下载失败:%s (%s)' % (imgUrl, exc))
                continue
            if not imgContent.ok:
                # Do not store an HTTP error body as if it were an image.
                print('图片下载失败:%s (%s)' % (imgUrl, imgContent.status_code))
                continue
            # The file name is a random number; the image bytes are written
            # unchanged, hence binary mode.
            ranNum = random.randint(0, 100000000)
            with open('{}/{}.jpg'.format(self.IMAGE_DIR, ranNum), 'wb') as f:
                f.write(imgContent.content)
            print('已下载完:%s图' % ranNum)
            self.count = self.count + 1
            saved += 1
        return saved

    def multiDownPage(self):
        """Download ``userNum`` result pages using one worker process each.

        Each worker runs ``downloadPage`` on its own copy of this object, so
        the per-page totals are returned to the parent and added to
        ``count`` here. A page whose worker raised is reported instead of
        being dropped silently.
        """
        # Pool() rejects a process count below 1, so skip it when there is
        # nothing to download.
        if self.userNum >= 1:
            with Pool(self.userNum) as pool:
                # Queue one non-blocking task per page; args must be a tuple.
                pending = [
                    pool.apply_async(self.downloadPage, args=(page,))
                    for page in range(self.userNum)
                ]
                pool.close()
                # Block until every worker process has finished.
                pool.join()
                for pageNo, task in enumerate(pending, start=1):
                    try:
                        self.count += task.get()
                    except Exception as exc:
                        print('第%s页下载失败:%s' % (pageNo, exc))
        print('所有图片下载完毕')


if __name__ == '__main__':
    # Build the downloader and start the process-pool variant; calling
    # run() instead would download the pages one after another.
    downloader = dImg()
    downloader.processRun()
原文地址:https://www.cnblogs.com/Dark-fire-liehuo/p/9757130.html