王者荣耀官方壁纸爬取

任务目标:下载王者荣耀官方壁纸页面的所有页面所有规格的壁纸到指定总文件夹中,每种壁纸都有一个该壁纸名称的文件夹,该文件夹中下载的是所有规格的该壁纸

王者荣耀官方壁纸网页url为 https://pvp.qq.com/web201605/wallpaper.shtml
经查看网页结构,发现网页源代码中并没有壁纸相关信息,因此壁纸数据是动态加载的。打开chrome开发者工具,Network选项,刷新页面,找到对应发送壁纸数据的url为https://apps.game.qq.com/cgi-bin/ams/module/ishow/V1.0/query/workList_inc.cgi?activityId=2735&sVerifyCode=ABCD&sDataType=JSON&iListNum=20&totalpage=0&page=0&iOrder=0&iSortNumClose=1&jsoncallback=jQuery171041143228271859056_1605079993513&iAMSActivityId=51991&everyRead=true&iTypeId=2&iFlowId=267733&iActId=2735&iModuleId=2735&_=1605079993712

经测试发现,url中的page=0的属性展示的是第一页的数据,更改page的值即可获取其他页面的数据。
由于响应数据是jQuery+一串数字开头,因此可以去掉url中的jsoncallback=jQuery171041143228271859056_1605079993513参数,这样就可以返回标准格式的json数据。

查看该json数据得知,壁纸数据信息在List下,每页20个,其中的sProdImgNo_1-sProdImgNo_8属性值,表示八种不同规格的壁纸url,但该url为加密过的url,通过urllib库的parse模块的unquote方法可以获取解密后的url,解密后url后缀为200,改为0后即可获取正确规格的壁纸。

通过常规requests方法直接获取数据是单线程的,图片需下完一张才可以下另一张,效率太低,为此,可以使用生产者消费者模式来实现多线程爬取,参考代码如下:

# 爬取王者荣耀官方页面壁纸
import os
import re
import threading
from queue import Empty, Queue
from time import time
from urllib import parse

import requests

# Request headers: a desktop-Chrome User-Agent so the API serves us like a browser.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like'
        ' Gecko) Chrome/81.0.4044.129 Safari/537.36'
    ),
}


class Producer(threading.Thread):
    """Producer thread: fetch one page of wallpaper metadata at a time and
    enqueue per-image download jobs for the Consumer threads.

    Terminates when the page queue is drained.
    """

    def __init__(self, page_queue, image_queue, *args, **kwargs):
        """
        :param page_queue: Queue of API page URLs to fetch.
        :param image_queue: Queue receiving dicts of
            {'image_url': str, 'image_path': str} for consumers.
        """
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.image_queue = image_queue

    def run(self):
        while not self.page_queue.empty():
            url = self.page_queue.get()
            response = requests.get(url, headers=headers)
            # Each page's JSON carries the wallpapers under the 'List' key.
            datas = response.json()['List']
            for data in datas:
                image_urls = extract_images(data)
                # Wallpaper name is percent-encoded; ':' is illegal in Windows
                # file names, so swap it for the full-width '：'.
                file_name = parse.unquote(data['sProdName']).replace(':', '：').strip()
                image_path = os.path.join('wangzhebizhi_images', file_name)
                # Create the folder once per wallpaper (not once per image);
                # exist_ok avoids a FileExistsError race between producer threads.
                os.makedirs(image_path, exist_ok=True)
                for image_url in image_urls:
                    self.image_queue.put({'image_url': image_url, 'image_path': image_path})


class Consumer(threading.Thread):
    """Consumer thread: pull image jobs off the queue and download them.

    Exits (and prints the elapsed time) once the queue has been empty for
    5 seconds, i.e. the producers are done.
    """

    # Compiled once, not on every loop iteration. '\.' and '\d' tighten the
    # original loose pattern r'(.*?)sProdImgNo_(.).jpg'.
    _NUM_RE = re.compile(r'sProdImgNo_(\d)\.jpg')

    def __init__(self, image_queue, *args, **kwargs):
        """
        :param image_queue: Queue of {'image_url': str, 'image_path': str} jobs.
        """
        super(Consumer, self).__init__(*args, **kwargs)
        self.image_queue = image_queue

    def run(self):
        while True:
            try:
                image_info = self.image_queue.get(timeout=5)
            except Empty:
                # Queue drained for 5s -> all producers finished.
                # (The original bare `except:` also swallowed download errors,
                # killing the thread and falsely reporting completion.)
                print('全部下载完成')
                end_time = time()
                print('程序耗时:' + str(end_time - start_time))
                break
            image_url = image_info['image_url']
            image_path = image_info['image_path']
            # The digit after 'sProdImgNo_' identifies the wallpaper size (1-8).
            num = self._NUM_RE.search(image_url).group(1)
            target = os.path.join(image_path, num + '.jpg')
            try:
                # Download first, then write, so a failed request does not
                # leave a truncated/empty file behind.
                response = requests.get(image_url, headers=headers)
                with open(target, 'wb') as f:
                    f.write(response.content)
                print(target + '下载完成')
            except requests.RequestException as e:
                # A single bad image should not kill the consumer thread.
                print('下载失败: ' + image_url + ' ' + str(e))


# 解析网页url
def extract_images(data):
    image_urls = []
    for i in range(1, 9):
        image_url = parse.unquote(data['sProdImgNo_{}'.format(i)].replace('200', '0'))
        image_urls.append(image_url)
    return image_urls


def main(pages=2, producer_count=5, consumer_count=20):
    """Fill the page queue and start the producer/consumer threads.

    :param pages: number of API result pages to fetch (20 wallpapers each).
    :param producer_count: number of Producer threads fetching metadata.
    :param consumer_count: number of Consumer threads downloading images.
    """
    page_queue = Queue(50)
    image_queue = Queue(1000)
    # NOTE: the original multi-line string literal lacked enclosing
    # parentheses, which is a SyntaxError; parentheses enable the implicit
    # adjacent-literal concatenation intended here.
    base_url = ('https://apps.game.qq.com/cgi-bin/ams/module/ishow/V1.0/query/'
                'workList_inc.cgi?activityId=2735&sVerifyCode=ABCD&sDataType=JSON&iListNum=20'
                '&totalpage=0&page={}&iOrder=0&iSortNumClose=1&iAMSActivityId=51991&'
                'everyRead=true&iTypeId=2&iFlowId=267733&iActId=2735&iModuleId=2735&_=1601534798968')
    for page in range(pages):
        page_queue.put(base_url.format(page))

    for i in range(producer_count):
        th = Producer(page_queue, image_queue, name='生产者{}号'.format(i + 1))
        th.start()

    for i in range(consumer_count):
        th = Consumer(image_queue, name='消费者{}号'.format(i))
        th.start()


if __name__ == '__main__':
    # Record the wall-clock start; Consumer threads read this module-level
    # name to print the total elapsed time when the queue drains.
    start_time = time()
    main()

在运行该py文件前,需先在该文件同级目录下创建一个文件夹 wangzhebizhi_images
运行该py文件,可以发现,下载速度较单线程requests爬取快了很多。

本文完

原文地址:https://www.cnblogs.com/achangblog/p/13959195.html