使用多线程爬取黑猫投诉平台上的阿里投诉信息

仅供学习,请适度设置线程数量,避免给目标网站造成过大压力。

一、代码

import requests
from requests_html import HTMLSession
import time
from concurrent.futures import ThreadPoolExecutor
import json

# Shared worker pool: up to 30 threads run page-scraping tasks concurrently.
pool = ThreadPoolExecutor(max_workers=30)
# Collects one enriched complaint dict per successfully scraped complaint.
big_list = list()
# Futures returned by pool.submit(), kept so the caller can wait on them.
pool_name_list = list()
# Single requests_html session reused for detail-page and video requests.
session = HTMLSession()

# Number of complaints requested per listing page.
_PAGE_SIZE = 10
# Seconds to wait for any single HTTP request before giving up, so that a
# stalled connection cannot block a worker thread indefinitely.
_REQUEST_TIMEOUT = 15
# Common XPath prefix of the summary <ul> on a complaint detail page.
_DETAIL_LIST_XPATH = '/html/body/div[2]/div/div/div[1]/div[2]/ul/'
# (output key, XPath suffix relative to _DETAIL_LIST_XPATH) for each field.
_DETAIL_FIELDS = (
    ('投诉编号', 'li[1]/text()'),
    ('投诉对象', 'li[2]/a/text()'),
    ('投诉问题', 'li[3]/text()'),
    ('投诉要求', 'li[4]/text()'),
    ('涉诉金额', 'li[5]/text()'),
    ('投诉进度', 'li[6]/b/text()'),
)


def _absolute_url(url):
    """Return ``url`` with an ``https:`` prefix unless it already has a scheme."""
    if url.startswith(('http://', 'https://')):
        return url
    return 'https:' + url


def _fetch_video_details(video_ids):
    """Fetch the play-info JSON for each video id; failed ids are reported and skipped."""
    videos = []
    for vide_id in video_ids:
        t = int(time.time())
        vide_info_url = (
            'https://api.ivideo.sina.com.cn/public/video/play'
            f'?video_id={vide_id}&appver=V11220.191231.02&appname=sinaplayer_pc'
            '&applt=web&tags=sinaplayer_pc&player=all&jsonp=&plid=2019090301'
            '&prid=&uid=&tid=&pid=1&ran=0.34558379845592846'
            '&r=https%3A%2F%2Ftousu.sina.com.cn%2Fcomplaint%2Fview%2F17349160365%2F'
            f'&referrer=&ssid=gusr_pc_{t}&preload=0'
            '&uu=60.180.189.200_1581579174.788519&isAuto=1'
        )
        try:
            video_res = session.get(
                vide_info_url, verify=False, timeout=_REQUEST_TIMEOUT)
            videos.append(video_res.json())
        except (requests.RequestException, ValueError) as exc:
            # Video metadata is best-effort: a failure here must not discard
            # the complaint itself, but it should not be silent either.
            print(f'视频信息获取失败 {vide_id}: {exc!r}')
    return videos


def _fetch_complaint_detail(info_url):
    """Download one complaint detail page and return its parsed fields as a dict.

    Raises IndexError when an expected element is missing from the page, and
    whatever ``session.get`` raises on a network failure.
    """
    page = session.get(info_url, verify=False, timeout=_REQUEST_TIMEOUT)
    detail = {}
    for key, suffix in _DETAIL_FIELDS:
        detail[key] = page.html.xpath(_DETAIL_LIST_XPATH + suffix)[0]
    detail['投诉进程详情'] = page.html.xpath('//*[@class="ts-d-steplist"]')[0].text
    detail['投诉图片'] = [
        _absolute_url(href)
        for href in page.html.xpath('//*[@class="example-image-link"]/@href')
    ]

    vide_id_list = page.html.xpath('//*[@class="video-ico"]/@data-id')
    print(vide_id_list)
    detail['投诉视频详情'] = _fetch_video_details(vide_id_list)
    return detail


def dewu_company(x):
    """Scrape one listing page of complaints and append the results to ``big_list``.

    Requests listing page ``x + 1`` from the received-complaints API, then
    visits every complaint's detail page. Each listing entry is extended with
    a ``'投诉详情'`` dict of the scraped detail fields and appended to the
    module-level ``big_list``.

    Args:
        x: Zero-based page index; the API is queried with ``page = x + 1``.

    Returns:
        None. Failures are printed and skipped instead of raised: a listing
        failure skips the whole page, a detail failure skips only that
        complaint.
    """
    print(f'第{x+1}页')

    # A fixed page size keeps consecutive pages contiguous. The original sent
    # page_size=(x + 1) * 10, which -- assuming ordinary offset pagination,
    # TODO confirm against the API -- leaves gaps between pages.
    params = {
        'couid': '1878960481',
        'type': '1',
        'page_size': str(_PAGE_SIZE),
        'page': f'{x + 1}',
    }
    url = 'https://tousu.sina.com.cn/api/company/received_complaints'
    # NOTE(review): verify=False disables TLS certificate verification. It is
    # kept to preserve the original behaviour -- confirm it is really needed.
    try:
        res = requests.get(
            url, params=params, verify=False, timeout=_REQUEST_TIMEOUT)
        info_list = res.json()['result']['data']['complaints'] or []
    except Exception as exc:
        # Worker-thread boundary: report the cause and give up on this page.
        print(f'错误跳过这一页 (第{x+1}页): {exc!r}')
        return

    for dict_info in info_list:
        try:
            dict_info['main']['url'] = _absolute_url(dict_info['main']['url'])
            dict_info['author']['avatar'] = _absolute_url(
                dict_info['author']['avatar'])
            info_url = dict_info['main']['url']
            print(info_url)
            dict_info['投诉详情'] = _fetch_complaint_detail(info_url)
        except Exception as exc:
            # One malformed or unreachable complaint should not cost the rest
            # of the page.
            print(f'错误跳过这一条: {exc!r}')
            continue
        big_list.append(dict_info)

def run(page):
    """Scrape the first ``page`` listing pages and save the results to JSON.

    Submits one ``dewu_company`` task per page index to the shared thread
    pool, waits for all of them to finish, then writes ``big_list`` to
    ``阿里投诉信息.json`` in the current working directory.

    Args:
        page: Number of listing pages to crawl (page indexes ``0..page-1``).
    """
    futures = [pool.submit(dewu_company, x) for x in range(page)]
    # Keep the module-level registry populated as before, but wait only on
    # the futures created by this call.
    pool_name_list.extend(futures)
    for future in futures:
        future.result()
    print('全部结束开始保存本地')
    with open('阿里投诉信息.json', 'w', encoding='utf8') as fw:
        # ensure_ascii=False writes the Chinese text as real UTF-8 characters
        # instead of \uXXXX escapes; the parsed JSON content is unchanged.
        json.dump(big_list, fw, ensure_ascii=False)
    print('保存完毕')

# Script entry point: crawl a single listing page when run directly.
if __name__ == "__main__":
    run(page=1)

原文地址:https://www.cnblogs.com/pythonywy/p/12545614.html