# b站排行榜-爬虫 (Bilibili ranking-board crawler)

import requests
from lxml import etree
import re
import time
import json
import threading
import urllib3
urllib3.disable_warnings()


# Landing page of the Bilibili ranking; per-channel URLs are built from it.
url = "https://www.bilibili.com/ranking/"

# Desktop-browser User-Agent so the site serves the regular HTML page.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
}


# All channel data, keyed by channel name (populated in __main__).
c_data = {}
def channels_work():
    """Fetch the ranking page and extract the channel list embedded in it.

    Returns:
        list[dict]: channel descriptors; each is expected to carry at
        least ``name`` and ``tid`` (used by __main__ to build URLs).

    Raises:
        ValueError: if the inline ``"channels"`` JSON cannot be located,
            e.g. when Bilibili changes the page layout.
    """
    html = requests.get(url=url, headers=headers).text
    # The channel list is embedded in the page as inline JSON; grab the
    # value of the "channels" key (everything up to ,"showTypes":).
    matches = re.findall('"channels":(.*?),"showTypes":', html)
    if not matches:
        # Previously this crashed with a bare IndexError on matches[0];
        # fail with an explicit message instead.
        raise ValueError('could not find "channels" JSON in ranking page')
    channels = json.loads(matches[0])
    print("所有分类信息爬取ok")
    print(channels)
    return channels


#获取频道UP主个人首页 (collect uploader homepage URLs for one channel)
def channels_detail(c_url, c_name, all_up):
    """Scrape one channel ranking page and collect UP-owner homepage URLs.

    Args:
        c_url: channel ranking page URL.
        c_name: channel display name (used only in the progress message).
        all_up: shared list the homepage URLs are appended to; list.append
            is atomic in CPython, so concurrent threads are safe here.
    """
    html = requests.get(url=c_url, headers=headers).text
    tree = etree.HTML(html)
    for item in tree.xpath("//ul[@class='rank-list']/li"):
        # The anchor inside .detail links to the uploader's homepage;
        # the href is protocol-relative, so prefix "https:".
        links = item.xpath(".//div[@class='detail']/a/@href")
        if links:
            # Guard: the original indexed [0] unconditionally and crashed
            # the worker thread on entries without a detail anchor.
            all_up.append("https:" + links[0])
    print(c_name + "排行榜数据爬取ok")


def _fetch_json(api_url, proxies):
    # GET *api_url* through *proxies* with SSL verification disabled
    # (free proxies commonly break certificate validation) and parse JSON.
    res = requests.get(url=api_url, headers=headers, proxies=proxies, verify=False).text
    return json.loads(res)


def up_detail(url, proxies):
    """Scrape one uploader's full profile via several Bilibili APIs.

    Args:
        url: uploader space homepage URL; the mid (member id) is its last
            path segment.
        proxies: requests-style proxy mapping used for every API call.

    Returns:
        dict: the collected profile (also printed, as before).
    """
    up = {}
    up['video'] = []

    # The mid is the last segment of https://space.bilibili.com/<mid>.
    up['up_mid'] = url.split("/")[-1]
    print("开始爬取mid为" + str(up['up_mid']) + "的UP主的信息")
    mid = str(up['up_mid'])

    # Profile endpoint: name, avatar, sex, signature, level, fans badge.
    up_d = _fetch_json('https://api.bilibili.com/x/space/acc/info?mid=' + mid, proxies)
    up['up_name'] = up_d["data"]["name"]
    up['up_face'] = up_d["data"]["face"]
    up['up_sex'] = up_d["data"]["sex"]
    up['up_sign'] = up_d["data"]["sign"]
    up['up_level'] = up_d["data"]["level"]
    up['up_fans_badge'] = up_d["data"]["fans_badge"]

    # Relation endpoint: following / follower counts.
    # {"code": 0, "message": "0", "ttl": 1,"data": {"mid": 18775476, "following": 128, "whisper": 0, "black": 0, "follower": 519397}}
    up_d = _fetch_json('https://api.bilibili.com/x/relation/stat?vmid=' + mid, proxies)
    up['up_following'] = up_d['data']['following']
    up['up_follower'] = up_d['data']['follower']

    # Upstat endpoint: total video views, likes, article views.
    # {"code": 0, "message": "0", "ttl": 1,"data": {"archive": {"view": 37989388}, "article": {"view": 560}, "likes": 1688691}}
    up_d = _fetch_json('https://api.bilibili.com/x/space/upstat?mid=' + mid, proxies)
    up['up_archive'] = up_d['data']['archive']['view']
    up['up_likes'] = up_d['data']['likes']
    up['up_article'] = up_d['data']['article']['view']

    # Charging (sponsor) endpoint; total_count is absent when charging is
    # disabled, so default to 0. The original used a bare except:, which
    # hid every possible failure — catch only the expected lookup errors.
    # {"code":0,"data":{"display_num":0,"count":13,"total_count":994,"list":...
    up_d = _fetch_json('https://elec.bilibili.com/api/query.rank.do?mid=' + mid, proxies)
    try:
        up['up_total_count'] = up_d['data']['total_count']
    except (KeyError, TypeError):
        up['up_total_count'] = 0

    # Video search endpoint: fetch the total video count first, then page
    # through all videos in get_video.
    up_d = _fetch_json('https://api.bilibili.com/x/space/arc/search?mid=' + mid + '&ps=1&pn=1', proxies)
    count = up_d['data']['page']['count']
    up = get_video(count, up)

    print(up)
    return up


def get_video(count, up):
    """Page through an uploader's video list and append entries to up['video'].

    Args:
        count: total number of videos reported by the search API; the loop
            fetches pages of 100 until it has covered that many.
        up: profile dict; must contain 'up_mid' and a 'video' list.

    Returns:
        dict: the same *up* dict, with 'video' filled in.
    """
    pn = 1
    while count > 0:
        url5 = 'https://api.bilibili.com/x/space/arc/search?mid=' + str(up['up_mid']) + '&ps=100&pn=' + str(pn)
        res = requests.get(url=url5, headers=headers).text
        up_d = json.loads(res)
        for video in up_d['data']['list']['vlist']:
            # BUG FIX: the original created one dict per *page* and mutated
            # it inside this loop, so every appended entry aliased the same
            # object and ended up holding the last video's data. Build a
            # fresh dict per video instead.
            up['video'].append({
                'title': video['title'],
                'pic_url': video['pic'],
                'comment': video['comment'],
                'video_review': video['video_review'],
                'created': video['created'],
            })
        pn += 1
        count -= 100
    return up

#爬取可用代理并爬取信息 (find a working proxy, then crawl through it)
def ip_run(i, ts2):
    """Find one working proxy and spawn a thread that scrapes uploader *i*.

    Args:
        i: the uploader homepage URL to crawl.
        ts2: shared list that collects the started threads so __main__
            can join them.
    """
    proxy_list_url = 'https://www.xicidaili.com/nn/'
    ip_response = requests.get(url=proxy_list_url, headers=headers).text

    # BUG FIX: the original patterns were written without backslashes
    # ("<td>(d+.d+.d+.d+)</td>"), matching literal 'd' characters, so they
    # could never extract an IP address. Use raw strings with \d and \.
    ips = re.findall(r"<td>(\d+\.\d+\.\d+\.\d+)</td>", ip_response, re.S)
    ports = re.findall(r"<td>(\d+)</td>", ip_response, re.S)

    for ip, port in zip(ips, ports):
        proxies = {
            "http": "http://" + ip + ":" + port,
            "https": "http://" + ip + ":" + port,
        }
        try:
            # Probe the proxy against a real Bilibili space page; 3s timeout.
            requests.get('https://space.bilibili.com/337312411', proxies=proxies, timeout=3)
        except Exception:
            print("ip不能使用")
            continue
        print("ip能使用")
        #如果能使用,使用此ip代理 (proxy works — crawl through it)
        print("开始爬取url为" + i + "的up主详细信息")
        t = threading.Thread(target=up_detail, args=(i, proxies))
        t.start()
        time.sleep(3)
        ts2.append(t)
        break

if __name__ == '__main__':
    # Fetch channel metadata (name + tid) from the ranking page.
    channels = channels_work()

    # Scrape every channel's ranking page concurrently, collecting
    # uploader homepage URLs into the shared list all_up.
    all_up = []
    ts = []
    ts2 = []
    for c in channels:
        # Build the per-channel ranking URL: <base>all/<tid>/1/3
        c_url = url+"all/"+str(c['tid'])+"/1/3"
        c_data[c["name"]] = []
        t = threading.Thread(target=channels_detail,args=(c_url,c['name'],all_up))
        t.start()
        ts.append(t)

    # Wait for all channel scrapers before using all_up.
    for t in ts:
        t.join()

    # For each uploader homepage, find a working proxy and spawn a
    # detail-scraping thread (threads are collected in ts2 by ip_run).
    for i in all_up:
        ip_run(i,ts2)

    # Wait for all uploader-detail threads to finish.
    for t in ts2:
        t.join()

    print("爬取所有数据完成")
# 原文地址 (original post): https://www.cnblogs.com/zx125/p/12848479.html