爬取我大 Bilibili 的视频相关数据

爬取的内容不算多,但也够用了;照葫芦画瓢,就能改出自己想要的爬虫,nice。

捋下思路:1. 进入页面,开发者工具抓取到自己需要的请求

2. 带上请求头,分析参数

3. 获取自己的目标数据

4. 格式化json数据

5. 抓取自己需要的数据

6. 数据保存

# -*- coding: utf-8 -*-
# @Time    : 2019/1/7  10:58
# @Author  : zhangxinxin
# @Email   : 778786617@qq.com
# @Software: PyCharm
import requests
import time
import json
import csv


class BilBil(object):
    """Crawl the Bilibili ``newlist`` web API and export video records to CSV.

    Workflow (see :meth:`run`): request a page of JSON, extract a fixed set
    of fields from every entry in ``data.archives``, accumulate the rows in
    ``self.data_list`` and rewrite ``BilBil.csv`` with everything collected
    so far.
    """

    # Keys copied from each archive record, in CSV column order.
    FIELDS = ('aid', 'title', 'attribute', 'duration', 'pic', 'ctime')
    # CSV header row (ID, name, view count, comment count, thumbnail URL,
    # publish time).
    # NOTE(review): the 3rd/4th labels say "view count"/"comment count" but
    # the columns hold the 'attribute' and 'duration' fields -- confirm which
    # API fields were actually intended.
    HEADER = ['番号', '名字', '观看数', '评论数', '缩略图地址', '发布时间']
    # Output file, relative to the current working directory.
    CSV_PATH = 'BilBil.csv'
    # Seconds to wait for the server before giving up on a request.
    TIMEOUT = 10

    def __init__(self):
        self.url = 'https://api.bilibili.com/x/web-interface/newlist'
        # Raw response text of the most recent successful request.
        self.html = ''
        self.headers = {
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
            'Referer':'https://www.bilibili.com/v/anime/serial/?spm_id_from=333.334.b_7072696d6172795f6d656e75.8',
            # Placeholder: replace with a real cookie string before running.
            'Cookie':'您的cookie',
            'Host':'api.bilibili.com',
        }
        # Rows collected across all parsed pages (one list per video).
        self.data_list = []

    def get_html(self, params):
        """Fetch one page of JSON text into ``self.html``.

        Args:
            params: Query-string parameters passed to ``requests.get``.

        On any failure the error is printed and ``self.html`` is left as an
        empty string, so a previous page's payload is never parsed twice.
        """
        # Clear first: a failed request must not leave stale data behind.
        self.html = ''
        try:
            response = requests.get(
                url=self.url,
                params=params,
                headers=self.headers,
                timeout=self.TIMEOUT,
            )
            # Treat HTTP error statuses as failures instead of parsing an
            # error page as JSON later on.
            response.raise_for_status()
            self.html = response.text
        except Exception as e:
            # Deliberately broad: the crawl is best-effort per page, so any
            # failure is reported and the page is skipped.
            print(e)
            print('错误,请重新请求!')

    def parse_html(self):
        """Parse ``self.html`` and append one row per archive to ``self.data_list``.

        Pages that are empty, are not valid JSON, or carry no
        ``data.archives`` list are reported and skipped instead of raising.

        Raises:
            KeyError: If an archive record lacks one of ``FIELDS``.
        """
        archives = self._extract_archives()
        if not archives:
            print('本页没有可解析的数据,已跳过')
            return
        for record in archives:
            self.data_list.append([record[key] for key in self.FIELDS])
        print(self.data_list)

    def _extract_archives(self):
        """Return the ``data.archives`` list from ``self.html``, or ``None``."""
        if not self.html:
            return None
        try:
            payload = json.loads(self.html)
        except ValueError as e:  # json.JSONDecodeError subclasses ValueError
            print(e)
            return None
        data = payload.get('data') if isinstance(payload, dict) else None
        archives = data.get('archives') if isinstance(data, dict) else None
        return archives if isinstance(archives, list) else None

    def save_csv(self):
        """Rewrite the CSV file with the header plus every collected row."""
        # 'utf-8-sig' keeps non-ASCII titles writable regardless of the
        # platform's default encoding; the BOM lets spreadsheet tools detect
        # UTF-8. The ``with`` block flushes and closes the file on exit.
        with open(self.CSV_PATH, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.writer(f)
            writer.writerow(self.HEADER)
            writer.writerows(self.data_list)

    def run(self):
        """Crawl ten pages, saving the accumulated rows after each page."""
        for page in range(0, 10):
            # NOTE(review): 'pn' starts at 0 here while the progress message
            # counts from 1 -- confirm whether the API's first page is 0 or 1.
            params = {
                'rid': '33',
                'type': '0',
                'pn': page,
                'jsonp': 'jsonp',
                '_': '1546830252656'
            }
            self.get_html(params)
            self.parse_html()
            # Saving after every page keeps partial results on disk if a
            # later page fails.
            self.save_csv()
            # Pause between requests.
            time.sleep(1)
            print('第{}页数据爬取完毕'.format(page + 1))


if __name__ == '__main__':
    # Script entry point: build the crawler and run the full crawl.
    crawler = BilBil()
    crawler.run()
原文地址:https://www.cnblogs.com/UTF-8-xinxin/p/10240944.html