Python crawler: scraping JSON from a Bilibili API endpoint, saving paginated results to CSV, and downloading images

The endpoint returns JSON directly, so there is no need to hunt through HTML with findall and CSS classes; we can simply parse the JSON and save the fields we want. Below is the request as captured in Chrome DevTools:

Request URL: https://api.bilibili.com/x/space/arc/search?mid=390461123&ps=30&tid=0&pn=17&keyword=&order=pubdate&jsonp=jsonp
Request Method: GET
Status Code: 200 
Remote Address: 123.6.7.66:443
Referrer Policy: no-referrer-when-downgrade

Response headers:
access-control-allow-credentials: true
access-control-allow-headers: Origin,No-Cache,X-Requested-With,If-Modified-Since,Pragma,Last-Modified,Cache-Control,Expires,Content-Type,Access-Control-Allow-Credentials,DNT,X-CustomHeader,Keep-Alive,User-Agent,X-Cache-Webcdn
access-control-allow-methods: GET,POST,PUT,DELETE
access-control-allow-origin: https://space.bilibili.com
bili-status-code: 0
bili-trace-id: 4fb516b50d619c81
cache-control: no-cache
content-encoding: br
content-type: application/json; charset=utf-8
date: Tue, 23 Nov 2021 05:49:54 GMT
expires: Tue, 23 Nov 2021 05:49:53 GMT
idc: shjd
vary: Origin
x-bili-trace-id: 4fb516b50d619c81
x-cache-webcdn: BYPASS from blzone02

Request headers:
:authority: api.bilibili.com
:method: GET
:path: /x/space/arc/search?mid=390461123&ps=30&tid=0&pn=17&keyword=&order=pubdate&jsonp=jsonp
:scheme: https
accept: application/json, text/plain, */*
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9
cookie: buvid3=89EFA719-1D0F-BB2E-FE21-6C7BDCE8053B38280infoc; CURRENT_FNVAL=976; _uuid=210E48834-E65E-AD99-7F37-6771109799A8837281infoc; video_page_version=v_old_home_11; blackside_state=1; rpdid=|(k||)R|Y|)k0J'uYJ~um~kR|; PVID=1; innersign=0
origin: https://space.bilibili.com
referer: https://space.bilibili.com/390461123/video?tid=0&page=17&keyword=&order=pubdate
sec-ch-ua: "Google Chrome";v="95", "Chromium";v="95", ";Not A Brand";v="99"
sec-ch-ua-mobile: ?0
sec-ch-ua-platform: "Windows"
sec-fetch-dest: empty
sec-fetch-mode: cors
sec-fetch-site: same-site
user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36

Query string parameters:
mid: 390461123
ps: 30
tid: 0
pn: 17
keyword: 
order: pubdate
jsonp: jsonp
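
Since the query string is just these seven parameters, the same request can be rebuilt more readably by letting requests assemble the URL. A minimal sketch using only values from the capture above (mid is the uploader id from this example):

import requests

# query parameters copied from the DevTools capture above
params = {
    'mid': 390461123,    # uploader (UP主) user id
    'ps': 30,            # page size: videos per page
    'tid': 0,            # 0 = all video categories
    'pn': 17,            # page number
    'keyword': '',
    'order': 'pubdate',  # newest first
    'jsonp': 'jsonp',
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
    'Referer': 'https://space.bilibili.com/390461123/',
}
resp = requests.get('https://api.bilibili.com/x/space/arc/search',
                    params=params, headers=headers)
print(resp.json()['code'])  # 0 on success, matching the bili-status-code header above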

Case 1: scraping a single page

import csv                      # csv writing
import requests                 # HTTP requests

# https://api.bilibili.com/x/space/arc/search?mid=390461123&ps=30&tid=0&pn=2&keyword=&order=pubdate&jsonp=jsonp
url = 'https://api.bilibili.com/x/space/arc/search?mid=390461123&ps=30&tid=0&pn=1&keyword=&order=pubdate&jsonp=jsonp'
fake_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
}

# request the Bilibili API
first_request = requests.get(url=url, headers=fake_headers)
first_data = first_request.json()  # parse the response body into a Python dict
# text = first_request.text
# data = json.loads(text)  # equivalent: str -> dict
item = first_data['data']['list']['vlist']  # pull the vlist (video list) out of the payload

# open the csv file in write mode; newline='' keeps the csv module
# from inserting blank rows on Windows
csv_obj = open('bilibili.csv', 'w', encoding='utf-8', newline='')
writer = csv.writer(csv_obj)
# write the header row
writer.writerow(["aid", "pic_url", "title"])

for d in item:
    # write one video's info per row
    print("=============== writing info for aid %s ===============" % (d['aid']))
    writer.writerow([d['aid'], d['pic'], d['title']])
    print("====== aid={0} done: {1} ====".format(d['aid'], 'over'))
# close the file
csv_obj.close()
print("finished")

The URL above shows the pagination pattern: only the pn parameter changes from page to page:

https://api.bilibili.com/x/space/arc/search?mid=390461123&ps=30&tid=0&pn=<page number>&keyword=&order=pubdate&jsonp=jsonp
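
Captures of this endpoint also carry a data.page object ({count, pn, ps}), which would let the script derive the total page count instead of hardcoding it. A hedged sketch, assuming that field is present in your responses:

import math

def total_pages(page_json, page_size=30):
    # data.page.count is the uploader's total video count in captures of
    # this endpoint; return None if the field is missing
    count = page_json.get('data', {}).get('page', {}).get('count')
    if count is None:
        return None
    return math.ceil(count / page_size)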

Case 2: scraping every page, saving the needed fields to CSV, and downloading the cover images locally

import csv                      # csv writing
import os                       # filesystem helpers
import time                     # polite delays between requests
import requests                 # HTTP requests

# generic fetch: GET the url and return the parsed JSON payload
def scrape_api(url):
    fake_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
    }
    response = requests.get(url=url, headers=fake_headers)
    # text = response.text
    # data = json.loads(text)  # equivalent: str -> dict
    return response.json()  # parse the JSON body into a Python dict

# generic pagination: build the url for one page and fetch it
def scrape_page(page):
    # https://api.bilibili.com/x/space/arc/search?mid=390461123&ps=30&tid=0&pn=2&keyword=&order=pubdate&jsonp=jsonp
    url = 'https://api.bilibili.com/x/space/arc/search?mid=390461123&ps=30&tid=0&pn={page}&keyword=&order=pubdate&jsonp=jsonp'.format(page=page)
    return scrape_api(url)

# append one page's rows to the csv file
def scrape_csv(item):
    # write the header row only when the file does not exist yet;
    # the original wrote it on every page, repeating it throughout the file
    write_header = not os.path.exists('bilibili.csv')
    # open in append mode (creates the file if necessary)
    csv_obj = open('bilibili.csv', 'a+', encoding='utf-8', newline='')
    writer = csv.writer(csv_obj)
    if write_header:
        writer.writerow(["aid", "pic_url", "title"])
    for d in item:
        # write one video's info per row
        print("=============== writing info for aid %s ===============" % (d['aid']))
        writer.writerow([d['aid'], d['pic'], d['title']])
        # print("====== aid={0} done: {1} ====".format(d['aid'], 'over'))
    # close the file
    csv_obj.close()
    print("page written")

# w:  open for writing (truncates an existing file)
# a:  open for appending (starts at EOF; creates the file if needed)
# r+: open for reading and writing
# w+: open for reading and writing (truncates, see w)
# a+: open for reading and appending (see a)
# rb: open for reading in binary mode
# wb: open for writing in binary mode (see w)
# ab: open for appending in binary mode (see a)
# rb+: open for reading and writing in binary mode (see r+)
# wb+: open for reading and writing in binary mode (see w+)
# ab+: open for reading and appending in binary mode (see a+)
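
The difference between 'w' and 'a' matters here: rerunning the scraper with 'w' would discard earlier pages. A tiny illustration (demo.txt is just a throwaway filename):

with open('demo.txt', 'w') as f:   # 'w' truncates any existing content
    f.write('first\n')
with open('demo.txt', 'a') as f:   # 'a' appends at EOF instead
    f.write('second\n')
print(open('demo.txt').read())     # -> first and second on separate lines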

# download the cover images to a local folder
def download_img(item):
    pic_l = []  # collect every image url in this list
    for dd in item:
        pic_l.append(dd['pic'])

    if not os.path.exists(r'picture'):
        os.mkdir(r'picture')

    for i in pic_l:
        # i == http://i0.hdslb.com/bfs/archive/c6490a18ce51d821b0edc9701bc8c16353fbea4a.jpg
        pic = requests.get(i)
        # split() slices the string on the given separator and returns a list:
        p_name = i.split('/')
        # ['http:', '', 'i0.hdslb.com', 'bfs', 'archive', 'c6490a18ce51d821b0edc9701bc8c16353fbea4a.jpg']
        imgname = p_name[5]  # c6490a18ce51d821b0edc9701bc8c16353fbea4a.jpg
        print(imgname)
        with open(os.path.join('picture', imgname), 'wb') as f:
            f.write(pic.content)
    print("image downloads finished")

# drive the whole run, page by page
def datacsv():
    pages = 28  # total number of pages
    for page in range(1, pages + 1):  # range() excludes the stop value, so +1 to reach the last page
        print("=========== page %s ===========" % (page))
        indexdata = scrape_page(page)
        allres = indexdata.get('data')
        item = allres.get('list').get('vlist')  # pull the vlist (video list) out of the payload
        scrape_csv(item)      # append this page's rows to the csv
        time.sleep(1)
        download_img(item)    # download this page's cover images
        time.sleep(1)


if __name__ == '__main__':
    datacsv()
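
Combining the pieces: if the data.page.count assumption from the earlier total_pages sketch holds, datacsv no longer needs the hardcoded page count (at the cost of fetching page 1 twice):

def datacsv():
    first = scrape_page(1)            # fetch page 1 once to learn the total
    pages = total_pages(first) or 28  # fall back to the hardcoded count
    for page in range(1, pages + 1):
        print("=========== page %s ===========" % (page))
        item = scrape_page(page)['data']['list']['vlist']
        scrape_csv(item)
        time.sleep(1)
        download_img(item)
        time.sleep(1)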
Original post: https://www.cnblogs.com/yszr/p/15593278.html