Scraping Bilibili video comments that contain digits

import requests
import re
import os
import sys
import json

aid_list=[]
info_list =[]
title_list = []

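# fetch the uploader's submitted-video list page by page and collect every aid and title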
def getAllAVList(mid, size, page):
	for n in range(1, page+1):
		url='http://space.bilibili.com/ajax/member/getSubmitVideos?mid='+str(mid)+'&pagesize='+str(size)+'&page='+str(n)
		r=requests.get(url)
		text=r.text
		#print(text.encode('utf-8').decode('unicode_escape'))
		#{"status":true,"data":{"tlist":{"4":{"tid":4,"count":861,"name":"游戏"}},"vlist":[{"comment":200,"typeid":17,"play":24884,"pic":"//i2.hdslb.com/bfs/archive/da1faeb8f3b08693cd440e1c5dfe75b2f612d407.jpg","subtitle":"","description":"啦啦啦","copyright":"","title":"【风笑试玩】在太空捡垃圾丨Space Scavenger 直播试玩","review":0,"author":"逆风笑","mid":2019740,"is_union_video":0,"created":1592293193,"length":"17:16","video_review":316,"is_pay":0,"favorites":355,"aid":626106116,"is_steins_gate":0,"hide_click":false}],"count":861,"pages":861}}
		json_text=json.loads(text)
		for item in json_text['data']['vlist']:
			aid_list.append(item['aid'])
			title_list.append(item['title'])
	print(aid_list)

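# for one video (aid): read the total comment count, then walk the comment pages and
# keep only the comments made up of Chinese/letters/digits that contain at least one digit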
def getAllCommentList(item):
	info_list.append('begin %s'%title_list[aid_list.index(item)])
	print('begin %s'%title_list[aid_list.index(item)])
	# first request only reads the total comment count from data.page.count
	url='https://api.bilibili.com/x/v2/reply?type=1&oid='+str(item)+'&pn=1&nohot=1&sort=0'
	r=requests.get(url)
	numtext=r.text
	json_text=json.loads(numtext)
	commentsNum=json_text['data']['page']['count']
	page=commentsNum//20+2	# about 20 comments per page; the empty-page guard below stops the loop early
	for n in range(1, page):
		url='https://api.bilibili.com/x/v2/reply?jsonp=jsonp&pn='+str(n)+'&type=1&oid='+str(item)+'&sort=1&nohot=1'
		req=requests.get(url)
		text=req.text
		json_text_list=json.loads(text)
		#print(json_text_list)
		replies=json_text_list['data']['replies']
		# the last page can come back with "replies": null, which would break the loop
		if not replies:
			break
		for i in replies:
			# info_list.append([i['member']['uname'], i['content']['message']])
			if re.findall(r'^[\u4E00-\u9FA5A-Za-z0-9]*[0-9][\u4E00-\u9FA5A-Za-z0-9]*$', i['content']['message']):
				info_list.append(str(i['member']['uname']+': '+i['content']['message']))
		# if input('continue, yes or no?')=='y':
		# 	continue
		# else:
		# 	break

# write every collected line to <filename>.txt, one comment per line
def saveTxt(filename, filecontent):
	filename=str(filename)+'.txt'
	with open(filename, 'w', encoding='utf-8') as txt:
		for content in filecontent:
			# txt.write(content[0]+' '+content[1].replace('\n', '')+'\n\n')
			txt.write(content+'\n')
			#print('writing file...')

if __name__ == '__main__':
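	# mid=2019740, pagesize=1, 50 pages -> collects 50 videos, one per page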
	getAllAVList(2019740, 1, 50)
	for item in aid_list:
		# info_list.clear()
		getAllCommentList(item)
	saveTxt('abc', info_list)

The main idea of the program is to extract the data through Bilibili's APIs. The flow is: uploader video-list API → video ids (aid) → comment API to get the comment count → number of pages → fetch the comment text page by page → filter out the comments that contain digits with a regular expression → write them to a file.
The json.loads function converts a JSON string into a Python dict.
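A minimal sketch of that step (the JSON string here is a shortened, made-up version of the vlist response quoted in the code comment above):

import json

sample = '{"data": {"vlist": [{"aid": 626106116, "title": "demo"}]}}'
parsed = json.loads(sample)                 # str -> nested dict/list structure
print(parsed['data']['vlist'][0]['aid'])    # 626106116
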
When a string contains literal \uxxxx sequences (Unicode escape sequences) and you want the original characters back, use string.encode('utf-8').decode('unicode_escape').
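A quick check of that trick (the \u sequences below spell out 风笑; note that if the string mixes escaped and already-decoded non-ASCII characters, this shortcut can garble the latter):

raw = '\\u98ce\\u7b11'                                 # literal backslash-u escape sequences
print(raw.encode('utf-8').decode('unicode_escape'))    # prints 风笑
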
The regular-expression character class for Chinese characters, letters, and digits is [\u4E00-\u9FA5A-Za-z0-9].
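For example, with the corrected pattern used in getAllCommentList:

import re

pattern = r'^[\u4E00-\u9FA5A-Za-z0-9]*[0-9][\u4E00-\u9FA5A-Za-z0-9]*$'
print(bool(re.match(pattern, '哈哈哈666')))   # True  - only Chinese/letters/digits, with a digit
print(bool(re.match(pattern, '哈哈哈')))      # False - no digit
print(bool(re.match(pattern, '666!')))        # False - '!' falls outside the character class
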

Reference links:

1. 爬虫如何抓取b站评论,弹幕等内容? - 肥肥杨的回答 - 知乎 (How to scrape Bilibili comments, danmaku, etc.? - answer by 肥肥杨 on Zhihu)
2. python: 关于解决'u'开头的字符串转中文的方法 (Python: converting strings of \u escapes back into Chinese)

Original post: https://www.cnblogs.com/tellw/p/13158653.html