爬取“我们是冠军”的评论

微博热搜第一名；B站人气超过3.5亿，满屏弹幕；腾讯视频超过600万人观看；央视新闻也发微博祝贺EDG。今天用 Python 来爬取B站“我们是冠军”这个视频的评论，并做一些可视化，获取呐喊的正确姿势。

评论爬取代码：

 1 import csv
 2 import pprint
 3 import random
 4 import time
 5 import requests
 6 import openpyxl as opx
 7 import json
 8 
 # Scrape the comments of the B站 video "我们是冠军" and append them to a CSV file.
CSV_PATH = '我们是冠军.csv'
FIELDNAMES = ['评论人', '性别', '点赞数', '评论时间', '评论内容']
MAX_PAGES = 100  # upper bound on the number of comment pages requested
REQUEST_TIMEOUT = 10  # seconds; keeps a stalled connection from hanging the run

# NOTE(review): the cookie looks like it was copied from a browser session and
# will presumably expire -- refresh it if the API starts rejecting requests.
headers = {
    "cookie": "_uuid=BE35640F-EB4E-F87D-53F2-7A8FD5D50E3330964infoc; buvid3=D0213B95-F001-4A46-BE4F-E921AE18EB67167647infoc; CURRENT_BLACKGAP=1; CURRENT_QUALITY=0; rpdid=|(u))ku~m)kJ0J'uYJuRRRYmk; CURRENT_FNVAL=976; video_page_version=v_old_home_17; blackside_state=1; LIVE_BUVID=AUTO1516364619569495; sid=bqyo86kv; innersign=1; PVID=2",
    "referer": "https://www.xxx.com/video/BV12R4y1E7kn?spm_id_from=333.999.0.0",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36",
}


def strip_jsonp(text: str) -> str:
    """Return the JSON payload contained in a JSONP response body.

    The payload is located by its surrounding parentheses instead of a fixed
    character offset, so it keeps working when the callback name changes
    length. A body that already is plain JSON is returned unchanged (minus
    surrounding whitespace).

    Raises:
        ValueError: if the body is neither JSON nor ``callback(...)``.
    """
    text = text.strip()
    if text.startswith(('{', '[')):
        return text
    start = text.find('(')
    end = text.rfind(')')
    if start == -1 or end < start:
        raise ValueError('response is neither JSON nor JSONP')
    return text[start + 1:end]


def reply_to_row(item: dict) -> dict:
    """Convert one reply object from the API into a CSV row keyed by FIELDNAMES.

    ``ctime`` is treated as a Unix timestamp in seconds and rendered in local
    time. When it is absent the time column is left empty; passing ``None`` to
    ``time.localtime`` would silently record the current time instead.
    """
    ctime = item.get('ctime')
    if ctime is None:
        comment_time = ''
    else:
        comment_time = time.strftime('%Y-%m-%d %H:%M', time.localtime(ctime))
    return {
        '评论人': item['member']['uname'],
        '性别': item['member']['sex'],
        '点赞数': item['like'],
        '评论时间': comment_time,
        '评论内容': item['content']['message'],
    }


def fetch_replies(page: int, start_stamp: int) -> list:
    """Request one page of comments and return its list of reply objects.

    Args:
        page: 1-based page number, sent as the ``next`` query parameter.
        start_stamp: millisecond timestamp embedded in the JSONP callback name.

    Returns:
        The ``data.replies`` list of the response; an empty list when the API
        reports ``replies`` as null.

    Raises:
        requests.HTTPError: on a non-2xx HTTP status.
        KeyError: if the response carries no ``data`` / ``replies`` entry.
    """
    next_stamp = int(time.time() * 1000)
    url = f'https://api.xxx.com/x/v2/reply/main?callback=jQuery172046940903221511165_{start_stamp}&jsonp=jsonp&next={page}&type=1&oid=336587753&mode=3&plat=1&_={next_stamp}'
    response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    payload = json.loads(strip_jsonp(response.text))
    return payload['data']['replies'] or []


def main() -> None:
    """Scrape up to MAX_PAGES pages of comments and append them to CSV_PATH."""
    start_stamp = int(time.time() * 1000)
    # The context manager guarantees the file is flushed and closed even if a
    # request or a malformed reply raises midway.
    with open(CSV_PATH, mode='a', encoding='utf-8-sig', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
        # The file is opened in append mode: only write the header when the
        # file is empty, otherwise every re-run adds a header row mid-data.
        if f.tell() == 0:
            writer.writeheader()
        for page in range(1, MAX_PAGES + 1):
            print(f'====================正在爬取第{page}页的数据====================')
            time.sleep(random.randint(2, 5))  # random pause between requests
            replies = fetch_replies(page, start_stamp)
            print(f'第{page}页包含:' + str(len(replies)))
            if not replies:
                # An empty page means there is nothing further to fetch.
                break
            for item in replies:
                row = reply_to_row(item)
                print(row)
                writer.writerow(row)
    print('数据采集完毕！')


if __name__ == '__main__':
    main()

下面来做个词云：

 1 import re
 2 import jieba
 3 import matplotlib.pyplot as plt
 4 import pandas as pd
 5 from wordcloud import WordCloud
 # Build a word cloud image from the scraped comments stored in the CSV file.
CSV_PATH = '我们是冠军.csv'
FONT_PATH = 'C:/Windows/Fonts/simhei.ttf'  # a font with CJK glyphs; Windows location
OUTPUT_PATH = 'wc.png'

# Tokens that are dropped from the cloud.
STOPWORDS = {"回复", "@", "我", "她", "你", "他", "了", "的", "吧", "吗", "在", "啊", "不", "也", "还", "是",
             "说", "都", "就", "没", "做", "人", "赵薇", "被", "不是", "现在", "什么", "这", "呢", "知道", "邓", "我们", "他们", "和", "有", "",
             "要", "就是", "但是", "而", "为", "自己", "中", "问题", "一个", "没有", "到", "这个", "并", "对", "点赞", "热词", "系列", "热词系列"}

# ASCII letters, digits and a few punctuation marks to strip before
# segmentation. The characters must sit inside one character class: without
# the enclosing brackets the pattern is an unterminated set and re.sub raises
# re.error.
NOISE_PATTERN = re.compile(r"[A-Za-z0-9\-!%\[\],。]")


def clean_text(text: str) -> str:
    """Return *text* with every character matched by NOISE_PATTERN removed."""
    return NOISE_PATTERN.sub("", text)


def load_comments(path=CSV_PATH) -> list:
    """Read the CSV and return the distinct comment texts.

    Duplicate rows and rows with missing values are dropped first. The result
    is ordered from least to most frequent comment text.
    """
    df = pd.read_csv(path)
    df = df.drop_duplicates().dropna()
    # astype(str) guards the later ' '.join against a column that pandas
    # parsed as numbers.
    return df['评论内容'].astype(str).value_counts().sort_values().index.tolist()


def build_wordcloud(text: str):
    """Segment *text* with jieba and return the generated WordCloud.

    WordCloud splits its input on whitespace, so the text is first cut into
    words by jieba and re-joined with spaces; feeding it the unsegmented
    string would treat whole sentences as single tokens.
    """
    words = jieba.lcut(text)
    cloud = WordCloud(
        background_color='white',
        width=1080,
        height=960,
        font_path=FONT_PATH,
        max_words=150,
        scale=10,  # rendering scale factor (sharpness of the output image)
        max_font_size=100,
        stopwords=STOPWORDS,
        collocations=False,
    )
    return cloud.generate(' '.join(words))


def main() -> None:
    """Render the word cloud on screen and save it to OUTPUT_PATH."""
    comments = load_comments()
    cloud = build_wordcloud(clean_text(' '.join(comments)))

    plt.imshow(cloud)
    plt.axis('off')
    plt.show()
    cloud.to_file(OUTPUT_PATH)


if __name__ == '__main__':
    main()