# Python-爬取微博信息

# -*- coding: utf-8 -*-
import requests, re
import time
import os
import csv
import sys
import importlib
from fake_useragent import UserAgent

# NOTE(review): `sys` is not used anywhere else in this file, so this reload has no
# visible effect here. It looks like a leftover of the Python 2
# `reload(sys); sys.setdefaultencoding(...)` idiom - confirm before removing.
importlib.reload(sys)
class WeiBoSpider():
    """Scrape trending Weibo posts and their hot comments into ``weibo.csv``.

    Each post produces one CSV row with the post columns filled in and the
    comment columns blank; each comment produces one row with the post
    columns blank and the comment columns filled in.
    """

    # Seconds to wait for any single HTTP request before giving up.
    REQUEST_TIMEOUT = 10
    # The comment endpoint is paged; the page count is derived from this size.
    COMMENTS_PER_PAGE = 20
    # Column titles written once at the top of a new CSV file.
    CSV_HEADER = ('话题链接', '话题内容', '楼主ID', '楼主昵称', '楼主性别', '发布日期',
                  '发布时间', '转发量', '评论量', '点赞量', '评论者ID', '评论者昵称',
                  '评论者性别', '评论日期', '评论时间', '评论内容')
    # English month abbreviation -> two-digit month number.
    _MONTH_NUMBERS = {
        'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
        'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
        'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12',
    }
    # Matches one HTML tag, e.g. ``<br />`` or ``<span class="x">``.
    _TAG_RE = re.compile(r'<[^>]*>')

    def __init__(self, page):
        """Open the output CSV in the current directory and set up crawl state.

        Args:
            page: Exclusive upper bound of the feed pages to crawl;
                ``get_title_id`` requests pages ``1 .. page - 1``.
        """
        self.path = os.getcwd() + "/weibo.csv"
        # The file is opened in append mode, so only write the header when the
        # file is new or empty; otherwise every run would add another header row.
        is_new_file = not os.path.exists(self.path) or os.path.getsize(self.path) == 0
        self.csvfile = open(self.path, "a", newline="", encoding="utf-8-sig")
        self.writer = csv.writer(self.csvfile)
        if is_new_file:
            self.writer.writerow(self.CSV_HEADER)
        # NOTE(review): this is a hard-coded session cookie committed to source.
        # It should be loaded from configuration or the environment instead.
        # Also note that get_title_id() replaces self.headers with a bare
        # User-Agent header, so these values are not sent after it has run.
        self.headers = {
            'Cookie': '_T_WM=22822641575; H5_wentry=H5; backURL=https%3A%2F%2Fm.weibo.cn%2F; ALF=1584226439; MLOGIN=1; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5RJaVYrb.BEuOvUQ8Ca2OO5JpX5K-hUgL.FoqESh-7eKzpShM2dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMceoBfeh2EeKBN; SCF=AnRSOFp6QbWzfH1BqL4HB8my8eWNC5C33KhDq4Ko43RUIzs6rjJC49kIvz5_RcOJV2pVAQKvK2UbAd1Uh6j0pyo.; SUB=_2A25zQaQBDeRhGeBM71cR8SzNzzuIHXVQzcxJrDV6PUJbktAKLXD-kW1NRPYJXhsrLRnku_WvhsXi81eY0FM2oTtt; SUHB=0mxU9Kb_Ce6s6S; SSOLoginState=1581634641; WEIBOCN_FROM=1110106030; XSRF-TOKEN=dc7c27; M_WEIBOCN_PARAMS=oid%3D4471980021481431%26luicode%3D20000061%26lfid%3D4471980021481431%26uicode%3D20000061%26fid%3D4471980021481431',
            'Referer': 'https://m.weibo.cn/detail/4312409864846621',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest'
        }
        self.comments_ID = []
        self.page = page

    @staticmethod
    def _first_match(pattern, text):
        """Return group 1 of the first match of *pattern* in *text*.

        Raises:
            LookupError: if the pattern does not occur in *text*.
        """
        found = re.search(pattern, text)
        if found is None:
            raise LookupError('pattern not found: ' + pattern)
        return found.group(1)

    @classmethod
    def _date_and_time(cls, created_at):
        """Split a ``created_at`` string into ``(YYYY/MM/DD, HH:MM:SS)``.

        The input is split on single spaces; the year is taken from the last
        token, the day from index 2 and the time from index 3, e.g.
        ``'Sat Feb 15 10:20:30 +0800 2020'`` -> ``('2020/02/15', '10:20:30')``.
        When no month abbreviation is found (or the string is too short) the
        existing warning is printed and blank placeholders are returned, so the
        caller can still write the row.
        """
        parts = created_at.split(' ')
        month = next(
            (cls._MONTH_NUMBERS[part] for part in parts if part in cls._MONTH_NUMBERS),
            None,
        )
        if month is None or len(parts) < 4:
            print('该时间不在疫情范围内,估计数据有误!')
            return " ", " "
        return "{}/{}/{}".format(parts[-1], month, parts[2]), parts[3]

    def get_title_id(self):
        """Collect post ids from the trending feed into ``self.comments_ID``.

        Requests feed pages ``1 .. self.page - 1``, sleeping one second before
        each request. A page that cannot be fetched or parsed is reported and
        skipped. Side effect: ``self.headers`` is replaced by a dict holding
        only a random Chrome User-Agent, which later requests then reuse.
        """
        # Build the generator once; each `.chrome` access yields a user agent.
        user_agents = UserAgent()
        for page in range(1, self.page):
            self.headers = {
                "User-Agent": user_agents.chrome
            }
            time.sleep(1)
            api_url = 'https://m.weibo.cn/api/feed/trendtop?containerid=102803_ctg1_600059_-_ctg1_600059&page=' + str(page)
            try:
                rep = requests.get(url=api_url, headers=self.headers,
                                   timeout=self.REQUEST_TIMEOUT)
                page_ids = [status['id'] for status in rep.json()['data']['statuses']]
            except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
                print('error', exc.args)
                continue
            self.comments_ID.extend(page_ids)

    def spider_title(self, id):
        """Scrape one post's detail page and write its row to the CSV.

        Args:
            id: Post id as collected by ``get_title_id``.

        Returns:
            Number of comment pages to fetch for this post (comment count
            divided by ``COMMENTS_PER_PAGE``, rounded up), or 0 when the page
            could not be fetched or parsed.
        """
        title_url = 'https://m.weibo.cn/detail/' + str(id)
        try:
            html_text = requests.get(url=title_url, headers=self.headers,
                                     timeout=self.REQUEST_TIMEOUT).text
            # Post body, with embedded HTML tags removed.
            title = self._first_match(r'"text": "(.*?)",', html_text)
            text = self._TAG_RE.sub('', title)
            # Author id, nickname and gender.
            user_id = self._first_match(r'"id": "(.*?)",', html_text)
            user_nicname = self._first_match(r'"screen_name": "(.*?)",', html_text)
            user_gender = self._first_match(r'"gender": "(.*?)",', html_text)
            # Publication date and time of day.
            created_at = self._first_match(r'"created_at": "(.*?)",', html_text)
            title_created_YMD, add_title_time = self._date_and_time(created_at)
            # Repost, comment and like counters (kept as scraped strings).
            reposts_count = self._first_match(r'"reposts_count": (.*?),', html_text)
            comments_count = self._first_match(r'"comments_count": (.*?),', html_text)
            attitudes_count = self._first_match(r'"attitudes_count": (.*?),', html_text)
            comments_total = int(comments_count)
        except (requests.RequestException, LookupError, ValueError) as exc:
            print('error', exc.args)
            return 0

        position1 = (title_url, text, user_id, user_nicname, user_gender, title_created_YMD,
                     add_title_time, reposts_count, comments_count, attitudes_count,
                     " ", " ", " ", " ", " ", " ")
        print(title_url, text, user_id, user_nicname, user_gender, title_created_YMD,
              add_title_time, reposts_count, comments_count, attitudes_count)
        self.writer.writerow(position1)
        # Round up so that a partly filled last page (including posts with
        # fewer than COMMENTS_PER_PAGE comments) is still fetched.
        return -(-comments_total // self.COMMENTS_PER_PAGE)

    def get_page(self, id, max_id, id_type):
        """Fetch one page of hot comments for a post.

        Args:
            id: Post id (sent as both ``id`` and ``mid``).
            max_id: Paging cursor from the previous page (0 for the first page).
            id_type: ``max_id_type`` value from the previous page.

        Returns:
            The decoded JSON response, or None when the request fails, the
            status is not 200, or the body is not valid JSON.
        """
        # All query arguments go through `params` so the URL is well formed.
        params = {
            'id': id,
            'mid': id,
            'max_id': max_id,
            'max_id_type': id_type
        }
        url = 'https://m.weibo.cn/comments/hotflow'
        try:
            r = requests.get(url, params=params, headers=self.headers,
                             timeout=self.REQUEST_TIMEOUT)
            if r.status_code == 200:
                return r.json()
        except (requests.RequestException, ValueError) as e:
            print('error', e.args)
        return None

    def parse_page(self, jsondata):
        """Extract the paging cursor from a comment-page response.

        Returns:
            ``{'max_id': ..., 'max_id_type': ...}``, or None when *jsondata*
            is empty or does not carry both values under ``data``.
        """
        if not isinstance(jsondata, dict):
            return None
        items = jsondata.get('data')
        if not isinstance(items, dict):
            return None
        if 'max_id' not in items or 'max_id_type' not in items:
            return None
        return {'max_id': items['max_id'], 'max_id_type': items['max_id_type']}

    def write_csv(self, jsondata):
        """Write one CSV row per comment found in a comment-page response.

        Responses without a ``data.data`` list (None, error payloads) are
        ignored. The ten post columns of each row are left blank.
        """
        if not isinstance(jsondata, dict):
            return
        payload = jsondata.get('data')
        if not isinstance(payload, dict):
            return
        for comment in payload.get('data') or []:
            user = comment['user']
            # Comment body, with embedded HTML tags removed.
            comment_text = self._TAG_RE.sub('', comment['text'])
            created_YMD, created_time = self._date_and_time(comment['created_at'])
            position2 = (" ",) * 10 + (
                user['id'], user['screen_name'], user['gender'],
                created_YMD, created_time, comment_text)
            self.writer.writerow(position2)

    def _crawl_comments(self, comment_ID, max_page):
        """Fetch and store up to *max_page* comment pages for one post."""
        m_id = 0
        id_type = 0
        for _ in range(max_page):
            jsondata = self.get_page(comment_ID, m_id, id_type)
            results = self.parse_page(jsondata)
            if results is None:
                # Request failed or the payload carries no paging cursor.
                break
            try:
                self.write_csv(jsondata)
            except (KeyError, TypeError) as exc:
                # Malformed comment entry: report it and give up on this post.
                print('error', exc.args)
                break
            m_id = results['max_id']
            id_type = results['max_id_type']
            if not m_id:
                # A zero cursor means there are no further pages.
                break
            time.sleep(1)

    def main(self):
        """Run the whole crawl: collect post ids, then scrape each post and its comments.

        The CSV file is closed when the crawl ends, whether or not it
        finished normally.
        """
        try:
            self.get_title_id()
            count_title = len(self.comments_ID)
            for count, comment_ID in enumerate(self.comments_ID):
                print("正在爬取第%s个话题,一共找到个%s话题需要爬取" % (count + 1, count_title))
                # Number of comment pages to request for this post.
                maxPage = self.spider_title(comment_ID)
                self._crawl_comments(comment_ID, maxPage)
                print("--------------------------分隔符---------------------------")
        finally:
            self.csvfile.close()


if __name__ == '__main__':
    # Script entry point: crawl with a page bound of 15 and report how long
    # the whole run took, in minutes.
    began_at = time.time()
    WeiBoSpider(15).main()
    elapsed_minutes = (time.time() - began_at) / 60
    print("该次所获的信息一共使用%s分钟" % elapsed_minutes)
# 原文地址: https://www.cnblogs.com/zhouzetian/p/12569176.html