Case 3: Baidu Tieba Spider

import os

import requests


class TiebaSpider:
    """贴吧爬虫"""
    def __init__(self, keywords):
        # 贴吧名称
        self.kw = keywords
        # 目标地址
        self.url = "https://tieba.baidu.com/f?ie=utf-8"
        # 伪装请求
        self.headers = {
            # Any common desktop browser User-Agent string works here
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/90.0.4430.212 Safari/537.36"
        }

    def get_data(self, start_page, end_page):
        """
        Crawl the listing pages and save each one to disk.
        :param start_page: first page to crawl (1-based)
        :param end_page: last page to crawl (inclusive)
        """
        for i in range(start_page, end_page + 1):
            # Query parameters: pn is the post offset, 50 posts per page
            ps = {"kw": self.kw, "pn": (i - 1) * 50}
            # Send a GET request; requests appends the parameters to the URL
            response = requests.get(self.url, params=ps, headers=self.headers)
            # Save the raw page
            file_name = f"tieba_{i}.html"
            self._save_data(file_name, response.content)

    def _save_data(self, file_name, content):
        """Write the raw bytes to a file under the data/ directory."""
        # Create the output directory if it does not exist yet
        os.makedirs("data", exist_ok=True)
        with open(f"data/{file_name}", mode="wb") as file:
            file.write(content)


if __name__ == "__main__":
    # Create the spider object
    tb = TiebaSpider("王者荣耀")
    # Crawl listing pages 1 through 2
    tb.get_data(1, 2)
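
Because the parameters are passed via params, requests merges them into the query string of self.url, so each request actually hits a URL of the form https://tieba.baidu.com/f?ie=utf-8&kw=...&pn=0, with pn stepping by 50 per page. A minimal standalone sketch (reusing the example forum name from above) to preview the URL that requests would send:

from requests.models import PreparedRequest

# Sketch only: build the URL exactly as requests.get(url, params=...) would
req = PreparedRequest()
req.prepare_url("https://tieba.baidu.com/f?ie=utf-8",
                {"kw": "王者荣耀", "pn": 0})
print(req.url)
# https://tieba.baidu.com/f?ie=utf-8&kw=%E7%8E%8B%E8%80%85%E8%8D%A3%E8%80%80&pn=0

This is why get_data computes (i - 1) * 50: page 1 starts at offset 0, page 2 at offset 50, and so on.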
Original article: https://www.cnblogs.com/duxiangjie/p/13924897.html