Python爬虫_百度贴吧

# 本爬虫为爬取百度贴吧并存储HTML

import requests


class TiebaSpider:
    """Fetch the listing pages of one Baidu Tieba forum and save each as HTML.

    Each page is requested from ``https://tieba.baidu.com/f`` with the forum
    name in the ``kw`` query parameter and an offset in ``pn``, and the
    response body is written to ``<tieba_name>-第<page>页.html`` in the
    current working directory.

    Args:
        tieba_name: Forum name inserted verbatim into the ``kw`` parameter.
        page_count: Number of listing pages to request (default 1000, the
            value the crawler has always used).
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled connection cannot block the crawl forever.
    """

    # The ``pn`` offset grows by this amount from one listing page to the
    # next (presumably the number of threads per page -- confirm with site).
    PAGE_STEP = 50

    def __init__(self, tieba_name, page_count=1000, timeout=10):
        self.tieba_name = tieba_name
        self.page_count = page_count
        self.timeout = timeout
        # "{}" is left in the template for the pn offset, filled per page.
        self.url = "https://tieba.baidu.com/f?kw=" + tieba_name + "&ie=utf-8&pn={}"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"
        }

    def get_url_list(self):
        """Build and return the list of page URLs, in page order."""
        return [
            self.url.format(page_index * self.PAGE_STEP)
            for page_index in range(self.page_count)
        ]

    def parse_url(self, url):
        """Send a GET request to *url* and return the response body as text.

        The raw bytes are decoded with ``bytes.decode()``'s default codec
        (UTF-8, strict), so a body that is not valid UTF-8 raises
        ``UnicodeDecodeError``. Network failures and timeouts surface as the
        exceptions raised by ``requests.get``.
        """
        res = requests.get(url, headers=self.headers, timeout=self.timeout)
        return res.content.decode()

    def save_html(self, html_str, page_num):
        """Write *html_str* to the UTF-8 file for page *page_num*."""
        file_path = "{}-第{}页.html".format(self.tieba_name, page_num)
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(html_str)

    def run(self):
        """Fetch every page, save it to disk, and print the URL just handled."""
        # enumerate supplies the 1-based page number directly; looking it up
        # with list.index() would rescan the list on every iteration.
        for page_num, url in enumerate(self.get_url_list(), start=1):
            html_str = self.parse_url(url)
            self.save_html(html_str, page_num)
            print(url)


if __name__ == "__main__":
    tieba_spider = TiebaSpider("lol")
    tieba_spider.run()
原文地址:https://www.cnblogs.com/waterr/p/13893578.html