Example of crawling Tieba (excerpt)

The spider below builds the page URLs for a given Tieba (Baidu forum), downloads the first three pages, and saves each page's HTML under /tmp/tieba/.

import os
import requests


class Spider:
    def __init__(self, name):
        self.name = name
        # URL template; pn is the offset of the first post on each page
        self.url_temp = "https://tieba.baidu.com/f?kw=" + name + "&ie=utf-8&pn={}"
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                                      "Chrome/75.0.3770.100 Safari/537.36"}
    def get_url_list(self):
        # Each results page holds 50 posts, so page i starts at offset i * 50
        return [self.url_temp.format(i * 50) for i in range(3)]

    def parse_url(self, url):
        # Fetch one page and return the response object
        response = requests.get(url, headers=self.headers)
        return response

    def save_html_str(self, html_str, page_num):
        # Make sure the output directory exists, then write the page HTML
        os.makedirs("/tmp/tieba", exist_ok=True)
        file_path = "/tmp/tieba/{}_page_{}.html".format(self.name, page_num)
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(html_str)
	
    def run(self):
        # Download and save every page; enumerate avoids the O(n) list.index lookup
        for page_num, url in enumerate(self.get_url_list(), start=1):
            html_str = self.parse_url(url).content.decode()
            self.save_html_str(html_str, page_num)

def main():
    name = input("Enter the name of the tieba (forum) to crawl: ")
    tieba_spider = Spider(name)
    tieba_spider.run()


if __name__ == "__main__":
    main()
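
The excerpt does no error handling: parse_url assumes every request succeeds and never times out. A minimal hardening sketch, assuming the same requests library (the fetch helper, retry count, and timeout values below are illustrative additions, not part of the original post):

import requests

def fetch(url, headers, retries=3, timeout=10):
    # Hypothetical helper: retry a GET a few times before giving up
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()  # raise on HTTP 4xx/5xx
            return response.content.decode()
        except requests.RequestException:
            if attempt == retries - 1:
                raise

With a helper like this, run() could call fetch(url, self.headers) in place of self.parse_url(url).content.decode().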
Original post: https://www.cnblogs.com/Haihong72H/p/13891657.html