Scraping Baidu Tieba with Python

The crawler works as follows:

1. Send a GET request to the start URL and receive the response.

2. From the response in (1), use a regular expression to extract each post's title and URL, then request each post page and get its response.

3. From the response in (2), use a regular expression to extract each img URL, then request each image and get its response.

4. Save the response body from (3) to disk as an image file.

5. From the response in (1), use a regular expression to extract the next-page URL; if one is found, repeat the steps above, otherwise the crawl ends. (A minimal sketch of steps 1 and 2 follows this list.)
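Before the full script, here is a minimal sketch of steps 1 and 2 in isolation. It assumes the same list-page markup the script below targets (the regex is a shortened version of the one used there), and Tieba's HTML may have changed since this was written:

import re
import requests

headers = {'User-Agent': 'Mozilla/5.0'}
# Step 1: GET the board's list page for a keyword
resp = requests.get('https://tieba.baidu.com/f',
                    headers=headers,
                    params={'kw': 'python', 'ie': 'utf-8'})
# Step 2: pull (post URL, title) pairs out of the HTML
for url, title in re.findall(r'<a rel="noreferrer" href="(/p/\d+?)" title="(.+?)"', resp.text):
    print(title, 'https://tieba.baidu.com' + url)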

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2021/8/24 22:28
# @Author : Lhtester
# @Site : 
# @File : tieba.py
# @Software: PyCharm

import requests
import re
import time
import random

class TiebaSpider:
    '''Tieba crawler'''
    def __init__(self):
        self.kw = input('Keyword> ')
        self.base_url = 'https://tieba.baidu.com/f'
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36"}
        self.page_num = 1
        self.title = ''

    def parse_text(self, url, params=None):
        '''Send a request and return the response body as text'''
        # Sleep for a random interval so the requests look less like a bot
        time.sleep(random.randint(1, 5))
        req = requests.get(url, headers=self.headers, params=params)
        return req.text

    def parse_byte(self, url, params=None):
        '''Send a request and return the response body as raw bytes'''
        time.sleep(random.random() * 2)
        req = requests.get(url, headers=self.headers, params=params)
        return req.content

    def page(self, content):
        '''Parse one list page'''
        print('Crawling page {}...'.format(self.page_num))
        self.page_num += 1
        # The pattern has three capture groups, so findall returns a list of 3-tuples
        url_title = re.findall(
            r'<a rel="noreferrer" href="(/p/\d+?)" title="(.+?)" target="_blank" class="j_th_tit ">(.+?)</a>', content
        )
        for url, title, title2 in url_title:  # unpack all three captured values
            self.title = title
            self.detail('https://tieba.baidu.com' + url)

            # Save the post title
            self.save_title()

        # Look for a next-page link
        next_url = re.findall(r'<a href="(.*?)" .*?>下一页&gt;</a>', content)
        if self.page_num > 10:  # stop after 10 pages (page_num was already advanced past the page just crawled)
            print('Reached page 10, stopping')
        elif next_url:
            print('next_url:', next_url)
            # The href is protocol-relative ('//tieba.baidu.com/...'), so prepend the scheme
            next_url = 'https:' + next_url[0]
            content = self.parse_text(url=next_url)
            self.page(content)
        else:
            print('Crawl finished')

    def detail(self, url):
        '''Parse one post's detail page'''
        content = self.parse_text(url=url)
        # Filter on pic_type so that only user-uploaded images are kept,
        # not the many interface images Baidu serves on the same page
        urls = re.findall(r'<img class="BDE_Image" pic_type="(\d)" .*? src="(.*?)" .*?>', content)
        for _, url in urls:
            self.save_img(url=url)

    def save_title(self):
        '''Append the post title to a text file'''
        print('Saving post title')

        with open('../image/tieba_{}.txt'.format(self.kw), 'a', encoding='utf-8') as file:
            file.write(self.title)
            file.write('\n')

    def save_img(self, url):
        '''Download and save one image'''
        content = self.parse_byte(url=url)
        image_path = '{}_{}'.format(self.title, url[url.rfind('/') + 1:])
        # Windows filenames cannot contain the characters \ / : * ? " < > |,
        # so strip them all. Inside the character class the backslash itself
        # must be written as '\\', because the backslash is the escape
        # character: it combines with the next character to form an escape
        # sequence (as in '\n' or '\t'), so a literal one has to be doubled.
        image_path = re.sub(r'[\\/:*?"<>|]', '', image_path)
        image_path = '../image/{}'.format(image_path)  # prepend the output directory

        print('Saving image')
        with open(image_path, 'wb') as file:
            file.write(content)

    def start(self):
        '''Kick off the crawl'''
        print('Crawl started')
        content = self.parse_text(url=self.base_url, params={'kw': self.kw, 'ie': 'utf-8', 'fr': 'search'})
        self.page(content)

if __name__ == '__main__':
    spider = TiebaSpider()
    spider.start()
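One practical note: every output path in the script points into a ../image/ directory relative to the working directory, and open() will fail if that directory does not exist. A small guard, not part of the original script, could be run first:

import os
os.makedirs('../image', exist_ok=True)

After that, run python tieba.py, enter a keyword at the prompt, and the crawler walks up to 10 list pages, saving post titles and images as it goes.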
Original article: https://www.cnblogs.com/anhao-world/p/15183128.html