爬虫小程序之爬取百度贴吧图片

利用Python第三方库请求库requests和解析库lxml等工具爬取百度贴吧任意贴吧名里的所有图片:

要求如下:

  1、编程范式——面向对象

  2、采取简单的反反爬措施:如请求时间不宜过于频繁、请求头中的User-Agent要隐藏爬取工具且随机生成User-Agent规避反爬

  3、只爬取吧主发布的图片,其他图片禁止爬取

代码如下:

  

import requests
from lxml import etree
import os
import time
from fake_useragent import UserAgent
import warnings
import random

warnings.filterwarnings('ignore')


class BaiduSpider(object):
    """Scrape images posted by the thread starter in a given Baidu Tieba forum.

    For each listing page of the forum, the spider collects thread links,
    then downloads every first-floor (original poster) image into a local
    directory named after the forum keyword.
    """

    def __init__(self, keyword, page_number):
        # Forum search endpoint; the forum name and page offset go in params.
        self.url = 'http://tieba.baidu.com/'
        # Random User-Agent to hide the crawling tool and reduce blocking.
        self.useragent = UserAgent()
        self.headers = {'User-Agent': self.useragent.random}
        self.keyword = keyword          # forum name, also the output directory
        self.page_number = page_number  # number of listing pages to crawl

    # Fetch one listing page and visit every thread link found on it.
    def get_tlink(self, data):
        res = requests.get(self.url, headers=self.headers, params=data)
        res.encoding = 'utf-8'
        html = res.text
        # Tieba wraps the thread list in HTML comments; strip the markers so
        # lxml can parse the real markup.
        html = html.replace(r"<!--", '').replace(r"-->", '')
        parse_html = etree.HTML(html)
        t_list = parse_html.xpath(
            '//ul[@id="thread_list"]/li[@class=" j_thread_list clearfix"]/div//a/@href')
        for t in t_list:
            # Hrefs are site-relative; build the absolute thread URL.
            t_link = 'http://tieba.baidu.com' + t
            # Request the thread page, extract image links, save each image.
            self.get_ilink(t_link)

    # Extract first-floor image URLs from one thread and save each image.
    def get_ilink(self, t_link):
        res = requests.get(t_link, headers=self.headers)
        res.encoding = 'utf-8'
        html = res.text
        parse_html = etree.HTML(html)
        # Restrict to the first floor so only the original poster's images
        # are collected (per the stated requirement).
        i_list = parse_html.xpath(
            '//div[@class="d_post_content_main d_post_content_firstfloor"]//div[@class="d_post_content j_d_post_content  clearfix"]/img[@class="BDE_Image"]/@src')
        for i in i_list:
            # BUG FIX: keyword argument was misspelled 'heasers', which made
            # every image request raise TypeError.
            html = requests.get(i, headers=self.headers).content
            self.write_image(html, i)

    # Save one image's raw bytes under <keyword>/<last 10 chars of its URL>.
    def write_image(self, html, i):
        filename = os.path.join(self.keyword, i[-10:])
        with open(filename, 'wb') as f:
            f.write(html)

    def main(self):
        """Crawl all requested listing pages with a random pause between them."""
        # BUG FIX: the original removed a *file* named after the keyword but
        # never created the output directory, so write_image failed with
        # FileNotFoundError. Ensure the directory exists instead.
        os.makedirs(self.keyword, exist_ok=True)
        for i in range(1, self.page_number + 1):
            data = {
                'kw': self.keyword,
                'pn': str((i - 1) * 50)  # Tieba lists 50 threads per page
            }
            self.get_tlink(data)
            print('第%d页下载完毕' % i)
            # Random delay to keep request frequency low (anti-anti-crawl).
            time.sleep(random.randint(1, 10))


if __name__ == "__main__":
    spider = BaiduSpider('高考吧', 1)
    spider.main()

  

原文地址:https://www.cnblogs.com/yuxiangyang/p/11093417.html