Scraping Baidu Tieba with Scrapy + lxml.etree

Analysis: extracting the content with Scrapy's built-in XPath comes back empty (likely because Tieba embeds the thread list inside an HTML comment, which the selector skips), so that route is out.
   Instead, use a regex (re) to match out the block of <li> tags, i.e. all the content we need to extract.
   Then turn that fragment into an 'lxml.etree._Element' via resultTree = lxml.etree.HTML(articleBody),
   and extract from it with resultTree.xpath().
   Note that xpath here is not quite the same as Scrapy's xpath; see the sketch below.
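To make the difference concrete, here is a minimal standalone sketch (the fragment and values are made up for illustration): lxml's xpath() returns plain Python objects (elements and strings) directly, whereas Scrapy's response.xpath() returns Selector objects that still need .extract() / .get().

import lxml.etree

fragment = '<ul><li class="j_thread_list" data-field=\'{"id": 1}\'>hello</li></ul>'
tree = lxml.etree.HTML(fragment)       # wraps the fragment in <html><body> and parses it
print(tree.xpath('//li/@data-field'))  # ['{"id": 1}'] -- plain strings, no .extract() needed
print(tree.xpath('//li/text()'))       # ['hello']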

# -*- coding: utf-8 -*-
import scrapy
from ..settings import MAX_PAGE
from ..items import TiebaBaiduItem
import re
import lxml.etree
import json


class TiebaSpider(scrapy.Spider):

    name = 'tieba'
    allowed_domains = ['tieba.baidu.com']
    start_urls = ['https://tieba.baidu.com/f?kw=%E9%83%91%E5%AE%B8&ie=utf-8&pn={}'.format(str(page * 50)) for page in range(MAX_PAGE + 1)]

    def parse(self, response):

        # The key step: regex out the chunk of HTML we actually need, i.e.
        # pull out the whole block of <li> tags below
        articleBodyRe = re.search('<ul id="thread_list" class="threadlist_bright j_threadlist_bright">(.*?)<div class="thread_list_bottom clearfix">', response.text, re.DOTALL)
        articleBody = ''
        if articleBodyRe:
            articleBody = articleBodyRe.group(1)
        else:
            # nothing matched (layout change or bad response), so skip this page;
            # lxml.etree.HTML('') would otherwise fail on the empty string below
            return
        # Turn the fragment into an element tree with lxml.etree.HTML(articleBody),
        # then extract from it with xpath
        # Note: xpath here works slightly differently from the xpath Scrapy uses;
        # this is how xpath is used in the lxml module
        resultTree = lxml.etree.HTML(articleBody)

        articleList = resultTree.xpath('//li[contains(@class,"j_thread_list")]')
        for articleElem in articleList:
            articleInfo = {}
            data_field = articleElem.xpath("@data-field")[0]
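            # data-field is an HTML-escaped JSON attribute describing the thread;
            # lxml unescapes it when reading the attribute, so it can be passed
            # straight to json.loads() (keys used here: 'id', 'author_name')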
            dataFieldJson = json.loads(data_field)
            articleInfo['id'] = dataFieldJson['id']
            articleInfo['author'] = dataFieldJson['author_name']
            articleInfo['title'] = articleElem.xpath(".//div[@class='t_con cleafix']//a/@title")[0]
            articleInfo['href'] = articleElem.xpath(".//div[@class='t_con cleafix']//a/@href")[0]
            yield response.follow(
                url = articleInfo['href'] + "?see_lz=1",
                meta={'dont_redirect': True, 'articleInfo': articleInfo},
                callback = self.parseArticleDetail,
                errback = self.errorHandle
            )

    def parseArticleDetail(self, response):
        print(
            f"parseArticleDetail: statusCode = {response.status}, url = {response.url}")
        contentLst = response.xpath(
            "//div[contains(@id, 'post_content')]//text()").extract()
        imgHrefLst = response.xpath(
            "//div[contains(@id, 'post_content')]//img/@src").extract()
        dateLst = response.xpath(
            "//div[contains(@class, 'post_content_firstfloor')]//span[@class='tail-info']/text()").extract()
        content = ''
        for contentElem in contentLst:
            content += contentElem.replace('\n', ',').replace(" ", '').strip()
            content += ', '
        print(f"content = {content}")
        print(f"imgHrefLst = {imgHrefLst}")
        articleInfo = response.meta['articleInfo']
        articleItem = TiebaBaiduItem()
        articleItem['item_type'] = 'articleDetail'
        articleItem['_id'] = articleInfo['id']
        articleItem['title'] = articleInfo['title']
        articleItem['author'] = articleInfo['author']
        articleItem['content'] = content
        articleItem['fromUrl'] = response.url
        articleItem['picHrefLst'] = imgHrefLst
        # guard against pages where the tail-info date span is missing
        articleItem['date'] = dateLst[1] if len(dateLst) > 1 else ''
        yield articleItem

    # Request error handling: print it, write it to a file, or store it in a database
    def errorHandle(self, failure):
        # failure.value is the exception itself; not every failure carries a .response
        print(f"request error: {failure.value}")
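For reference, a minimal sketch of the two pieces the spider imports; the original post does not show items.py or settings.py, so the field names below are inferred from the assignments in parseArticleDetail, and MAX_PAGE's value is a placeholder:

# items.py (assumed): one Field per key assigned to the item above
import scrapy

class TiebaBaiduItem(scrapy.Item):
    item_type = scrapy.Field()
    _id = scrapy.Field()
    title = scrapy.Field()
    author = scrapy.Field()
    content = scrapy.Field()
    fromUrl = scrapy.Field()
    picHrefLst = scrapy.Field()
    date = scrapy.Field()

# settings.py (assumed): number of list pages to crawl; each page holds 50 threads
MAX_PAGE = 10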

Original post: https://www.cnblogs.com/hyxailj/p/9156547.html