Crawlers: incremental crawling

- Incremental crawler
    - Concept: monitor a website for updates and crawl only the newly published data.
    - How do we make crawling incremental?
        - Deduplication!
- Movie site: the data to crawl is not on a single page
    - Record the URL of every movie detail page that has been crawled
    - Each time the spider runs, check the detail-page URLs that are about to be crawled against that record
    - The detail-page URLs can be kept in a Python set or, better, in a Redis set (which persists across runs)
    - All the crawled movie data can then be stored in Redis


- Monitoring the data on the page behind a single URL
    - Data fingerprint: a unique identifier computed from a piece of data (a minimal sketch follows below)
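
The following is a minimal sketch of set-based deduplication with a data fingerprint, assuming a local Redis on the default port; the function name is_new and the key data_fingerprints are illustrative only and do not appear in the projects below:

import hashlib
from redis import Redis

conn = Redis(host='127.0.0.1', port=6379)

def is_new(record: str) -> bool:
    # the fingerprint is a SHA-256 digest of the record
    fingerprint = hashlib.sha256(record.encode()).hexdigest()
    # sadd returns 1 if the member was added (new data), 0 if it already existed
    return conn.sadd('data_fingerprints', fingerprint) == 1

if is_new('some author' + 'some content'):
    print('new data, crawl it')
else:
    print('already seen, skip it')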

Case: the data is not on a single page

Project files:

movie.py:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis
from zls_movie_Pro.items import ZlsMovieProItem
class MovieSpider(CrawlSpider):
    conn = Redis(host='127.0.0.1',port=6379)
    name = 'movie'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.4567kan.com/index.php/vod/show/id/5.html']

    rules = (
        Rule(LinkExtractor(allow=r'/index\.php/vod/show/id/5/page/\d+\.html'), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        # movie name and detail page URL
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            name = li.xpath('.//div[@class="stui-vodlist__detail"]/h4/a/text()').extract_first()

            item = ZlsMovieProItem()
            item['name'] = name

            detail_url = 'http://www.4567kan.com'+li.xpath('.//div[@class="stui-vodlist__detail"]/h4/a/@href').extract_first()
            # ex == 1: the URL was newly inserted; ex == 0: it already existed (the Redis set handles deduplication)
            ex = self.conn.sadd('movie_detail_urls',detail_url)

            if ex == 1:
                print('New data found, crawling it......')
                yield scrapy.Request(url=detail_url,callback=self.parse_detail,meta={'item':item})
            else:
                print('No new data to crawl!')
    def parse_detail(self,response):
        movie_desc = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract_first()
        item = response.meta['item']
        item['desc'] = movie_desc

        yield item
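
Usage note: assuming the project was generated with scrapy startproject zls_movie_Pro and the spider with scrapy genspider -t crawl movie, the spider is started with scrapy crawl movie. On the first run every detail URL is new and gets crawled; on later runs only URLs not yet present in the movie_detail_urls set are followed.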

items.py:

import scrapy


class ZlsMovieProItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    desc = scrapy.Field()
    # pass

pipelines.py, which stores the data in Redis:

import json

class ZlsMovieProPipeline(object):
    def process_item(self, item, spider):
        conn = spider.conn  # reuse the Redis connection created on the spider
        # Redis values must be str/bytes/numbers, so serialize the item to JSON first
        conn.lpush('movie_data', json.dumps(dict(item), ensure_ascii=False))
        return item
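
The pipeline only runs if it is enabled in settings.py. A minimal sketch, assuming the default module path generated by Scrapy (the priority value 300 is arbitrary; depending on the site you may also need to set ROBOTSTXT_OBEY = False and a USER_AGENT):

# settings.py (excerpt)
ITEM_PIPELINES = {
    'zls_movie_Pro.pipelines.ZlsMovieProPipeline': 300,
}

Afterwards the stored movies can be inspected from redis-cli with LRANGE movie_data 0 -1, and the recorded detail URLs with SMEMBERS movie_detail_urls.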

Case: the data is on a single page

- Requirement: crawl the jokes and their authors from Qiushibaike.

Spider file:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from incrementByDataPro.items import IncrementbydataproItem
from redis import Redis
import hashlib
class QiubaiSpider(CrawlSpider):
    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    rules = (
        Rule(LinkExtractor(allow=r'/text/page/\d+/'), callback='parse_item', follow=True),
        Rule(LinkExtractor(allow=r'/text/$'), callback='parse_item', follow=True),
    )
    # create the Redis connection object
    conn = Redis(host='127.0.0.1',port=6379)
    def parse_item(self, response):
        div_list = response.xpath('//div[@id="content-left"]/div')

        for div in div_list:
            item = IncrementbydataproItem()
            item['author'] = div.xpath('./div[1]/a[2]/h2/text() | ./div[1]/span[2]/h2/text()').extract_first()
            item['content'] = div.xpath('.//div[@class="content"]/span/text()').extract_first()

            # build a unique fingerprint for the parsed record before storing it in Redis
            source = item['author']+item['content']
            source_id = hashlib.sha256(source.encode()).hexdigest()
            # store the fingerprint in the Redis set data_id
            ex = self.conn.sadd('data_id',source_id)

            if ex == 1:
                print('This record has not been crawled yet, crawling it......')
                yield item
            else:
                print('This record has already been crawled, skipping it!!!')

Pipeline file:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json
from redis import Redis
class IncrementbydataproPipeline(object):
    conn = None

    def open_spider(self, spider):
        self.conn = Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        dic = {
            'author': item['author'],
            'content': item['content']
        }
        # print(dic)
        # serialize the dict to JSON, since Redis values must be str/bytes/numbers
        self.conn.lpush('qiubaiData', json.dumps(dic, ensure_ascii=False))
        return item
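
To read the stored records back, a minimal standalone sketch (the key names match the ones used above):

import json
from redis import Redis

conn = Redis(host='127.0.0.1', port=6379)

# number of distinct fingerprints seen so far
print(conn.scard('data_id'))

# most recent records come first, since lpush prepends
for raw in conn.lrange('qiubaiData', 0, -1):
    record = json.loads(raw)
    print(record['author'], record['content'])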
Original article: https://www.cnblogs.com/lulin9501/p/11341279.html