25. Deduplication rules in Scrapy and how to customize them

Although settings.py does not spell out the deduplication settings explicitly, the defaults are effectively the following:

DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'
DUPEFILTER_DEBUG = False
JOBDIR = "directory where the seen-request log is kept, e.g. /root/"     # the final path becomes /root/requests.seen
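
If you want to make these explicit, or turn on persistence and verbose duplicate logging, you can write them into settings.py yourself. A minimal sketch (the JOBDIR path is only an example):

# settings.py
DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'    # the built-in fingerprint filter
DUPEFILTER_DEBUG = True      # log every filtered duplicate, not just the first one
JOBDIR = '/root/'            # fingerprints are persisted to /root/requests.seen across runs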

You can replace the default by rewriting the code below and registering your own class in settings.py (a minimal custom filter is sketched after the listing):

DUPEFILTER_CLASS = 'replace with the import path of your own dedup filter class'
import os
import logging

from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.job import job_dir
from scrapy.utils.request import referer_str, request_fingerprint


class RFPDupeFilter(BaseDupeFilter):
    """Request Fingerprint duplicates filter"""

    def __init__(self, path=None, debug=False):
        self.file = None
        self.fingerprints = set()    # fingerprints are kept in a plain set by default; this can be overridden
        self.logdupes = True
        self.debug = debug
        self.logger = logging.getLogger(__name__)
        if path:
            self.file = open(os.path.join(path, 'requests.seen'), 'a+')
            self.file.seek(0)
            self.fingerprints.update(x.rstrip() for x in self.file)

    @classmethod
    def from_settings(cls, settings):
        # unlike from_crawler, which receives a crawler object, this receives the settings directly
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(job_dir(settings), debug)

    def request_seen(self, request):
        # Check whether a Request handed to the scheduler has been seen before: return True if it has;
        # otherwise record its fingerprint and fall through (the implicit return value is falsy)
        fp = self.request_fingerprint(request)
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)
        if self.file:
            self.file.write(fp + '\n')

    def request_fingerprint(self, request):
        # request_fingerprint defines what counts as a request's "fingerprint"
        # source: scrapy.utils.request.request_fingerprint
        """
        def request_fingerprint(request, include_headers=None, keep_fragments=False):
            if include_headers:    # optionally include selected request headers
                include_headers = tuple(to_bytes(h.lower()) for h in sorted(include_headers))
            cache = _fingerprint_cache.setdefault(request, {})
            cache_key = (include_headers, keep_fragments)
            if cache_key not in cache:
                fp = hashlib.sha1()    # SHA1 hash
                fp.update(to_bytes(request.method))
                fp.update(to_bytes(canonicalize_url(request.url, keep_fragments=keep_fragments)))
                fp.update(request.body or b'')
                if include_headers:
                    for hdr in include_headers:
                        if hdr in request.headers:
                            fp.update(hdr)
                            for v in request.headers.getlist(hdr):
                                fp.update(v)
                cache[cache_key] = fp.hexdigest()
            return cache[cache_key]
        """
        return request_fingerprint(request)

    def close(self, reason):
        if self.file:
            self.file.close()

    def log(self, request, spider):
        if self.debug:
            msg = "Filtered duplicate request: %(request)s (referer: %(referer)s)"
            args = {'request': request, 'referer': referer_str(request)}
            self.logger.debug(msg, args, extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request: %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            self.logdupes = False

        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
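
To see what the default fingerprint buys you, here is a small illustration (the URLs are made up): canonicalize_url sorts the query string, so two requests that differ only in parameter order produce the same fingerprint and the second one is filtered out.

from scrapy import Request
from scrapy.utils.request import request_fingerprint

r1 = Request('http://www.example.com/page?a=1&b=2')
r2 = Request('http://www.example.com/page?b=2&a=1')    # same parameters, different order

# both requests hash to the same SHA1 digest
print(request_fingerprint(r1) == request_fingerprint(r2))    # True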
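As mentioned above, replacing the filter means writing your own class and pointing DUPEFILTER_CLASS at it. A minimal sketch, assuming a project package named myproject (the module path and class name are illustrative): it keeps RFPDupeFilter's persistence and logging, but treats two requests as duplicates whenever their canonical URLs match, ignoring method, body and headers.

# myproject/dupefilters.py
from scrapy.dupefilters import RFPDupeFilter
from w3lib.url import canonicalize_url


class URLDupeFilter(RFPDupeFilter):
    """Dedupe on the canonical URL only."""

    def request_fingerprint(self, request):
        # request_seen() only compares these return values, so any stable
        # string will do as a fingerprint; here it is the canonical URL itself
        return canonicalize_url(request.url)

Register it in settings.py:

DUPEFILTER_CLASS = 'myproject.dupefilters.URLDupeFilter'
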
Original article: https://www.cnblogs.com/nuochengze/p/13377112.html