Using a dupefilter to deduplicate visited URLs

    Step 1:
        In the spider file chouti.py:
            import scrapy
            from scrapy.http import Request
            from xdb.items import XdbItem
            from scrapy.dupefilters import RFPDupeFilter  # Scrapy's default dupefilter, shown here for reference

            class ChoutiSpider(scrapy.Spider):
                name = 'chouti'
                allowed_domains = ['chouti.com']
                start_urls = ['http://chouti.com/']
                # start_urls = ['http://127.0.0.1:80/app01/login/']

                def parse(self, response):
                    # print(response, type(response))
                    # print(response.text)
                    content_list = response.xpath('//div[@class="link-con"]//div[@class="link-detail"]')
                    for item in content_list:
                        text = item.xpath('./a/text()').extract_first()
                        href = item.xpath('./a/@href').extract_first()
                        yield XdbItem(text=text, href=href)
                        # print(href)
                    page_list = response.xpath('//div[@id="dig_lcpage"]//a/@href').extract()
                    for page in page_list:
                        page = "https://dig.chouti.com" + page
                        # Before scheduling, Scrapy calls request_seen() on the dupefilter (here XdbDupeFilter)
                        yield Request(url=page, callback=self.parse)
                        # yield Request(url=page, callback=self.parse, dont_filter=True)  # dont_filter=True bypasses the dedup check for this request
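        For reference, a minimal items.py sketch (assumed here, not shown in the original post) matching the fields used by yield XdbItem(text=text, href=href):
            import scrapy

            class XdbItem(scrapy.Item):
                text = scrapy.Field()
                href = scrapy.Field()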
        
    Step 2:
        Create a custom file dupefilters.py and write:
            from scrapy.dupefilters import BaseDupeFilter
            from scrapy.utils.request import request_fingerprint
            class XdbDupeFilter(BaseDupeFilter):
                def __init__(self):
                    self.visited_fd = set()

                @classmethod
                def from_settings(cls, settings):
                    return cls()

                def request_seen(self, request):
                    # Hash the request into a fixed-length fingerprint string
                    fd = request_fingerprint(request)
                    if fd in self.visited_fd:
                        return True  # True means this request was seen before, so it will not be visited again
                    self.visited_fd.add(fd)
                    return False  # a new request: record its fingerprint and let it through

                def open(self):  # can return deferred
                    # called when the spider starts
                    pass

                def close(self, reason):  # can return a deferred
                    # called when the spider finishes
                    pass

                def log(self, request, spider):  # log that a request has been filtered
                    pass
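        A quick standalone check (a sketch, assuming a Scrapy version where scrapy.utils.request.request_fingerprint is available, as used above) of how requests are hashed into fixed strings:
            from scrapy.http import Request
            from scrapy.utils.request import request_fingerprint

            r1 = Request(url="https://dig.chouti.com/all/hot/recent/1")
            r2 = Request(url="https://dig.chouti.com/all/hot/recent/1")
            r3 = Request(url="https://dig.chouti.com/all/hot/recent/2")

            print(request_fingerprint(r1))  # sha1 hex digest of the normalized request
            print(request_fingerprint(r1) == request_fingerprint(r2))  # True: same URL -> same fingerprint
            print(request_fingerprint(r1) == request_fingerprint(r3))  # False: different URL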
                    
    Step 3:
        Configure in settings.py:
            # Override the default dedup rule
            # DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'
            DUPEFILTER_CLASS = 'xdb.dupefilters.XdbDupeFilter'
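        To try it out, one option (a sketch, assuming the project package is named xdb and the spider module path below) is to run the spider programmatically so DUPEFILTER_CLASS is picked up from settings.py:
            from scrapy.crawler import CrawlerProcess
            from scrapy.utils.project import get_project_settings

            from xdb.spiders.chouti import ChoutiSpider  # assumed module path within the xdb project

            if __name__ == "__main__":
                process = CrawlerProcess(get_project_settings())  # loads DUPEFILTER_CLASS from settings.py
                process.crawl(ChoutiSpider)
                process.start()  # duplicate page URLs are dropped once XdbDupeFilter.request_seen returns True
        Running "scrapy crawl chouti" from the project directory works the same way.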
Original article: https://www.cnblogs.com/xiongfanyong/p/13089996.html