Filtering already-visited URLs while crawling

1. Create a filter-rules file, filter.py, at the same level as the spiders directory:

class RepeatUrl:
    def __init__(self):
        # Set of URLs that have already been scheduled
        self.visited_url = set()

    @classmethod
    def from_settings(cls, settings):
        """
        Called at initialization time to build the filter from the crawler settings.
        :param settings: the Scrapy settings object
        :return: a filter instance
        """
        return cls()

    def request_seen(self, request):
        """
        Check whether the current request has already been visited.
        :param request: the request being scheduled
        :return: True if the URL was already visited; False otherwise
        """
        if request.url in self.visited_url:
            return True
        self.visited_url.add(request.url)
        return False

    def open(self):
        """
        Called when the spider starts crawling.
        :return:
        """
        print('open replication')

    def close(self, reason):
        """
        Called when the spider finishes crawling.
        :param reason: why the crawl was closed
        :return:
        """
        print('close replication')

    def log(self, request, spider):
        """
        Log a duplicate request.
        :param request:
        :param spider:
        :return:
        """
        print('repeat', request.url)
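
The filter can be exercised on its own before wiring it into Scrapy. Below is a minimal sketch; it assumes Scrapy is installed and that the project module is importable as sp1.filter, and the example URL is made up:

from scrapy.http import Request

from sp1.filter import RepeatUrl

rf = RepeatUrl()
req = Request(url='http://example.com/page')
print(rf.request_seen(req))   # False: first time this URL is seen
print(rf.request_seen(req))   # True: a duplicate; Scrapy would drop it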

2. Point Scrapy at the filter class in settings.py:

# Register the custom dedupe filter
DUPEFILTER_CLASS = 'sp1.filter.RepeatUrl'

# DUPEFILTER_DEBUG = False
# JOBDIR = "directory for persisting the visited-requests log, e.g. /root/"   # the file ends up at /root/requests.seen
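
Note that individual requests can still opt out of deduplication: Scrapy skips the dupefilter entirely for any request created with dont_filter=True. A minimal sketch of this behavior (the spider name and URL below are hypothetical):

import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'                      # hypothetical spider name
    start_urls = ['http://example.com']   # hypothetical start URL

    def parse(self, response):
        # dont_filter=True bypasses DUPEFILTER_CLASS, so this request is
        # scheduled even though its URL has already been seen.
        yield scrapy.Request(response.url, callback=self.parse, dont_filter=True)
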
Source: https://www.cnblogs.com/baolin2200/p/8552366.html