scrapy中间件

一、下载中间件

from scrapy import signals
from scrapy.http import Response
from scrapy.exceptions import IgnoreRequest
from AMAZON.proxy_handle import get_proxy,delete_proxy
# print('eeeeeeeeeeee',get_proxy())

class DownMiddleware1(object):
    """Downloader middleware that routes every outgoing request through a
    rotating HTTP proxy taken from the proxy pool (AMAZON.proxy_handle),
    and swaps in a fresh proxy whenever a download fails."""

    def process_request(self, request, spider):
        """
        Called for each request as it passes down the downloader-middleware chain.

        :param request: the scrapy Request about to be downloaded
        :param spider: the Spider that issued the request
        :return:
            None                -> continue with the remaining middlewares/download
            Response object     -> stop the process_request chain, start process_response
            Request object      -> stop the chain, send the request back to the scheduler
            raise IgnoreRequest -> stop the chain, run process_exception
        """
        print('下载中间件1')
        print('gggggggggggggggggggg', get_proxy())

        # Cap how long the download may take, and attach a proxy from the pool.
        meta = request.meta
        meta['download_timeout'] = 10
        meta['proxy'] = 'http://' + get_proxy()
        print(meta)

    def process_response(self, request, response, spider):
        """
        Called with each response on its way back up to the spider.

        :param request: the Request that produced this response
        :param response: the downloaded Response
        :param spider: the Spider the response is destined for
        :return:
            Response object     -> hand it to the next middleware's process_response
            Request object      -> stop the chain, reschedule the request for download
            raise IgnoreRequest -> invoke Request.errback
        """
        print('response1')
        return response

    def process_exception(self, request, exception, spider):
        """
        Called when the download handler or a process_request() hook raises.

        :param request: the Request that failed
        :param exception: the raised exception
        :param spider: the Spider that issued the request
        :return:
            None            -> let the remaining middlewares handle the exception
            Response object -> stop the process_exception chain
            Request object  -> stop the chain, reschedule the request for download
        """
        print('异常1')

        # The proxy on this request just failed: strip the 'http://' scheme
        # and evict it from the pool.
        failed_proxy = request.meta['proxy'].split("//")[-1]
        print('oooooooooooo', failed_proxy)
        delete_proxy(failed_proxy)

        # Attach a fresh proxy and resend the same request.
        request.meta['proxy'] = 'http://' + get_proxy()
        return request
原文地址:https://www.cnblogs.com/ldq1996/p/8342112.html