scrapy-redis: Handling Exceptions

Not in a great mood today and don't feel like typing much, so just read the comments in the code.

from scrapy.http import HtmlResponse
from twisted.internet import defer
from twisted.internet.error import (TimeoutError, DNSLookupError,
                                    ConnectionRefusedError, ConnectionDone,
                                    ConnectError, ConnectionLost,
                                    TCPTimedOutError)
from twisted.web.client import ResponseFailed
from scrapy.core.downloader.handlers.http11 import TunnelError


class ProcessAllExceptionMiddleware(object):
    ALL_EXCEPTIONS = (defer.TimeoutError, TimeoutError, DNSLookupError,
                      ConnectionRefusedError, ConnectionDone, ConnectError,
                      ConnectionLost, TCPTimedOutError, ResponseFailed,
                      IOError, TunnelError)

    def process_response(self, request, response, spider):
        # Catch responses with 4xx/5xx status codes
        if str(response.status).startswith('4') or str(response.status).startswith('5'):
            # Wrap a dummy response and return it; the spider tells it apart
            # by its url, which is set here to the original status code
            response = HtmlResponse(url=str(response.status), status=200)
            return response
        # Leave every other status code untouched
        return response

    def process_exception(self, request, exception, spider):
        # Catch almost every exception the downloader can raise
        if isinstance(exception, self.ALL_EXCEPTIONS):
            # Record the exception type in the log
            spider.logger.error('Got exception: %s', exception)
            # Wrap a dummy response and hand it back to the spider
            response = HtmlResponse(url='exception')
            return response
        # Log any exception that was not caught above
        spider.logger.error('not contained exception: %s', exception)

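For the middleware to take effect, it has to be enabled in settings.py. A minimal sketch, assuming the class lives in a module named myproject.middlewares (the project and module names are placeholders for your own project):

# settings.py -- 'myproject.middlewares' is a placeholder module path
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.ProcessAllExceptionMiddleware': 543,
}
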
The spider can then branch on the returned url and handle each case however it needs.
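
For example, a parse callback could branch on response.url along these lines. This is only a sketch: the spider name, start URL, and the retry strategy are illustrative assumptions, not part of the original post:

import scrapy


class ExampleSpider(scrapy.Spider):  # hypothetical spider for illustration
    name = 'example'
    start_urls = ['http://example.com/']

    def parse(self, response):
        if response.url == 'exception':
            # The download failed with one of ALL_EXCEPTIONS;
            # e.g. re-schedule the original request
            yield response.request.replace(dont_filter=True)
        elif response.url.isdigit():
            # A 4xx/5xx response was wrapped; response.url holds the status code
            self.logger.warning('got HTTP status %s', response.url)
        else:
            # A normal response: parse it as usual
            yield {'url': response.url, 'title': response.css('title::text').get()}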

Original article: https://www.cnblogs.com/ltn26/p/10167486.html