爬虫框架scrapy(2)post请求,传递item参数,加速爬虫效率,UA池及代理池

scrapy 发送post请求

class PostSpider(scrapy.Spider):
    """Spider that submits its start URLs as form (POST) requests."""

    name = 'post'

    start_urls = ['https://fanyi.baidu.com/sug']

    def start_requests(self):
        """Yield one ``scrapy.FormRequest`` per entry in ``start_urls``.

        The inherited implementation would issue GET requests for the start
        URLs; this override attaches form data so they are sent as POST
        requests instead.
        """
        # Form fields submitted with every start request.
        form_fields = {'kw': 'dog'}
        for start_url in self.start_urls:
            post_request = scrapy.FormRequest(
                url=start_url,
                callback=self.parse,
                formdata=form_fields,
            )
            yield post_request

    def parse(self, response):
        """Print the body text of the response."""
        body_text = response.text
        print(body_text)

核心:
重写父类的 start_requests 方法。默认的 start_requests 方法提交的是 yield scrapy.Request(url=url,callback=self.parse) 这种 get 请求(scrapy.Request 没有 formdata 参数),
改写为 yield scrapy.FormRequest(url=url,formdata=formdata,callback=self.parse) 即可发送 post 请求,formdata 为要提交的表单参数

在请求之间传递 item:在 scrapy.Request 的参数中添加 meta(例如 meta={'item':item}),在回调函数中通过 response.meta['item'] 取回

def parse(self, response):
    """Build one item per movie card and request its detail page.

    For every ``div`` matching the movie-item XPath, the title and score
    are read from the card, and a ``scrapy.Request`` for the detail URL is
    yielded with the partially filled item attached as ``meta['item']`` so
    that ``self.getdata`` can complete it.

    Cards whose link has no ``href`` are skipped: ``extract_first()``
    returns ``None`` when nothing matches, and ``'https:' + None`` would
    raise ``TypeError`` and abort the whole callback.
    """
    div_list = response.xpath('//div[@class="col-xs-1-5 movie-item"]')

    for div in div_list:
        href = div.xpath('.//div[@class="meta"]/h1/a/@href').extract_first()
        if href is None:
            # No detail link on this card, so there is nothing to follow.
            continue

        item = MovieproItem()
        item['title'] = div.xpath('.//div[@class="meta"]/h1/a/text()').extract_first()
        item['score'] = div.xpath('.//div[@class="meta"]/h1/em/text()').extract_first()

        # The extracted href is prefixed with the scheme to form the URL.
        detail_url = 'https:' + href

        # Hand the item to the detail-page callback through the request meta.
        yield scrapy.Request(url=detail_url, callback=self.getdata, meta={'item': item})

    def getdata(self,respose):
        """Complete the item carried in the request meta and yield it.

        Reads the item stored under ``meta['item']``, fills its ``deactor``
        and ``desc`` fields with the first match of two XPath queries on
        the response, and yields the item.
        """
        movie = respose.meta['item']

        # First match of the absolute table-cell path, stored as "deactor".
        actor_nodes = respose.xpath(
            '/html/body/div[1]/div/div/div[1]/div[1]/div[2]/table/tbody/tr[1]/td[2]/a/text()'
        )
        movie["deactor"] = actor_nodes.extract_first()

        # First paragraph text of the introduction block, stored as "desc".
        intro_nodes = respose.xpath('//div[@class="col-xs-12 movie-introduce"]/p/text()')
        movie["desc"] = intro_nodes.extract_first()

        yield movie

加速爬虫效率在settings中配置以下

加速爬虫效率
CONCURRENT_REQUESTS = 10 最大并发请求数(Scrapy 基于异步而非多线程;默认值为16,想提高并发应设置为大于16的值)
LOG_LEVEL = 'ERROR'      日志输出等级,只输出错误日志以减少日志开销
COOKIES_ENABLED = False	 不需要处理cookies时,关闭cookies
RETRY_ENABLED = False    关闭失败请求的重试
DOWNLOAD_TIMEOUT = 5	 下载超时时间(单位:秒)

UA池和代理池

在中间件文件中按照如下配置,代理池可以在www.goubanjia.com中找取,注意区分http代理与https代理(与请求url的协议对应)

class ProxyproDownloaderMiddleware(object):
    """Downloader middleware that assigns a random proxy and User-Agent.

    Each outgoing request gets a proxy drawn from ``proxy_http`` or
    ``proxy_https`` (depending on the text before the first ``:`` of its
    URL) and a ``User-Agent`` header drawn from ``user_agent_list``.
    """

    # Proxy pool used when the request URL starts with "http:".
    proxy_http = ['http://39.137.168.229:8080', 'http://103.218.240.182:80', 'http://80.26.152.146:60133']
    # Proxy pool used for every other request URL.
    proxy_https = ['https://221.6.201.18:9999', 'https://220.180.50.14:53281', 'https://140.227.200.38:3128']
    # User-Agent pool; each entry is one string written across two lines.
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]

    def process_request(self, request, spider):
        """Attach a random proxy and User-Agent to ``request``.

        Prints the request, stores the chosen proxy in
        ``request.meta['proxy']`` and the chosen agent in
        ``request.headers['User-Agent']``, then returns ``None``.
        """
        print('下载中间件',request)

        # Everything before the first ':' of the URL selects the proxy pool.
        scheme = request.url.split(':')[0]
        proxy_pool = self.proxy_http if scheme == 'http' else self.proxy_https
        request.meta['proxy'] = random.choice(proxy_pool)

        chosen_agent = random.choice(self.user_agent_list)
        request.headers['User-Agent'] = chosen_agent
        return None

    def process_response(self, request, response, spider):
        """Return ``response`` unchanged."""
        return response

    def process_exception(self, request, exception, spider):
        """Do nothing and return ``None``."""
        return None

    def spider_opened(self, spider):
        """Log the name of the opened spider at info level."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)

  

  

原文地址:https://www.cnblogs.com/wszxdzd/p/10269210.html