爬虫————获取图片(scrapy使用自带类 ImagesPipeline)

爬取的图片网址为:http://699pic.com/collectInfo/273785

解决问题的参考链接:https://blog.csdn.net/loner_fang/article/details/81111879

问题代码1

imgpro.py

 1 # -*- coding: utf-8 -*-
 2 import scrapy
 3 
 4 from tupian.items import TupianItem
 5 from scrapy.selector.unified import SelectorList
 6 
 7 
 8 class ImgproSpider(scrapy.Spider):
 9     name = 'imgpro'
10     #allowed_domains = ['699pic.com']
11     start_urls = ['http://699pic.com/collectInfo/273785']
12 
13     def parse(self, response):
14         li_ul = response.xpath('/html/body/div[11]/div/ul')
15  #       print(type(li_ul))
16         for li in li_ul :
17             img_src = li.xpath('./li/a/img/@src').extract()
18            # print(img_src)
19             if img_src:
20                 item = TupianItem()
21                 item['src'] = img_src
22                 yield  item

pipeline.py

 1 # -*- coding: utf-8 -*-
 2 
 3 # Define your item pipelines here
 4 #
 5 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 7 
 8 
 9 # class TupianPipeline(object):
10 #     def process_item(self, item, spider):
11 #         return item
12 
13 from scrapy.pipelines.images import ImagesPipeline
14 import scrapy
15 
class TupianPipeline(ImagesPipeline):
    """Image-downloading pipeline: fetches every URL carried in item['src']."""

    def get_media_requests(self, item, info):
        """Issue one download request per image URL.

        BUG FIX: item['src'] is a *list* of URLs (Selector.extract() returns
        a list), so it must be iterated; passing the list itself to
        scrapy.Request fails because Request expects a single URL string.
        """
        for image_url in item['src']:
            yield scrapy.Request(image_url)

    def file_path(self, request, response=None, info=None):
        # Store each image under the last path segment of its URL.
        name = request.url.split('/')[-1]
        print("正在下载。。。。。。。", name)
        return name

    def item_completed(self, results, item, info):
        # results is a list of (success, info) 2-tuples, one per request.
        # Return the item so any later pipeline stage still receives it.
        return item

错误类型

分析错误原因:

1、在写爬虫文件解析response时,获取图片的下载地址,一开始写的xpath是@src

修改后的写法见下文代码。

后来排查代码,观察网页源代码,发现源代码中@src的值是http://static.699pic.com/images/blank.png。刚才element中的src是经过渲染之后的值,所以最后采用的@data-original。这也就证实了爬虫获取到的response是网页的源码,爬取之前需要先确认源码和element中的元素和值是否一致,只有一致了才可以直接使用element中的元素和值

 2、遍历image_urls里的每一个url,调用调度器和下载器,下载图片 。 图片下载完毕后,处理结果会以二元组的方式返回给item_completed()函数

错误点为下图

修改后为下图

 1 # -*- coding: utf-8 -*-
 2 import scrapy
 3 
 4 from tupian.items import TupianItem
 5 from scrapy.selector.unified import SelectorList
 6 
 7 
 8 class ImgproSpider(scrapy.Spider):
 9     name = 'imgpro'
10     #allowed_domains = ['699pic.com']
11     start_urls = ['http://699pic.com/collectInfo/273785']
12 
13     def parse(self, response):
14         li_ul = response.xpath('/html/body/div[11]/div/ul')
15  #       print(type(li_ul))
16         for li in li_ul :
17             img_src = li.xpath('./li/a/img/@data-original').extract()
18            # print(img_src)
19             if img_src:
20                 item = TupianItem()
21                 item['src'] = img_src
22                 yield  item
 1 # -*- coding: utf-8 -*-
 2 
 3 # Define your item pipelines here
 4 #
 5 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 7 
 8 
 9 # class TupianPipeline(object):
10 #     def process_item(self, item, spider):
11 #         return item
12 
13 from scrapy.pipelines.images import ImagesPipeline
14 import scrapy
15 
class TupianPipeline(ImagesPipeline):
    """Download every image URL listed in item['src'] via ImagesPipeline."""

    def get_media_requests(self, item, info):
        # One Request per URL; the framework schedules the actual downloads.
        yield from (scrapy.Request(url) for url in item['src'])

    def file_path(self, request, response=None, info=None):
        # Name each saved file after the final segment of its URL.
        name = request.url.rsplit('/', 1)[-1]
        print("正在下载。。。。。。。", name)
        return name

    def item_completed(self, results, item, info):
        # Forward the item to whatever pipeline runs next.
        return item
 1 # -*- coding: utf-8 -*-
 2 
 3 # Define here the models for your scraped items
 4 #
 5 # See documentation in:
 6 # https://docs.scrapy.org/en/latest/topics/items.html
 7 
 8 import scrapy
 9 
10 
class TupianItem(scrapy.Item):
    """Carries the image URLs scraped from one <ul> block of the page."""

    # List of image download URL strings consumed by TupianPipeline.
    src = scrapy.Field()
 1 # -*- coding: utf-8 -*-
 2 
 3 # Scrapy settings for tupian project
 4 #
 5 # For simplicity, this file contains only settings considered important or
 6 # commonly used. You can find more settings consulting the documentation:
 7 #
 8 #     https://docs.scrapy.org/en/latest/topics/settings.html
 9 #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
10 #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
11 
12 BOT_NAME = 'tupian'
13 
14 SPIDER_MODULES = ['tupian.spiders']
15 NEWSPIDER_MODULE = 'tupian.spiders'
16 
17 
18 # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 #USER_AGENT = 'tupian (+http://www.yourdomain.com)'
20 
21 # Obey robots.txt rules
# NOTE: robots.txt checking is disabled so the image pages are not filtered out.
22 ROBOTSTXT_OBEY = False
23 
# Only show errors, keeping the pipeline's per-file download prints readable.
24 LOG_LEVEL = 'ERROR'
25 
# Root directory where ImagesPipeline stores the downloaded image files.
26 IMAGES_STORE = './imgsLib'
27 
28 # Configure maximum concurrent requests performed by Scrapy (default: 16)
29 #CONCURRENT_REQUESTS = 32
30 
31 # Configure a delay for requests for the same website (default: 0)
32 # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
33 # See also autothrottle settings and docs
34 #DOWNLOAD_DELAY = 3
35 # The download delay setting will honor only one of:
36 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
37 #CONCURRENT_REQUESTS_PER_IP = 16
38 
39 # Disable cookies (enabled by default)
40 #COOKIES_ENABLED = False
41 
42 # Disable Telnet Console (enabled by default)
43 #TELNETCONSOLE_ENABLED = False
44 
45 # Override the default request headers:
# A real browser User-Agent is sent with every request to avoid being blocked.
46 DEFAULT_REQUEST_HEADERS = {
47   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
48   'Accept-Language': 'en',
49   'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'
50 
51 }
52 
53 # Enable or disable spider middlewares
54 # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
55 #SPIDER_MIDDLEWARES = {
56 #    'tupian.middlewares.TupianSpiderMiddleware': 543,
57 #}
58 
59 # Enable or disable downloader middlewares
60 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
61 #DOWNLOADER_MIDDLEWARES = {
62 #    'tupian.middlewares.TupianDownloaderMiddleware': 543,
63 #}
64 
65 # Enable or disable extensions
66 # See https://docs.scrapy.org/en/latest/topics/extensions.html
67 #EXTENSIONS = {
68 #    'scrapy.extensions.telnet.TelnetConsole': None,
69 #}
70 
71 # Configure item pipelines
72 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# TupianPipeline subclasses ImagesPipeline and performs the actual downloads,
# so the stock ImagesPipeline entry stays commented out.
73 ITEM_PIPELINES = {
74    # 'scrapy.pipelines.images.ImagesPipeline': 1,
75    'tupian.pipelines.TupianPipeline': 300,
76 }
77 
78 # Enable and configure the AutoThrottle extension (disabled by default)
79 # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
80 #AUTOTHROTTLE_ENABLED = True
81 # The initial download delay
82 #AUTOTHROTTLE_START_DELAY = 5
83 # The maximum download delay to be set in case of high latencies
84 #AUTOTHROTTLE_MAX_DELAY = 60
85 # The average number of requests Scrapy should be sending in parallel to
86 # each remote server
87 #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
88 # Enable showing throttling stats for every response received:
89 #AUTOTHROTTLE_DEBUG = False
90 
91 # Enable and configure HTTP caching (disabled by default)
92 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
93 #HTTPCACHE_ENABLED = True
94 #HTTPCACHE_EXPIRATION_SECS = 0
95 #HTTPCACHE_DIR = 'httpcache'
96 #HTTPCACHE_IGNORE_HTTP_CODES = []
97 #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'



原文地址:https://www.cnblogs.com/cfancy/p/11973055.html