Scraping Beike (贝壳找房, Lianjia's housing platform) with the Scrapy framework

1. First, define the item container, i.e. the fields to be scraped, in items.py:

class CDErshouFang(scrapy.Item):
    "Beike second-hand housing item"
    house_name = scrapy.Field() # name of the residential community
    house_address = scrapy.Field() # address of the community
    house_info = scrapy.Field() # house details: floor, build year, layout, floor area, orientation
    release_time = scrapy.Field() # listing release time
    house_tags = scrapy.Field() # tags
    price = scrapy.Field() # average price per square metre
    total_price = scrapy.Field() # total price
    details = scrapy.Field() # detail page - district the house is in
    trading_ownership = scrapy.Field() # trading ownership (交易权属)
    commodity_use = scrapy.Field() # intended use of the property (商品用途)
    house_year = scrapy.Field() # years of ownership (房屋年限)
    property = scrapy.Field() # property ownership (产权所属)
    mortgage_information = scrapy.Field() # mortgage information (抵押信息)
    room_spare = scrapy.Field() # property certificate availability (房本备件)
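
A scrapy.Item behaves like a dict, which is what the spider and pipelines below rely on (item["house_name"] = ..., dict(item)). A minimal usage sketch, with made-up placeholder values rather than scraped data:

item = CDErshouFang()
item['house_name'] = '某某小区'   # placeholder value, not real data
item['total_price'] = '145万'     # placeholder value
print(dict(item))                 # the pipelines below call dict(item) before writing to MongoDB/yaml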

2. Write the spider, which parses the pages fetched by the downloader; it lives under the spiders directory.

import scrapy
from scrapystudy.items import CDErshouFang


class CdErshoufangSpider(scrapy.Spider):
    name = 'cd_ershoufang'
    allowed_domains = ['cd.ke.com']
    start_urls = ['https://cd.ke.com/ershoufang/']

    def start_requests(self):
        "Override start_requests; the crawl starts from the URLs yielded here"

        for page in range(1,100):
            url = self.start_urls[0] + 'pg' + str(page) + '/'
            yield scrapy.Request(url=url,callback=self.parse,dont_filter=True)

    def parse(self, response):
        SET_SELECT = response.css('.info') # NOTE: if this selector does not match, none of the fields below can be extracted
        for cle in SET_SELECT:
            item = CDErshouFang()
            house_name = cle.css('.title a::text').extract_first() # select relative to cle (this is a downloaded page, not a live browser, so there is nothing to click or switch to)
            house_address = cle.css('.positionInfo a::text').extract_first()
            house_info = cle.css('.houseInfo::text').extract()[1].replace(' ','').replace('\n','')
            release_time = cle.css('.followInfo::text').extract()[1].replace(' ','').replace('\n','')
            price_total = cle.css('.priceInfo .totalPrice span::text').extract_first()
            if price_total is not None:
                price_total = price_total + '万' # append the 万 (10,000 yuan) unit shown next to the total price on the listing page
            price = cle.css('.unitPrice span::text').extract_first()
            # house_tags = cle.css('.info .address .tag span::text').extract()
            item["house_name"] = house_name
            item["house_address"] = house_address
            item["house_info"] = house_info
            item["release_time"] = release_time
            item["total_price"] = price_total
            item["price"] = price
            # item["house_tags"] = house_tags
            details_page_url = cle.css('.title a::attr(href)').extract_first() # link to the detail page
            # meta: a dict attached to the Request; whatever is put in it (here the half-filled item) is passed on to the next callback
            yield scrapy.Request(url=details_page_url,callback=self.details,meta={'item':item})

    def details(self,response):
        "Extract data from the detail page"
        area = response.xpath('//span[@class="info"]/a[1]/text()').extract_first()
        details = response.xpath('//span[@class="info"]/a[last()]/text()').extract_first()
        if area is not None and details is not None:
            details = area + ' ' + details
        trading_ownership = response.xpath('//div[@class="transaction"]/div[@class="content"]/ul/li[2]/text()').extract_first(default='').strip() # trading ownership (交易权属)
        commodity_use = response.xpath('//div[@class="transaction"]/div[@class="content"]/ul/li[4]/text()').extract_first(default='').strip() # house usage (房屋用途)
        house_year = response.xpath('//div[@class="transaction"]/div[@class="content"]/ul/li[5]/text()').extract_first(default='').strip() # years of ownership (房屋年限)
        property = response.xpath('//div[@class="transaction"]/div[@class="content"]/ul/li[6]/text()').extract_first(default='').strip() # property ownership (产权所属)
        mortgage_information = response.xpath('//div[@class="transaction"]/div[@class="content"]/ul/li[7]/span[2]/text()').extract_first(default='').strip() # mortgage information (抵押信息)
        room_spare = response.xpath('//div[@class="transaction"]/div[@class="content"]/ul/li[8]/text()').extract_first(default='').strip() # property certificate availability (房本备件)
        item = response.meta['item'] # retrieve the item filled in on the listing page
        item["details"] = details
        item["trading_ownership"] = trading_ownership
        item["commodity_use"] = commodity_use
        item["house_year"] = house_year
        item["property"] = property
        item["mortgage_information"] = mortgage_information
        item["room_spare"] = room_spare
        yield item
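
With the item and spider in place, the crawl is started from the project root with Scrapy's command-line tool; cd_ershoufang is the name attribute defined above, and the -o option is optional (it just dumps the raw items to a file without going through any pipeline):

scrapy crawl cd_ershoufang
scrapy crawl cd_ershoufang -o cd_ershoufang.json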

3. Process the scraped data: pipelines.py stores the items in MongoDB or writes them to yaml/json files.

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem
import pymongo

class TextPipeline:
    "To use a pipeline it must be registered first: add it to ITEM_PIPELINES in settings.py"

    def __init__(self):
        self.limit = 50

    def process_item(self, item, spider):
        "Limit the length of item['title']"
        if item['title']:
            if len(item['title']) > self.limit:
                item['title'] = item['title'][0:self.limit].rstrip() + '...'
            return item
        else:
            raise DropItem('Missing Text')

class MongoPipeline(object):
    "Store the items in MongoDB"

    def __init__(self, mongo_url, mongo_db):
        self.mongo_url = mongo_url
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        "Read the global configuration from settings.py"
        return cls(
            mongo_url = crawler.settings.get('MONGO_URL'),
            mongo_db = crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_url)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        name = item.__class__.__name__
        self.db[name].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()

import os
import time
import logging
import yaml
logger = logging.getLogger(__name__)

class SaveBeikePipeline(object):
    "Save the scraped data to a yaml file"

    def open_spider(self, spider):
        "Called when the spider opens; open or create the output file here"
        filetime = time.strftime("%Y%m%d")
        filepath = os.path.join(os.path.dirname(__file__), 'spiderSaveFile')
        if not os.path.exists(filepath): os.mkdir(filepath)
        # spider_file = os.path.join(filepath, '{}.yaml'.format(self.__class__.__name__)) # self.__class__.__name__ gets the class name
        spider_file = os.path.join(filepath, '{}.yaml'.format(filetime))
        try:
            self.f = open(spider_file, mode='w', encoding='utf-8')
        except Exception as e:
            logger.error(e)

    def process_item(self, item, spider):
        "Format and write one item"
        data = dict()
        data["小区名称"] = item["house_name"]
        data["在售状态"] = item["on_sale"]
        data["房屋类型"] = item["house_type"]
        data["小区地址"] = item["address"]
        data["房屋户型"] = item["door_module"]
        data["建筑面积"] = item["area"]
        data["价格"] = item["price"]
        data["总价/套"] = item["total_price"]
        data["附近设施"] = item["tags"]
        # self.f.write(str(data) + '\n')
        spider_data = yaml.dump(data, allow_unicode=True, width=1000, sort_keys=False) # sort_keys=False keeps the keys in their original insertion order
        self.f.write(spider_data + '*'.center(50,'-') + '\n')
        return item

    def close_spider(self, spider):
        "Called when the crawl finishes; close the file"
        self.f.close()

class SaveCDershouFangPipeline(object):
    "Save the scraped data to a yaml file"

    def open_spider(self, spider):
        "Called when the spider opens; open or create the output file here"
        filetime = time.strftime("%Y%m%d")
        filepath = os.path.join(os.path.dirname(__file__), 'spiderSaveFile')
        if not os.path.exists(filepath): os.mkdir(filepath)
        # spider_file = os.path.join(filepath, '{}.yaml'.format(self.__class__.__name__)) # self.__class__.__name__ gets the class name
        spider_file = os.path.join(filepath, 'cd_ershoufang{}.yaml'.format(filetime))
        try:
            self.f = open(spider_file, mode='w', encoding='utf-8')
        except Exception as e:
            logger.error(e)

    def process_item(self, item, spider):
        "Format and write one item"
        data = dict()
        data["小区名称"] = item["house_name"]
        data["小区地址"] = item["house_address"]
        data["房子信息"] = item["house_info"]
        data["发布时间"] = item["release_time"]
        data["总价/套"] = item["total_price"]
        data["均价"] = item["price"]
        # data["标签"] = item["house_tags"]
        data["所在区域"] = item["details"]
        data["交易权属"] = item["trading_ownership"]
        data["房屋用途"] = item["commodity_use"]
        data["房屋年限"] = item["house_year"]
        data["产权所属"] = item["property"]
        data["抵押信息"] = item["mortgage_information"]
        data["房本备件"] = item["room_spare"]
        spider_data = yaml.dump(data, allow_unicode=True, width=1000, sort_keys=False) # sort_keys=False keeps the keys in insertion order (the default is True)
        self.f.write(spider_data + '*'.center(60,'-') + '\n')
        return item

    def close_spider(self, spider):
        "Called when the crawl finishes; close the file"
        self.f.close()
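
The step heading also mentions json, but only yaml pipelines are shown above. A minimal JSON-lines sketch in the same style could look like this (the class name SaveJsonLinesPipeline and the output file name are made up for illustration, and like the others it would need to be registered in ITEM_PIPELINES):

import json

class SaveJsonLinesPipeline(object):
    "Sketch: write each item as one JSON line (illustrative, not part of the original project)"

    def open_spider(self, spider):
        self.f = open('cd_ershoufang.jl', mode='w', encoding='utf-8')

    def process_item(self, item, spider):
        # ensure_ascii=False keeps the Chinese text readable in the output file
        self.f.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.f.close()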

4. Some paginated pages cannot be fetched directly, so Scrapy is combined with Selenium: the page rendered by Selenium is wrapped into a Response object and handed to the spider for parsing, in middlewares.py.

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class ScrapystudySpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class ScrapystudyDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # request.cookie = {
        #     "Cookie":"__mta=108386109.1609123577452.1610351007435.1610351353409.13; __mta=108386109.1609123577452.1610351353409.1610362706394.14; uuid_n_v=v1; _lxsdk_cuid=176a73d3e42c8-057a36937583e8-c791039-149c48-176a73d3e42c8; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; uuid=DF86446053FA11EBBFA05D0E1C80A5E52BD1299115184C8C837F6324366BFFA0; _csrf=1d012800348e02304158b04bcaacdb15959e3482e6847893721b340ca6f29323; lt=8kvWp1o5sQYEgkrZTHbti6H0uI8AAAAAhgwAADxF8ufwXVyR4TU3_BGMHAKsB_TA6toYFjxg-m34Z43vNJlCb9Bv05PqTeelhSHITw; lt.sig=iPSGNXFnd3jV3SEy7wzqa0L_QOw; uid=2829236546; uid.sig=fiHM__7YgLUMEaZ05TkEQaVApbs; _lxsdk=DF86446053FA11EBBFA05D0E1C80A5E52BD1299115184C8C837F6324366BFFA0; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1609123577,1609148969,1610350992,1610362253; __mta=108386109.1609123577452.1610362628562.1610362689900.15; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1610362706; _lxsdk_s=176f0edcffa-620-f33-c24%7C%7C53",
        #     "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
        # }

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

# import logging
# class ProxyMiddleware(object):
#     "Set a proxy in the downloader middleware"
#     logger = logging.getLogger(__name__)
#     def process_request(self, request, spider):
#         self.logger.debug("Using Proxy")
#         request.meta["proxy"] = "http://125.87.105.4:49713"

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from scrapy.http import HtmlResponse
from selenium.webdriver.chrome.options import Options
import logging
import time

logger = logging.getLogger(__name__)

class SeleniumMiddleware(object):  # TODO: how can multiple HtmlResponse objects be passed back to the spider for parsing?

    def process_request(self, request, spider):
        url = request.url
        opt = Options()
        opt.add_argument('--headless')
        # create a headless Chrome instance
        browser = webdriver.Chrome(options=opt)
        wait = WebDriverWait(browser, 10)
        browser.get(url)
        for page in range(2, 3):
            try:
                next_page = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "body > div.page-container.clearfix > div.page-box > a.next")))
                next_page.click()
                # wait until the highlighted page number matches the page we just clicked to
                wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, "body > div.page-container.clearfix > div.page-box > span.active"), str(page)))
            except TimeoutException:
                continue
            browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            time.sleep(2)
        html = browser.page_source  # page source after pagination
        logger.info("Fetched URL: " + request.url)
        browser.quit()  # close the browser so headless Chrome processes do not pile up
        return HtmlResponse(url=request.url, body=html, request=request, encoding='utf-8')
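
As written, SeleniumMiddleware starts (and quits) a new Chrome instance for every request, which gets slow over many pages. A possible refinement, sketched here as a separate hypothetical class that reuses the imports already at the top of middlewares.py, is to create one shared browser and close it on the spider_closed signal:

class SeleniumReuseMiddleware(object):
    "Sketch: share one headless Chrome instance across all requests (illustrative, not part of the original post)"

    def __init__(self):
        opt = Options()
        opt.add_argument('--headless')
        self.browser = webdriver.Chrome(options=opt)

    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        # quit the browser once the spider has finished
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    def process_request(self, request, spider):
        self.browser.get(request.url)
        return HtmlResponse(url=request.url, body=self.browser.page_source,
                            request=request, encoding='utf-8')

    def spider_closed(self, spider):
        self.browser.quit()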

5. Activate the item pipelines and the middlewares in settings.py.

# Scrapy settings for scrapystudy project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'scrapystudy'

SPIDER_MODULES = ['scrapystudy.spiders']
NEWSPIDER_MODULE = 'scrapystudy.spiders'

MONGO_URL = "localhost"
MONGO_DB = "mydb"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scrapystudy (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = True

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#   'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    'scrapystudy.middlewares.MyFirstSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    'scrapystudy.middlewares.SeleniumMiddleware': 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# The numbers (300, 400, ...) set the execution order: the lower the value, the higher the priority and the earlier the pipeline runs
ITEM_PIPELINES = {
   'scrapystudy.pipelines.SaveCDershouFangPipeline': 600,
   # 'scrapystudy.pipelines.TextPipeline': 300,
   # 'scrapystudy.pipelines.MongoPipeline': 400,
   # 'scrapystudy.pipelines.SaveBeikePipeline': 500,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
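
Note that only SaveCDershouFangPipeline is activated above. The SeleniumMiddleware from step 4 and the MongoPipeline only take effect once their (currently commented) entries are enabled, for example:

DOWNLOADER_MIDDLEWARES = {
   'scrapystudy.middlewares.SeleniumMiddleware': 543,
}

ITEM_PIPELINES = {
   'scrapystudy.pipelines.MongoPipeline': 400,
   'scrapystudy.pipelines.SaveCDershouFangPipeline': 600,
}

Keep in mind that with the Selenium middleware enabled, every request the spider yields, including the detail pages, is rendered by Chrome.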