对空气质量历史数据的爬取

爬取 https://www.aqistudy.cn/historydata 网站的空气质量历史数据，并将爬取到的数据保存为csv文件

scrapy startproject air_quality 创建scrapy项目

scrapy genspider api_history_spider https://www.aqistudy.cn/historydata/index.php 编写spider

文件目录如图所示

settings.py

# Enable the CSV export pipeline. The integer is the pipeline's order value
# (Scrapy runs pipelines from the lowest value to the highest).
ITEM_PIPELINES = {
    'air_quality.pipelines.AirQualityPipeline': 300,
}

items.py

 1 import scrapy
 2 
 3 
 4 class AirQualityItem(scrapy.Item):
 5     # define the fields for your item here like:
 6     # name = scrapy.Field()
 7     city_name = scrapy.Field()  # 城市名称
 8     record_date = scrapy.Field()  # 检测日期
 9     aqi_val = scrapy.Field()  # AQI
10     range_val = scrapy.Field()  # 范围
11     quality_level = scrapy.Field()  # 质量等级
12     pm2_5_val = scrapy.Field()  # PM2.5
13     pm10_val = scrapy.Field()  # PM10
14     so2_val = scrapy.Field()  # SO2
15     co_val = scrapy.Field()  # CO
16     no2_val = scrapy.Field()  # NO2
17     o3_val = scrapy.Field()  # O3
18     rank = scrapy.Field()  # 排名

pipelines.py

 1 from scrapy.exporters import CsvItemExporter
 2 
 3 class AirQualityPipeline(object):
 4 
 5     def open_spider(self,spider):
 6         self.file = open('air_quality.csv', 'wb')
 7         self.exporter = CsvItemExporter(self.file)
 8         self.exporter.start_exporting()
 9 
10     def close_spider(self,spider):
11         self.exporter.finish_exporting()
12         self.file.close()
13 
14     def process_item(self, item,spider):
15         self.exporter.export_item(item)
16         return item

api_history_spider.py

 1 # -*- coding: utf-8 -*-
 2 import scrapy
 3 from urllib import parse
 4 from air_quality.items import AirQualityItem
 5 
 6 base_url = 'https://www.aqistudy.cn/historydata/'
 7 
 8 class ApiHistorySpiderSpider(scrapy.Spider):
 9     name = 'api_history_spider'
10     allowed_domains = ["aqistudy.cn"]
11     start_urls = ['https://www.aqistudy.cn/historydata/']
12 
13     def parse(self, response):
14         """
15             解析初始页面
16         """
17         # 获取所有城市的URL
18         city_url_list = response.xpath('//div[@class="all"]//div[@class="bottom"]//a//@href')
19 
20         for city_url in city_url_list:
21             # 依次遍历城市URL
22             city_month_url = base_url + city_url.extract()
23             # 解析每个城市的月份数据
24             request = scrapy.Request(city_month_url, callback=self.parse_city_month)
25             yield request
26 
27     def parse_city_month(self, response):
28         """
29             解析该城市的月份数据
30         """
31         # 获取该城市的所有月份URL
32         month_url_list = response.xpath('//table[@class="table table-condensed '
33                                         'table-bordered table-striped table-hover '
34                                         'table-responsive"]//a//@href')
35 
36         for month_url in month_url_list:
37             # 依次遍历月份URL
38             city_day_url = base_url + month_url.extract()
39             # 解析该城市的每日数据
40             request = scrapy.Request(city_day_url, callback=self.parse_city_day)
41             yield request
42 
43     def parse_city_day(self, response):
44         """
45             解析该城市的每日数据
46         """
47         url = response.url
48         item = AirQualityItem()
49         city_url_name = url[url.find('=') + 1:url.find('&')]
50 
51         # 解析url中文
52         # item['city_name'] = city_url_name
53         item['city_name'] = parse.unquote(city_url_name)
54 
55         # 获取每日记录
56         day_record_list = response.xpath('//table[@class="table table-condensed '
57                                          'table-bordered table-striped table-hover '
58                                          'table-responsive"]//tr')
59         for i, day_record in enumerate(day_record_list):
60             if i == 0:
61                 # 跳过表头
62                 continue
63             td_list = day_record.xpath('.//td')
64 
65             item['record_date'] = td_list[0].xpath('text()').extract_first()  # 检测日期
66             item['aqi_val'] = td_list[1].xpath('text()').extract_first()  # AQI
67             item['range_val'] = td_list[2].xpath('text()').extract_first()  # 范围
68             item['quality_level'] = td_list[3].xpath('.//div/text()').extract_first()  # 质量等级
69             item['pm2_5_val'] = td_list[4].xpath('text()').extract_first()  # PM2.5
70             item['pm10_val'] = td_list[5].xpath('text()').extract_first()  # PM10
71             item['so2_val'] = td_list[6].xpath('text()').extract_first()  # SO2
72             item['co_val'] = td_list[7].xpath('text()').extract_first()  # CO
73             item['no2_val'] = td_list[8].xpath('text()').extract_first()  # NO2
74             item['o3_val'] = td_list[9].xpath('text()').extract_first()  # O3
75             item['rank'] = td_list[10].xpath('text()').extract_first()  # 排名
76 
77             yield item

运行spider

scrapy crawl api_history_spider

得到csv文件，部分如下图所示：