# 爬虫_1 (crawler script 1) — daily AQI scraper for Chengdu/Sichuan


import urllib.request
import time,re
import pandas as pd
from datetime import datetime, timedelta
# from tqdm import tqdm

# def get_data(base_url,tt):
# html=urllib.request.urlopen(base_url).read().decode('utf-8')
# html = str(html)
# key_names =re.compile('.*?"NAME_":"(.*?)",')
# key_aqi =re.compile('.*?,"AQI_":(.*?),')
# PRIMARY_POLLUTANTS_ =re.compile('.*?"PRIMARY_POLLUTANTS_":"(.*?)",')
# names = re.findall( key_names, html)
# aqi = re.findall( key_aqi, html)
# PRIMARY_POLLUTANTS_=re.findall(PRIMARY_POLLUTANTS_,html)
# data = pd.DataFrame()
# data['市/区/县'] = names
# data['AQI_'+str(tt)] = aqi
# data['首污_'+str(tt)]=PRIMARY_POLLUTANTS_
# return data

def get_data_chengdu(base_url, tt):
    """Fetch one day's city AQI ranking and parse it into a DataFrame.

    Parameters
    ----------
    base_url : str
        Full API URL with token and date already substituted in.
    tt : str
        Date string ``YYYYMMDD``; used only to label the output columns.

    Returns
    -------
    pandas.DataFrame
        Columns: ``市/区/县`` (city name), ``AQI_<tt>``, ``首污_<tt>``
        (primary pollutant). AQI values are kept as strings, as scraped.
    """
    # Close the HTTP response deterministically (the original leaked it).
    with urllib.request.urlopen(base_url) as resp:
        html = resp.read().decode('utf-8')
    # The endpoint returns JSON-like text; pull fields out with regexes.
    # Raw strings used for regex patterns per convention.
    names = re.findall(r'.*?{"CITY_":"(.*?)",', html)
    aqi = re.findall(r'.*?"AQI_":"(.*?)",', html)
    pollutants = re.findall(r'.*?,"PRIMARY_POLLUTANTS_":"(.*?)"}', html)
    data = pd.DataFrame()
    data['市/区/县'] = names
    data['AQI_' + str(tt)] = aqi
    data['首污_' + str(tt)] = pollutants
    return data


# def day_chengdu_area(days_2018,data_days):
# for day_index in range(days_2018-3):
# tt = datetime.strptime('20180102', "%Y%m%d") + timedelta(days=day_index)
# tt = tt.strftime("%Y%m%d")
# base_url = r'http://weixin.cdepb.gov.cn:20005/data/w/%E5%8C%BA%E5%8E%BF%E6%97%A5%E6%8E%92%E5%90%8D?'
# r'token=78C579FA28C9B1E6C156A806C392458C&date={}&rows=1000&page=1'.format(tt)
# # print(base_url)
# data=get_data(base_url,tt)
# data.index=data['市/区/县']
# # data.sort_values(by=['市/区/县'])
# # data.sort(['市/区/县'])
# # print(data)
# # data_days=data_days.set_index('市/区/县').join(data.set_index('市/区/县'))
# data_days=pd.merge(data_days,data,on='市/区/县')
#
# # data_days.join(data, lsuffix='_caller', rsuffix='_other')
# print(data_days)
# # time.sleep(60)
# break
# # data_days.to_excel(r'data_days.xlsx')



def day_chengdu(days_2018, data_days):
    """Collect daily AQI tables starting 2018-01-02 and merge them column-wise.

    Parameters
    ----------
    days_2018 : int
        Current day-of-year; the loop covers ``days_2018 - 3`` days
        (a margin so the most recent, possibly unpublished days are skipped).
    data_days : pandas.DataFrame
        Seed frame (e.g. the 2018-01-01 data); must contain a ``市/区/县``
        column to merge on.

    Side effects
    ------------
    Prints the merged frame and writes it to ``data_days.xlsx`` in the
    current working directory.
    """
    for day_index in range(days_2018 - 3):
        day = datetime.strptime('20180102', "%Y%m%d") + timedelta(days=day_index)
        tt = day.strftime("%Y%m%d")
        # BUGFIX: in the original, the second half of the URL was a separate,
        # discarded statement, so base_url was truncated (no token/date).
        # Adjacent literals inside parentheses concatenate into one string.
        base_url = (
            r'http://weixin.cdepb.gov.cn:20005/data/w/%E5%9B%9B%E5%B7%9D%E7%9C%81%E5%90%84%E5%9F%8E%E5%B8%82%E7%A9%'
            r'BA%E6%B0%94%E8%B4%A8%E9%87%8F%E6%97%A5%E6%95%B0%E6%8D%AE?token=78C579FA28C9B1E6C156A806C392458C&date={}&rows=1000&page=1'
        ).format(tt)
        data = get_data_chengdu(base_url, tt)
        data.index = data['市/区/县']
        # Inner merge on the city column: each day adds two new columns.
        data_days = pd.merge(data_days, data, on='市/区/县')
    print(data_days)
    data_days.to_excel(r'data_days.xlsx')

if __name__ == '__main__':
    # Fetch the overall Chengdu/Sichuan daily AQI series and export to Excel.
    days_2018 = time.localtime(time.time()).tm_yday
    # Seed date: 2018-01-01, formatted as YYYYMMDD for the API query string.
    tt = datetime.strptime('20180101', "%Y%m%d").strftime("%Y%m%d")
    # BUGFIX: the two URL halves were separate statements in the original,
    # leaving chengdu_url without its token/date query; parenthesized
    # adjacent literals concatenate them into one complete URL.
    chengdu_url = (
        r'http://weixin.cdepb.gov.cn:20005/data/w/%E5%9B%9B%E5%B7%9D%E7%9C%81%E5%90%84%E5%9F%8E%E5%B8%82%E7%A9%BA'
        r'%E6%B0%94%E8%B4%A8%E9%87%8F%E6%97%A5%E6%95%B0%E6%8D%AE?token=78C579FA28C9B1E6C156A806C392458C&date={}&rows=1000&page=1'
    ).format(tt)
    # Day-1 table seeds the merge; day_chengdu appends the remaining days.
    data_chengdu = get_data_chengdu(chengdu_url, tt)
    day_chengdu(days_2018, data_chengdu)


# 原文地址 (original source): https://www.cnblogs.com/avivi/p/11354299.html