Python之天气爬虫

代码已调试通过

# Import standard-library modules (random, re, time) and third-party packages (pandas, requests)
import random
import re
import time
import pandas as pd
import requests

# Build the request headers sent with every HTTP request.
# The header names use their standard spelling ('Accept-Encoding',
# 'Connection'); a misspelled name is transmitted as an unknown header and
# has no effect on the server.
headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip,deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0  (windows NT 6.1;  WOW64)  AppleWebKit/537.36 (KHTML,like Gecko) Chrome/63.0.3236.0 '
                  'Safari/537.36 '
}
# Build the list of links to crawl: one URL per month of each year.
urls = []
for year in range(2012, 2019):
    if year > 2016:
        # No links are generated for years after 2016; a notice is printed
        # once per such year and the year is skipped.
        print("未获取天气数据")
        continue
    # For 2012-2016 the year and the un-padded month number are appended
    # directly to the file name.
    urls.extend(
        'http://tianqi.2345.com/t/wea_history/js/58362_%s%s.js' % (year, month)
        for month in range(1, 13)
    )
# One DataFrame per monthly page; they are merged once the crawl is finished.
info = []
for url in urls:
    # Request the monthly data file and keep the response body as text.
    # The timeout stops a single stalled connection from hanging the crawl.
    response = requests.get(url, headers=headers, timeout=30).text
    print(response)

    # Extract every field from the payload with a regular expression.
    city = re.findall("city:'(.*?)',", response)  # city
    ymd = re.findall("ymd:'(.*?)',", response)  # date
    # Without the leading comma these two patterns would also match the
    # avgbWendu / avgyWendu fields.
    high = re.findall(",bWendu:'(.*?)',", response)  # highest temperature
    low = re.findall(",yWendu:'(.*?)',", response)  # lowest temperature
    tianqi = re.findall("tianqi:'(.*?)',", response)  # weather condition
    fengxiang = re.findall("fengxiang:'(.*?)',", response)  # wind direction
    aqi = re.findall("aqi:'(.*?)',", response)  # air quality index
    aqiInfo = re.findall("aqiInfo:'(.*?)',", response)  # air quality description
    aqiLevel = re.findall("aqiLevel:'(.*?)'}", response)  # air quality level
    maxWendu = re.findall("maxWendu:'(.*?)',", response)  # maximum temperature
    # Must use its own key: matching maxWendu here would duplicate that column.
    minWendu = re.findall("minWendu:'(.*?)',", response)  # minimum temperature
    avgbWendu = re.findall("avgbWendu:'(.*?)',", response)  # average daytime temperature

    # The 2012-2015 pages carry no air-quality data, so the two layouts are
    # handled separately: the character that follows the fengli and avgyWendu
    # values differs between them.
    if aqi:
        fengli = re.findall("fengli:'(.*?)',", response)  # wind force
        avgyWendu = re.findall("avgyWendu:'(.*?)',", response)  # average night temperature
    else:
        fengli = re.findall("fengli:'(.*?)'}", response)  # wind force
        avgyWendu = re.findall("avgyWendu:'(.*?)'}", response)  # average night temperature
        # Keep the air-quality columns present but empty.
        aqi = []
        aqiInfo = []
        aqiLevel = []

    # The lists have different lengths (per-day values vs. per-month
    # summaries), so build the frame row-wise with orient='index', which pads
    # the short rows, and then transpose so that every field is a column.
    df = pd.DataFrame.from_dict(
        {'city': city, 'ymd': ymd, 'high': high, 'low': low, 'tianqi': tianqi,
         'fengxiang': fengxiang, 'fengli': fengli, 'aqi': aqi,
         'aqiInfo': aqiInfo, 'aqiLevel': aqiLevel, 'maxWendu': maxWendu,
         'minWendu': minWendu, 'avgbWendu': avgbWendu, 'avgyWendu': avgyWendu},
        orient='index')
    info.append(df.transpose())

    # Pause a random 3-6 seconds after every request to avoid hammering the site.
    time.sleep(random.randint(3, 6))
# Merge all stored monthly tables into a single data table.
weather = pd.concat(info)
# Export the data to a CSV file whose name carries the current local time.
# The timestamp gets its own name so the imported `time` module is not
# overwritten by a string.
timestamp = time.strftime('%Y%m%d%H%M%S', time.localtime(time.time()))
weather.to_csv('weather_new' + timestamp + '.csv', index=False)

  运行结果如下:

原文地址:https://www.cnblogs.com/mumianhuasayyes/p/15802301.html