毕业设计-1.06

情况概述:

  发现问题,对代码进行优化改错,并且重新爬取2011-2020年天气数据。

代码更正:

import requests
from bs4 import BeautifulSoup
from Weather import IO as ios


class item:
    """Holds the scraped weather table as parallel lists.

    Each attribute is one column; entry *i* of every list belongs to the
    same day, so the lists stay index-aligned as rows are appended.
    """

    def __init__(self):
        # One empty list per column of the weather table.
        self.date = []            # date, stored as YYYYMMDD
        self.max_temp = []        # daily high temperature
        self.min_temp = []        # daily low temperature
        self.weather = []         # weather description
        self.wind_direction = []  # wind direction
        self.wind_force = []      # wind force


# Module-level accumulator shared by every get_datas() call: all scraped
# rows from all cities/years are appended to this single instance.
Data_Box = item()  # data box (shared row accumulator)
# Index of the first Data_Box row that has not yet been written to disk;
# advanced by get_datas() after each successful write batch.
num=0

# Generator for the per-month history-page URLs of one city.
def get_url(city,min,max):
    '''
    Yield "<url> <city>" strings for every month page of *city* on
    lishi.tianqi.com covering the years [min, max).

    city -- pinyin spelling of the city as used in the site's URLs
    min  -- first year (inclusive)
    max  -- last year (exclusive)
    '''
    for year in range(min, max):
        for month in range(1, 13):
            # Month pages are keyed by YYYYMM, e.g. .../beijing/201801.html
            page = "http://lishi.tianqi.com/{}/{}{:02d}.html".format(city, year, month)
            yield page + " " + city


# Scrape the daily weather data for every city listed in CityEn_Deal.txt.
def get_datas(min,max):
    """Crawl lishi.tianqi.com month pages for the years [min, max) and
    append every daily record to the global Data_Box, writing each newly
    scraped row to weather<min>.txt via ios.cw.

    min -- first year to crawl (inclusive)
    max -- last year (exclusive)

    Returns a status string once every city has been processed.  Failed
    pages are reported to stdout; failed writes are logged to Fault.txt.
    """
    global num
    # The cookie and header never change, so build them once instead of
    # once per city (they were loop-invariant in the original).
    cookie = {
        "cityPy": "UM_distinctid=171f2280ef23fb-02a4939f3c1bd4-335e4e71-144000-171f2280ef3dab; Hm_lvt_ab6a683aa97a52202eab5b3a9042a8d2=1588905651; CNZZDATA1275796416=871124600-1588903268-%7C1588990372; Hm_lpvt_ab6a683aa97a52202eab5b3a9042a8d2=1588994046"}
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3756.400 QQBrowser/10.5.4039.400"}
    for line in open("CityEn_Deal.txt",encoding='utf-8'):
        # Each line looks like "<code> <chinese-name> <pinyin>"; split once
        # instead of re-splitting on every use.
        fields = line.split(" ")
        print(fields[1].strip("\n"))
        urls = get_url(fields[2].strip("\n"), min, max)
        for url in urls:
            try:
                html = requests.get(url=url.split(" ")[0], headers=header, cookies=cookie)
                soup = BeautifulSoup(html.text, 'html.parser')
                ul = soup.find_all("ul", class_='thrui')[0]
                # NOTE(review): the original comment claimed the last <li> is
                # not weather data, but all <li> rows were (and still are)
                # consumed -- confirm against the live page markup.
                lis = ul.find_all("li")
                for li in lis:
                    div = li.find_all("div")
                    # The date cell reads "YYYY-MM-DD ..."; store it as YYYYMMDD.
                    date_parts = div[0].text.split("-")
                    Data_Box.date.append(date_parts[0] + date_parts[1] + date_parts[2])
                    Data_Box.max_temp.append(div[1].text)
                    Data_Box.min_temp.append(div[2].text)
                    Data_Box.weather.append(div[3].text)
                    wind = div[4].text.split(" ")
                    Data_Box.wind_direction.append(wind[0])
                    Data_Box.wind_force.append(wind[1])
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still terminate the crawl.
            except Exception:
                print("该页面爬取失败!")
            else:
                print("该页面爬取成功!")
            try:
                # Write only the rows appended since the last flush.
                # NOTE(review): the original separator chars were lost in
                # transcription (`.split("")` raises ValueError); "℃" matches
                # tianqi.com's temperature markup, and the wind fields are
                # written as-is -- confirm against the live page.
                for i in range(num, len(Data_Box.date)):
                    print(fields[0], fields[1], url.split(" ")[1], Data_Box.date[i],
                          Data_Box.min_temp[i], Data_Box.max_temp[i], Data_Box.weather[i],
                          Data_Box.wind_direction[i], Data_Box.wind_force[i])
                    ios.cw("weather"+str(min)+".txt",
                           fields[0]+" "+fields[1]+" "+Data_Box.date[i]
                           +Data_Box.min_temp[i].split("℃")[0]+" "
                           +Data_Box.max_temp[i].split("℃")[0]+" "
                           +Data_Box.weather[i]+" "
                           +Data_Box.wind_direction[i]+" "
                           +Data_Box.wind_force[i]+"\n")
                # Advance the watermark once per batch (the original reassigned
                # it redundantly on every loop iteration).
                num = len(Data_Box.date)
            except Exception:
                print("写入失败!")
                ios.cw("Fault.txt", str(fields[1]+"写入失败"))
    return "数据获取完毕"


# Entry point: crawl one year at a time, 2011 through 2020 inclusive,
# so each year's rows end up in their own weather<year>.txt file.
if __name__ == "__main__":

    for year in range(2011, 2021):
        get_datas(year, year + 1)

爬取结果:

原文地址:https://www.cnblogs.com/zlc364624/p/14427792.html