17 中国天气网信息爬取

 1 """中国天气网爬虫"""
 2 
 3 import requests
 4 from bs4 import BeautifulSoup
 5 
 6 HEADERS = {
 7     'User-Agent': 'Mozilla/5.0',
 8 }
 9 
def parse_detail_page(url, is_html5lib):
    """Fetch one forecast page and print each city's minimum temperature.

    The page is downloaded, decoded as UTF-8 and parsed; every table inside
    the first ``div.conMidtab`` element is walked and one
    ``{'city': ..., 'min_temp': ...}`` dict is printed per data row,
    followed by a separator line of 40 ``=`` characters.

    Args:
        url: Address of the page to download.
        is_html5lib: Truthy to parse with ``html5lib``, falsy to parse
            with ``lxml``.

    Raises:
        requests.RequestException: If the download fails, times out or
            the server answers with an HTTP error status.
        IndexError: If the page contains no ``div.conMidtab`` element.
    """
    # A timeout keeps a stalled server from blocking the crawl forever, and
    # the status check stops an error page from being parsed as data.
    response = requests.get(url, headers=HEADERS, timeout=10)
    response.raise_for_status()
    text = response.content.decode('utf-8')

    parser = 'html5lib' if is_html5lib else 'lxml'
    soup = BeautifulSoup(text, parser)

    containers = soup.find_all('div', attrs={'class': 'conMidtab'})
    if not containers:
        raise IndexError("no 'conMidtab' container found in %s" % url)

    for table in containers[0].find_all('table'):
        # The first two rows of each table are skipped (presumably header
        # rows -- confirm against the live markup).
        rows = table.find_all('tr')[2:]
        for index, row in enumerate(rows):
            cells = row.find_all('td')
            if len(cells) < 2:
                # Too short to hold both a city and a temperature cell.
                continue
            # In the first data row the city sits in the second cell,
            # presumably because a leading cell spans the whole table.
            city_cell = cells[1] if index == 0 else cells[0]
            temp_cell = cells[-2]
            city = next(city_cell.stripped_strings, None)
            min_temp = next(temp_cell.stripped_strings, None)
            if city is None or min_temp is None:
                # Skip rows whose cells carry no text instead of crashing.
                continue
            # Emit the city together with its minimum temperature.
            print({'city': city, 'min_temp': min_temp})

    print("="*40)
38 
def get_detail_urls(url, base_url):
    """Collect the links to the regional forecast pages.

    The links are read from the first ``ul.lq_contentboxTab2`` element of
    the page at ``url``; per the original author these are the North,
    Northeast, East, Central, South, Northwest and Southwest China pages
    plus the Hong Kong/Macao/Taiwan page.

    Args:
        url: Address of the page that carries the region links.
        base_url: Prefix prepended verbatim to every ``href`` found.

    Returns:
        A list of URL strings in document order.

    Raises:
        requests.RequestException: If the download fails, times out or
            the server answers with an HTTP error status.
        IndexError: If the page contains no ``ul.lq_contentboxTab2``
            element.
    """
    # A timeout keeps a stalled server from blocking the crawl forever, and
    # the status check stops an error page from being parsed as data.
    response = requests.get(url, headers=HEADERS, timeout=10)
    response.raise_for_status()
    text = response.content.decode('utf-8')
    soup = BeautifulSoup(text, 'lxml')

    nav_lists = soup.find_all('ul', class_='lq_contentboxTab2')
    if not nav_lists:
        raise IndexError("no 'lq_contentboxTab2' list found in %s" % url)

    # href=True ignores anchors without a link target, which would
    # otherwise raise KeyError on the lookup below.
    anchors = nav_lists[0].find_all('a', href=True)
    return [base_url + anchor['href'] for anchor in anchors]
54 
def spider():
    """Crawl every regional forecast page and print its temperatures.

    The region links are discovered from the start page, then each linked
    page is handed to ``parse_detail_page``. Only the last link is parsed
    with html5lib; all others use lxml.
    """
    base_url = "http://www.weather.com.cn"
    # Start page from which the regional links are collected.
    src_url = "http://www.weather.com.cn/textFC/hb.shtml"

    detail_urls = get_detail_urls(src_url, base_url)
    last_position = len(detail_urls) - 1
    for position, detail_url in enumerate(detail_urls):
        # The final page (http://www.weather.com.cn/textFC/gat.shtml per the
        # original author) yields wrong data unless parsed with html5lib.
        use_html5lib = position == last_position
        parse_detail_page(detail_url, use_html5lib)
72 
# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    spider()
原文地址:https://www.cnblogs.com/sruzzg/p/13096959.html