使用python爬取国家民政部最新的省份代码的程序,requests,beautifulsoup,lxml

使用的python3.6

民政网站,不同年份数据可能页面结构不一致,这点踩了很多坑,这也是代码越写越长的原因。

如果以后此段代码不可用,请再仔细检查一下页面结构是否发生了变更。

  1 # -*- coding: utf-8 -*-
  2 """
  3 Created on Wed Jul 10 14:40:41 2019
  4 
  5 @author: Administrator
  6 """
  7 
  8 import pandas as pd
  9 import requests 
 10 from bs4 import BeautifulSoup
 11 import time 
 12 
 13 url1 = 'http://www.mca.gov.cn/article/sj/xzqh//1980/'
 14 headers = {'content-type': 'application/json',
 15                'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'}
 16 
 17 # 1. 获取所有链接========================================================================
 18 def f1(url1):
 19     '2018-1980年中华人民共和国行政区划代码 的所有链接'
 20     #requests发出请求,设置url,header参数
 21     response = requests.get(url1, headers=headers, timeout=200, verify=False)
 22     soup = BeautifulSoup(response.text,'lxml') #将网页源码返回为BeautifulSoup类型
 23     _tmp1 = soup.select('td.arlisttd')
 24     end_1 = []
 25     for i in _tmp1:
 26         _a = i.select('a')[0].get('href')
 27         _b = i.select('a')[0].get('title')[:4]
 28         end_1.append(['http://www.mca.gov.cn'+_a,_b])
 29     return end_1
 30 
 31 end_2=[]
 32 for i in ['','?2','?3']:
 33     end_2 = end_2+f1(url1+i)
 34     
 35     
 36 def f2(url1='http://www.mca.gov.cn/article/sj/xzqh/2019/'):
 37     '2019年中华人民共和国行政区划代码'
 38     response = requests.get(url1, headers=headers, timeout=200, verify=False)
 39     soup = BeautifulSoup(response.text,'lxml')
 40     _tmp1 = soup.select('td.arlisttd')
 41     end_1 = []
 42     for i in _tmp1:
 43         _a = i.select('a')[0].get('href')
 44         _b = i.select('a')[0].get('title')[:7]
 45         end_1.append(['http://www.mca.gov.cn'+_a,_b])
 46     return end_1
 47 
 48 end_2 = end_2+f2()
 49 
 50 # 2. 获取数据========================================================================
 51 def f3(url1='http://www.mca.gov.cn/article/sj/xzqh/1980/201903/20190300014989.shtml'):
 52     #url1='http://www.mca.gov.cn/article/sj/xzqh/1980/201507/20150715854922.shtml'
 53     #url1='http://www.mca.gov.cn/article/sj/xzqh/1980/201507/20150715854918.shtml'
 54     #
 55     response = requests.get(url1, headers=headers, timeout=200, verify=False)
 56     soup = BeautifulSoup(response.text,'lxml')
 57     _txt = soup.select('script')[4].get_text().strip().replace('window.location.href="','').strip('";')
 58     if _txt[-4:]=='html':
 59         print('script!')
 60         url2 = _txt
 61     else:
 62         _tmp1 = soup.select('div.artext > div > p > a')
 63         if len(_tmp1)==0:
 64             _tmp1 = soup.select('div#zoom > a')
 65         url2 = _tmp1[0].get('href')
 66     print(url2)
 67     #return url2
 68     #url2='http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220856.html'
 69     time.sleep(0.5)
 70     response = requests.get(url2, headers=headers, timeout=200, verify=False)
 71     #将网页源码返回为BeautifulSoup类型
 72     soup = BeautifulSoup(response.text,'lxml')
 73     _tmp1 = soup.select('table > tr[height="19"]')
 74     end_1 = []
 75     if len(_tmp1)>5:
 76         for i in _tmp1:
 77             _a = i.select('td')[1].get_text().strip()
 78             if len(_a)>15: #部分数据页面,最后一行是备注。
 79                 continue
 80             else:
 81                 _b = i.select('td')[2].get_text().strip()
 82                 end_1.append([_a,_b])
 83     else:
 84         _tmp1 = soup.select('table > tr[height="20"]')
 85         for i in _tmp1:
 86             _a = i.select('td')[0].get_text().strip()
 87             if len(_a)>15 or _a=='行政区划代码': #部分数据页面,最后一行是备注。
 88                 continue
 89             else:
 90                 _b = i.select('td')[1].get_text().strip()
 91                 end_1.append([_a,_b])
 92     
 93     return end_1
 94 
 95 #循环对每个链接 获取数据
 96 end_3=[];#end_4=[]
 97 for j in range(len(end_2)):
 98     item = end_2[j]
 99     if '19'  in item[1] or '20'  in item[1]:
100         print(j,item[0],item[1])
101         tmp2 = f3(item[0])
102         print('.')
103         end_3.extend([[item[1]]+i for i in tmp2])
104         #end_4.append(tmp2)
105         time.sleep(0.1)
106     
107 df_result = pd.DataFrame(end_3)
108 #pd.DataFrame(end_4).to_excel('所有连接.xlsx',index=False)
109 df_result.to_excel('地区编码.xlsx',index=False)
110 
111 
112 '''
113 #3 2019年5月份县以上行政区划代码_3852 > table > tbody > tr:nth-child(4)
114 #list_content > div.list_right > div > ul > table > tbody > tr:nth-child(1) > td.arlisttd > a
115 '''
原文地址:https://www.cnblogs.com/andylhc/p/11490563.html