python爬虫学习之爬取全国各省市县级城市邮政编码

实例需求:运用python语言在http://www.ip138.com/post/网站爬取全国各个省市县级城市的邮政编码,并且保存在excel文件中

实例环境:python3.7
       requests库(第三方库,并非python内置库,需要通过 pip install requests 手动安装)
       xlwt库(需要自己手动安装)

实例网站:

   第一步,在http://www.ip138.com/post/网站通过查询源代码可以找到各个省份的链接

    

     第二步,点击链接,即可看到所点击省份的城市的邮政编码

    

    

实例代码:    

import requests
import xlwt

def getProvinceCode(url):
    """Fetch the province index page and map province names to their URLs.

    The page is expected to contain a ``<map name="map_86" id="map_86">``
    element with one link per line; each link supplies a relative ``href``
    and a ``title`` attribute.

    Args:
        url: Address of the index page to download.

    Returns:
        A dict whose keys are the ``title`` values (province names) and whose
        values are the corresponding absolute URLs. Empty when the map
        element is not present in the page.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url, timeout=10)
    response.encoding = response.apparent_encoding
    content = response.text

    map_open = '<map name="map_86" id="map_86">'
    map_pos = content.find(map_open)
    if map_pos == -1:
        # Without the marker there is nothing meaningful to slice out.
        return {}
    start = map_pos + len(map_open)
    end = content.find('</map>', start)
    if end == -1:
        end = len(content)
    map_body = content[start:end]

    base_url = 'http://www.ip138.com/'
    href_marker = 'href="/'
    title_marker = 'title="'
    dict_prov_url = {}
    # splitlines() copes with both "\n" and "\r\n" line endings.
    for line in map_body.splitlines():
        if href_marker not in line:
            # Blank or unrelated lines carry no province link.
            continue
        code_start = line.find(href_marker) + len(href_marker)
        code_end = line.find('/"')
        province_url = base_url + line[code_start:code_end]
        title_start = line.find(title_marker) + len(title_marker)
        title_end = line.find('"', title_start)
        dict_prov_url[line[title_start:title_end]] = province_url

    for item in dict_prov_url.items():  # show each province name and its URL
        print(item)
    return dict_prov_url

def _cell_text(cell, marker):
    """Return the text of *cell* between *marker* and the first ``</``."""
    begin = cell.find(marker) + len(marker)
    return cell[begin:cell.find('</')]


def _city_record(cells, first, city_marker):
    """Build ``[city, post_code, area_code]`` from three consecutive cells."""
    return [
        _cell_text(cells[first], city_marker),
        _cell_text(cells[first + 1], '">'),
        _cell_text(cells[first + 2], '">'),
    ]


def getPostCode(url):
    """Download one province page and extract its city records.

    The table body after the header row ending in ``长途区号</b></td></tr>``
    is split into rows on ``<tr bgcolor="#ffffff">`` and each row into cells
    on ``<td``. A row may describe one or two cities.

    Args:
        url: Address of the province page.

    Returns:
        A list of ``[city name, postal code, area code]`` lists (all strings).
        The list is also printed through ``showPost`` before being returned.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url, timeout=10)
    response.encoding = response.apparent_encoding
    content = response.text

    code_list = []
    header_tail = '长途区号</b></td></tr>'
    header_pos = content.find(header_tail)
    if header_pos != -1:
        start = header_pos + len(header_tail)
        end = content.find('</table>', start)
        if end == -1:
            end = len(content)
        rows = content[start:end].strip().split('<tr bgcolor="#ffffff">')

        for row in rows:
            if not row:
                continue
            cells = row.strip().split('<td')
            if len(cells) < 5:
                # Too short to classify; indexing cells[4] would raise.
                continue
            if 'nbsp' in cells[4]:
                # The fourth cell is a placeholder, so only one city is read.
                if len(cells) >= 6:
                    if 'nbsp' in cells[5]:
                        code_list.append(_city_record(cells, 1, '>'))
                    # NOTE(review): rows whose sixth cell is not a placeholder
                    # are dropped here, exactly as before — confirm intended.
                else:
                    # Short row: the city name is wrapped in <b>...</b>.
                    code_list.append(_city_record(cells, 1, '<b>'))
            else:
                # Two cities side by side: cells 1-3 and cells 4-6.
                code_list.append(_city_record(cells, 1, '>'))
                if len(cells) >= 7:
                    code_list.append(_city_record(cells, 4, '>'))

    showPost(code_list)
    return code_list

def showPost(code_list):
    """Print every record of the list returned by getPostCode, one per line."""
    for record in code_list:
        print(record)

def write_excel(path):
    """Scrape every province and save the results to an Excel workbook.

    One sheet is created per province returned by getProvinceCode. Row 0 of
    each sheet holds the column captions; the records returned by
    getPostCode follow from row 1 onwards.

    Args:
        path: Destination file the workbook is saved to.
    """
    captions = [u'城市名称', u'邮政编码', u'长途区号']  # header row of each sheet
    workbook = xlwt.Workbook(encoding='utf-8')
    provinces = getProvinceCode('http://www.ip138.com/post/')

    for title, url in provinces.items():
        sheet = workbook.add_sheet(title)
        for column, caption in enumerate(captions):
            sheet.write(0, column, caption)
        # Data rows start below the header, hence start=1.
        for row_number, record in enumerate(getPostCode(url), start=1):
            for column, value in enumerate(record):
                sheet.write(row_number, column, value)

    workbook.save(path)


if __name__ == '__main__':
    # Script entry point: scrape everything into ./postcode.xls, then report.
    write_excel('./postcode.xls')
    print(u'写入postcode.xls文件成功')

实例结果:

  终端显示:

  

   excel文件:

  

原文地址:https://www.cnblogs.com/xiaoyh/p/10011825.html