爬虫爬取机场网站的表格

Excel手输估计都比我写爬虫快系列

爬取机场网站出租车费用
https://www.shanghaiairport.com/pdjc/jcjt/index_43742.html

Tag对象

tag.attrs

has_attr()

子节点中，筛选Tag对象和bs4封装的string类型

列表推导式的使用

列表的pop()操作

tag.children迭代器 tag.contents列表

.get_text() .strings 和 .stripped_strings的使用

get_text() 返回的是一个整体字符串，内部的空格不好消除

.strings

.stripped_strings 返回的是迭代器，可用 list() 转换为列表

import numpy
import requests
import bs4
# Request headers: ask for HTML and present a desktop-browser User-Agent.
headers = {
    'Accept': 'text/html',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
}
# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(
    "https://www.shanghaiairport.com/pdjc/jcjt/index_43742.html",
    headers=headers,
    timeout=30,
)
# Fail loudly on an HTTP error instead of parsing an error page as if it
# were the fare table.
response.raise_for_status()
# When the server sends a text/* Content-Type without a charset, requests
# decodes ``response.text`` as ISO-8859-1, which garbles Chinese text; fall
# back to the encoding detected from the body in that case.
if 'charset' not in response.headers.get('Content-Type', '').lower():
    response.encoding = response.apparent_encoding
text = response.text
bs = bs4.BeautifulSoup(text, features='lxml')
# Every <tbody> in the page; each one is handed to parse_table() later.
nodes = bs.find_all('tbody')


def has_attr_class(tag):
    """Report whether ``tag`` carries a ``class`` attribute.

    Delegates to the tag's own ``has_attr`` method and returns its result
    unchanged.
    """
    attribute_name = 'class'
    return tag.has_attr(attribute_name)

def parse_table(node):
    """Extract the rows of one table body as lists of cell strings.

    Walks the direct children of ``node``, skipping the text nodes that sit
    between rows. A row carrying a ``class`` attribute is treated as the
    header row; every other row is treated as data. Each row is echoed to
    stdout as it is parsed.

    Args:
        node: A BeautifulSoup tag whose direct children are the table rows
            (the script passes each ``<tbody>`` found in the page).

    Returns:
        A list with one entry per row, in document order; each entry is the
        list of that row's whitespace-stripped strings.
    """
    # Keep only real tags. Comparing ``child.string`` with ' ' let any other
    # whitespace-only text node through, and text nodes are not tags, so
    # they do not offer the ``has_attr`` method used below.
    rows = [child for child in node.children if isinstance(child, bs4.Tag)]
    array = []
    for tr in rows:
        # stripped_strings yields each text fragment with surrounding
        # whitespace removed and skips blank fragments.
        cells = list(tr.stripped_strings)
        if has_attr_class(tr):
            print("************出租车收费标准***************")
            print("表单字段名字为：")
            print(cells, '\n')
        else:
            # A data row with five strings starts with an extra leading
            # cell (presumably the business-district / scenic-spot category
            # label -- TODO confirm against the page); drop it so the row
            # holds only the remaining four values.
            if len(cells) == 5:
                del cells[0]
            print(cells)
        array.append(cells)
    print('----------------------------------------')
    return array

#原表中有两个表
#浦东机场一号航站楼和二号航站楼
import numpy
import requests
import bs4
# NOTE(review): this repeats the download-and-parse step that already appears
# earlier in this file, so the page is requested twice; consider keeping only
# one copy.
# Request headers: ask for HTML and present a desktop-browser User-Agent.
headers = {
    'Accept': 'text/html',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
}
# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(
    "https://www.shanghaiairport.com/pdjc/jcjt/index_43742.html",
    headers=headers,
    timeout=30,
)
# Fail loudly on an HTTP error instead of parsing an error page as if it
# were the fare table.
response.raise_for_status()
# When the server sends a text/* Content-Type without a charset, requests
# decodes ``response.text`` as ISO-8859-1, which garbles Chinese text; fall
# back to the encoding detected from the body in that case.
if 'charset' not in response.headers.get('Content-Type', '').lower():
    response.encoding = response.apparent_encoding
text = response.text
bs = bs4.BeautifulSoup(text, features='lxml')
# Every <tbody> in the page; each one is handed to parse_table() below.
nodes = bs.find_all('tbody')


def has_attr_class(tag):
    """Tell whether ``tag`` has a ``class`` attribute.

    The answer comes straight from ``tag.has_attr``.
    """
    # NOTE(review): an identical definition appears earlier in this file;
    # this later one is the binding in effect for the code that follows.
    result = tag.has_attr('class')
    return result

def parse_table(node):
    """Extract the rows of one table body as lists of cell strings.

    Walks the direct children of ``node``, skipping the text nodes that sit
    between rows. A row carrying a ``class`` attribute is treated as the
    header row; every other row is treated as data. Each row is echoed to
    stdout as it is parsed.

    Args:
        node: A BeautifulSoup tag whose direct children are the table rows
            (the script passes each ``<tbody>`` found in the page).

    Returns:
        A list with one entry per row, in document order; each entry is the
        list of that row's whitespace-stripped strings.
    """
    # NOTE(review): a function with this name is also defined earlier in
    # this file; this later definition is the one the loop below calls.
    # Keep only real tags. Comparing ``child.string`` with ' ' let any other
    # whitespace-only text node through, and text nodes are not tags, so
    # they do not offer the ``has_attr`` method used below.
    rows = [child for child in node.children if isinstance(child, bs4.Tag)]
    array = []
    for tr in rows:
        # stripped_strings yields each text fragment with surrounding
        # whitespace removed and skips blank fragments.
        cells = list(tr.stripped_strings)
        if has_attr_class(tr):
            print("************出租车收费标准***************")
            print("表单字段名字为：")
            print(cells, '\n')
        else:
            # A data row with five strings starts with an extra leading
            # cell (presumably the business-district / scenic-spot category
            # label -- TODO confirm against the page); drop it so the row
            # holds only the remaining four values.
            if len(cells) == 5:
                del cells[0]
            print(cells)
        array.append(cells)
    print('----------------------------------------')
    return array

# The source page contains two tables: Pudong Airport Terminal 1 and
# Terminal 2. Each parsed table is written to its own numbered text file
# (1.txt, 2.txt, ...).
# NOTE(review): the output path below is kept exactly as found; it looks like
# a Windows path whose directory separators were lost -- confirm the intended
# destination before running.
# enumerate() supplies the 1-based file number; the former module-level
# ``global i`` statement had no effect and has been dropped.
for i, node in enumerate(nodes, start=1):
    array = parse_table(node)
    numpy.savetxt(r"E:360MoveDataUsershzsdlDesktop\%d.txt" % i, array, delimiter=',', fmt='%s')

追风少年