# Scrape web page data with BeautifulSoup (用 BeautifulSoup 抓取网页数据)

# from urllib import request
from bs4 import BeautifulSoup
#
# req = request.Request("http://www.hngp.gov.cn/wsscnew/egp/public/gg_spzsxx/SpxhMainTab.html?xhbh=ff8080815c04a864015c596c4c177699&xmxh=null&area=00390019&xyghbh=ff80808151561b4701517a3e43825e4f&lastcgsl=0&cgje=0.0&lastcgje=0.0&cgsl=0&isnwwbz=ww&czy=null&lbbs=null")
# req.add_header("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36")
# res = request.urlopen(req)
# res = request.urlopen(req)
#
# #print(res.read().decode("utf-8"))
# res = res.read()
#
# # doc = open("aa.html","wb+")
# # doc.write(res)
# soup = BeautifulSoup(res, 'html.parser', from_encoding='utf-8')
# #title = soup.strong
#
# company = soup.find_all(target='_Blank')
#
# print(company)
from urllib import request,parse

# POST form payload for the Tapestry form endpoint (SpxhMainTab,form.sdirect).
# urlencode percent-encodes non-ASCII values (e.g. the brand name) as UTF-8;
# numeric values are stringified automatically.
_form_fields = {
    'formids': 'If,sl,jbcsPage,ghsPage,jgqsPage,picPage,spxqPage,Xzsp,Gwc,Xmxx,Dzdd,Ddys,selgys',
    'submitmode': '',
    'submitname': '',
    'If': 'F',
    'xhbh': 'ff8080815c04a864015c596c4c177699',
    'area': '00390019',
    'ppmc': '联想',
    'czy': '',
    'scjg': 4126.0,
    'zdjg': 4126.0,
    'xyghbh': 'ff80808151561b4701517a3e43825e4f',
    'xmxh': '',
    'lastcgsl': '',
    'cgje': 0,
    'lastcgje': 0,
    'cgsl': 0,
    'isnwwbz': 'ww',
    'lbbs': '',
    'gysdqzdbj': '4126.0',
    'ghsmc': '点击选择供应商',
    'sl': 0,
    'ghsPage': '供货商',
}

# Dicts preserve insertion order, so the encoded field order matches the form.
login_data = parse.urlencode(_form_fields)

# Build the POST request for the form endpoint and attach the headers the
# site expects: session cookie, origin/referer, and a mobile user agent.
_headers = {
    'Origin': 'http://www.hngp.gov.cn',
    'Cookie': 'JSESSIONID=E6738337F2A4BAE45C6127C732DA7D54',
    'User-Agent': 'Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25',
    'Referer': 'http://www.hngp.gov.cn/wsscnew/egp/public/gg_spzsxx/SpxhMainTab.html?xhbh=ff8080815c04a864015c596c4c177699&xmxh=null&area=00390019&xyghbh=ff80808151561b4701517a3e43825e4f&lastcgsl=0&cgje=0.0&lastcgje=0.0&cgsl=0&isnwwbz=ww&czy=null&lbbs=null',
}

# Request(headers=...) installs each entry via add_header(), so this is
# equivalent to calling add_header() once per header.
req = request.Request(
    'http://www.hngp.gov.cn/wsscnew/egp/public/gg_spzsxx/SpxhMainTab,form.sdirect',
    headers=_headers,
)

# POST the urlencoded form (UTF-8 bytes) to the server and open the response.
res = request.urlopen(req,data=login_data.encode('utf-8'))

# Read the full response body (bytes) before handing it to BeautifulSoup.
res = res.read()

# Parse with the stdlib html.parser backend, decoding the bytes as UTF-8.
soup = BeautifulSoup(res, 'html.parser', from_encoding='utf-8')
tr_list = soup.findAll('tr')

# Walk every table row, pulling price (column 4) and seller name (column 5),
# and stop at the first row whose seller is not '韦玮'.
for tr in tr_list:
    # BUG FIX: the original called tr_list[1].findAll('td') on every pass,
    # inspecting the same fixed row instead of the current one.
    td_list = tr.findAll('td')

    # Header or malformed rows have no (or too few) <td> cells — skip them
    # instead of raising IndexError.
    if len(td_list) < 6:
        continue

    price = td_list[4].getText()
    name = td_list[5].getText()

    if name != '韦玮':
        # Nothing follows this loop, so breaking out is equivalent to the
        # original exit() while keeping the script importable.
        break
#print(data)
# Original article (原文地址): https://www.cnblogs.com/hanshuai0921/p/7903293.html