Python Scraping: bs4 Parsing in Practice

1. Common methods

from bs4 import BeautifulSoup

html = """
<table class="tablelist" cellpadding="0" cellspacing="0">
    <tr class="h">
        <td class="l" width="374">职位名称</td>
        <td>职位类别</td>
        <td>人数</td>
        <td>地点</td>
        <td>发布时间</td>
    </tr>
    <tr class="even">
        <td class="l square"><a target="_blank" href="position_detail.php?id=45021&keywords=python&tid=0&lid=0">22989-腾讯云计费PHP高级开发工程师</a></td>
        <td>技术类</td>
        <td>2</td>
        <td>深圳</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="odd">
        <td class="l square"><a target="_blank" href="position_detail.php?id=45005&keywords=python&tid=0&lid=0">25663-腾讯云高级后台开发(互联网业务)(北京)</a></td>
        <td>技术类</td>
        <td>1</td>
        <td>北京</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="even">
        <td class="l square"><a target="_blank" href="position_detail.php?id=45007&keywords=python&tid=0&lid=0">TEG06-云计算架构师(深圳)</a></td>
        <td>技术类</td>
        <td>1</td>
        <td>深圳</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="odd">
        <td class="l square"><a target="_blank" href="position_detail.php?id=44980&keywords=python&tid=0&lid=0">PCG04-PCG研发部数据科学家(深圳/北京)</a></td>
        <td>技术类</td>
        <td>1</td>
        <td>深圳</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="even">
        <td class="l square"><a target="_blank" href="position_detail.php?id=44981&keywords=python&tid=0&lid=0">PCG04-PCG研发部业务运维工程师(深圳)</a></td>
        <td>技术类</td>
        <td>1</td>
        <td>深圳</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="odd">
        <td class="l square"><a target="_blank" href="position_detail.php?id=44971&keywords=python&tid=0&lid=0">23674-腾讯新闻大数据分析工程师(北京)</a></td>
        <td>技术类</td>
        <td>2</td>
        <td>北京</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="even">
        <td class="l square"><a target="_blank" href="position_detail.php?id=44964&keywords=python&tid=0&lid=0">TEG05-高级数据挖掘工程师(深圳)</a></td>
        <td>技术类</td>
        <td>2</td>
        <td>深圳</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="odd">
        <td class="l square"><a target="_blank" href="position_detail.php?id=44968&keywords=python&tid=0&lid=0">PCG01-QQ后台推荐算法工程师</a></td>
        <td>技术类</td>
        <td>1</td>
        <td>深圳</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="even">
        <td class="l square"><a target="_blank" href="position_detail.php?id=44969&keywords=python&tid=0&lid=0">PCG01-QQ后台大数据开发工程师</a></td>
        <td>技术类</td>
        <td>1</td>
        <td>深圳</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="odd">
        <td class="l square"><a target="_blank" href="position_detail.php?id=44952&keywords=python&tid=0&lid=0">22989-腾讯云AI产品高级咨询顾问(深圳北京)</a></td>
        <td>技术类</td>
        <td>1</td>
        <td>深圳</td>
        <td>2018-10-23</td>
    </tr>
</table>    
"""

soup = BeautifulSoup(html, "lxml")
# 1. Find all tr tags
# trs = soup.find_all("tr")
# 2. Find the second tr tag; limit caps how many matches are returned,
#    then index the result list to get a specific tag
# tr = soup.find_all("tr", limit=2)[1]
# 3. Find all tr tags whose class is "even"; "class" clashes with the
#    Python keyword, so bs4 uses class_ with a trailing underscore
# trs = soup.find_all("tr", class_="even")
# 4. attrs takes any number of attributes as key-value pairs
# trs = soup.find_all("tr", attrs={"class": "even"})
# 5. Find all a tags that have target="_blank"; several keyword
#    arguments can be combined
# aList = soup.find_all("a", target="_blank")
# 6. Get the href attribute of every a tag
# aList = soup.find_all("a")
# for a in aList:
    # 1. Via subscript access
    # href = a["href"]
    # 2. Via the attrs dict
    # href = a.attrs["href"]
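    # 3. Via .get(), which returns None instead of raising KeyError
    #    when the attribute is missing (a defensive alternative):
    # href = a.get("href")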
# Extract every job posting, skipping the header row
trs = soup.find_all("tr")[1:]
jobs = []
for tr in trs:
    job = {}
    # tds = tr.find_all("td")
    # title = tds[0].string
    # category = tds[1].string
    # nums = tds[2].string
    # city = tds[3].string
    # pubtime = tds[4].string
    # job["title"] = title
    # job["category"] = category
    # job["nums"] = nums
    # job["city"] = city
    # job["pubtime"] = pubtime
    # jobs.append(job)
    # Grab all whitespace-stripped text fragments in document order
    infos = list(tr.stripped_strings)
    job["title"] = infos[0]
    job["category"] = infos[1]
    job["nums"] = infos[2]
    job["city"] = infos[3]
    job["pubtime"] = infos[4]
    jobs.append(job)
print(jobs)
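A quick follow-up sketch against the same soup: find() is the single-result counterpart of find_all(), and get_text() is an alternative to stripped_strings when one joined string is enough.

first_tr = soup.find("tr")  # equivalent to soup.find_all("tr", limit=1)[0]
# get_text() joins every text fragment; separator and strip are optional
print(first_tr.get_text(separator="|", strip=True))
# 职位名称|职位类别|人数|地点|发布时间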

2. CSS selector methods

# 1. Get all tr tags
# trs = soup.select("tr")
# 2. Get the second tr tag
# tr = soup.select("tr")[1]
# 3. Get all tr tags whose class is "even"
# trs = soup.select("tr.even")
# trs = soup.select("tr[class='even']")
# 4. Get the href attribute of every a tag
# aList = soup.select("a")
# for a in aList:
#     print(a["href"])
# 5. Extract all the job information
# trs = soup.select("tr")
# for tr in trs:
#     infos = list(tr.stripped_strings)
#     print(infos)
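Two select() variants not shown above, sketched against the same soup as in section 1: select_one() returns the first match (or None), and CSS combinators can replace an explicit loop over tags.

first_even = soup.select_one("tr.even")              # first match or None
# Child combinator: a tags directly inside the job-title cells
hrefs = [a["href"] for a in soup.select("td.l > a")]
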
from bs4 import BeautifulSoup

html = """
<div>
<!--我是div-->
</div>    
"""

# find() essentially returns a Tag instance; all navigation below is
# done through Tag methods
soup = BeautifulSoup(html, "lxml")
div = soup.find("div")
print(type(div))    # <class 'bs4.element.Tag'>
# .string returns the tag's only child string; here the div has several
# children spread across lines, so .string is None and nothing useful prints
print(div.string)
# .contents returns all direct children as a list
print(div.contents)
# .children returns all direct children as an iterator
print(div.children)
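
A follow-up sketch: .string is None above because the div has three direct children (a newline, the comment node, a newline), and the comment node comes back as bs4.element.Comment, a NavigableString subclass.

from bs4.element import Comment

for child in div.children:
    print(repr(child))                          # '\n', the comment, '\n'
print(isinstance(div.contents[1], Comment))     # True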

3. Scraping the China Weather Network and charting the results

"""中国天气网爬取并视图显示最低气温城市"""
import requests
from bs4 import BeautifulSoup
from pyecharts import Bar  # pyecharts 0.x API (a 1.x sketch follows this section)

HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36"}
ALL_DATA = []


def detail_urls(url):
    rep = requests.get(url=url, headers=HEADERS)
    text = rep.content.decode(encoding="utf-8")
    # The Hong Kong/Macau/Taiwan page has incomplete table markup, so use
    # the more forgiving html5lib parser to repair the table tags
    soup = BeautifulSoup(text, "html5lib")
    # Find the first div whose class is conMidtab
    conmidtab = soup.find("div", class_="conMidtab")
    # Find every table inside that div
    tables = conmidtab.find_all("table")
    # Walk each table (one per region)
    for table in tables:
        # Skip the two header rows
        trs = table.find_all("tr")[2:]
        # In the first data row the province name occupies the first cell,
        # so the city cell shifts by one column
        for index, tr in enumerate(trs):
            tds = tr.find_all("td")
            city_td = tds[0]
            if index == 0:
                city_td = tds[1]
            # Collect the stripped text fragments and take the first one
            city = list(city_td.stripped_strings)[0]
            min_temp_td = tds[-2]
            min_temp = list(min_temp_td.stripped_strings)[0]
            max_temp_td = tds[-5]
            max_temp = list(max_temp_td.stripped_strings)[0]
            ALL_DATA.append({"city": city, "min_temp": int(min_temp), "max_temp": int(max_temp)})


def spider():
    base_url = "http://www.weather.com.cn/textFC/{}.shtml"
    # Only a handful of region pages, so list them directly
    address = ["hb", "db", "hd", "hz", "hn", "xb", "xn", "gat"]
    for area in address:
        url = base_url.format(area)
        # Hand each generated URL to the parsing function; results
        # accumulate in ALL_DATA
        detail_urls(url)
    ALL_DATA.sort(key=lambda data: data["min_temp"])
    datas = ALL_DATA[0:10]
    cities = list(map(lambda x: x["city"], datas))
    min_temp = list(map(lambda x: x["min_temp"], datas))
    max_temp = list(map(lambda x: x["max_temp"], datas))
    bar = Bar("中国最低气温排行榜")
    bar.add("最低气温", cities, min_temp, mark_line=["average"], mark_point=["max", "min"])
    bar.add("最高气温", cities, max_temp, mark_line=["average"], mark_point=["max", "min"])
    bar.render("temperature.html")


if __name__ == '__main__':
    spider()
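Note that the Bar import above uses the pyecharts 0.x API. Under pyecharts 1.x and later, the same chart would be built roughly as follows; this is a sketch against the 1.x API, not part of the original script.

from pyecharts.charts import Bar
from pyecharts import options as opts

bar = (
    Bar()
    .add_xaxis(cities)
    .add_yaxis("Min temp", min_temp)
    .add_yaxis("Max temp", max_temp)
    .set_global_opts(title_opts=opts.TitleOpts(title="China lowest-temperature ranking"))
)
bar.render("temperature.html")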

4. Summary

"""由于网络的不确定性,要保持一个程序的正常运行就得在代码中处理好
各种可能会发生的异常,以确保程序正常运行"""
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup


def getTitle(url):
    try:
        # Request-level errors, such as the page failing to load,
        # are caught here and turned into a friendly None result
        html = urlopen(url)
    except HTTPError:
        return None
    try:
        """print(html)
        from http.client import HTTPResponse
        调用HTTPResponse的read()方法,返回bytes类型数据
        print(type(html.read()))
        pycharmIDE 命令ctrl+b 进入BeautifulSoup源码,查看所需参数,
        第一个为请求返回结果,第二个为解析返回数据的解析器,可选择lxml,html5lib等解析器"""
        htmlTag = BeautifulSoup(html.read(), "html.parser")
        # 标签选择器,选择h1标签
        title = htmlTag.body.h1
    except AttributeError as e:
        # 页面可能没有这个标签属性,进行捕捉,并返回友好形式
        return None
    # Success: return the extracted result
    return title

# Call the function and handle its return value
title = getTitle("http://jandan.net/")
# Branch on whether extraction succeeded
if title is None:
    print("Title could not be found")
else:
    # Print the extracted title
    print(title)
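The same defensive pattern can be written with the requests library instead of urllib; this is a sketch of an alternative, not the original author's code.

import requests
from bs4 import BeautifulSoup


def get_title_requests(url):
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()  # turn HTTP error status codes into exceptions
    except requests.RequestException:
        return None
    soup = BeautifulSoup(resp.text, "html.parser")
    # find() returns None instead of raising when the tag is absent,
    # so no AttributeError handling is needed here
    return soup.find("h1")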

Intro to the map() function: https://www.cnblogs.com/superxuezhazha/p/5714970.html
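
For quick reference, the map() calls in section 3 just pull one field out of each dict; a list comprehension is an equivalent spelling:

datas = [{"city": "深圳", "min_temp": 5}, {"city": "北京", "min_temp": -3}]
cities = list(map(lambda d: d["city"], datas))   # ['深圳', '北京']
cities = [d["city"] for d in datas]              # equivalent comprehension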

Original post: https://www.cnblogs.com/Guishuzhe/p/9835859.html