BeautifulSoup解析库

# Demo: locating tags, attribute values and text with BeautifulSoup's find_all().
from bs4 import BeautifulSoup

# Sample document to parse.
# NOTE(review): the <h2> tag reads `class - "title"` (a dash instead of `=`).
# It is kept exactly as written because the recorded output of this script
# shows `<h2 class="">`; fixing the markup would change what gets printed.
html = """
<!DOCTYPE html>
<html>
    <head>
        <meta charset = "utf-8">
        <title>this is a Demo</title>
    </head>
    <body>
            <div id = "container">
                <div class = "wrapper" bit = "id">
                    <h2 class - "title">Hello World</h2>
                    <p class = "text">Hello,this is a paragraph.</p>
            </div>
                </div>
            <a href = "www.baidu.com">百度</a>
    </body>
</html>
"""

# Build the parse tree using the lxml parser.
soup = BeautifulSoup(html, "lxml")

# find_all() returns a bs4.element.ResultSet, which is a list subclass:
# it prints, indexes and slices like a list, one element per matching tag.
divs = soup.find_all("div")
print(type(divs))
# Prints: <class 'bs4.element.ResultSet'>
print(divs)
for div in divs:
    print(div)
    print("*" * 60)

# Pick one specific tag by index (here: the second <div>).
div1 = divs[1]
print(div1)
print("#" * 60)

# Slice the result to select a range of nodes.
div_select = divs[1:2]  # only the second <div>
for div in div_select:
    print(div)
    print("*" * 60)

# Select tags by attribute.
# Option 1: pass the attribute as a keyword argument.
divs_attribute = soup.find_all("div", id="container")
print(divs_attribute)
print("*" * 60)
# Option 2: pass the attributes as a dict through `attrs`.
divs_attribute = soup.find_all("div", attrs={"id": "container"})
print(divs_attribute)

# Select tags that match several attributes at once.
# An attribute whose name clashes with a Python keyword takes a trailing
# underscore, e.g. class_.
divs = soup.find_all("div", class_="wrapper", bit="id")
print(divs)
print("#" * 60)

# Read an attribute value from a tag.
a = soup.find_all("a")[0]
# Option 1: subscript the tag with the attribute name.
href = a["href"]
print(href)
# Option 2: go through the tag's `attrs` dict.
href = a.attrs["href"]
print(href)

# Read the text of a tag.
# Option 1: .string gives the tag's single text child.
inf = a.string
print(inf)
# Option 2: .strings yields every text node under the tag.
inf = list(a.strings)
print(inf[0])
# .stripped_strings yields the same text with surrounding whitespace removed
# and whitespace-only strings (blank lines, indentation) skipped.

代码运行结果

<class 'bs4.element.ResultSet'>
[<div id="container">
<div bit="id" class="wrapper">
<h2 class="">Hello World</h2>
<p class="text">Hello,this is a paragraph.</p>
</div>
</div>, <div bit="id" class="wrapper">
<h2 class="">Hello World</h2>
<p class="text">Hello,this is a paragraph.</p>
</div>]
<div id="container">
<div bit="id" class="wrapper">
<h2 class="">Hello World</h2>
<p class="text">Hello,this is a paragraph.</p>
</div>
</div>
************************************************************
<div bit="id" class="wrapper">
<h2 class="">Hello World</h2>
<p class="text">Hello,this is a paragraph.</p>
</div>
************************************************************
<div bit="id" class="wrapper">
<h2 class="">Hello World</h2>
<p class="text">Hello,this is a paragraph.</p>
</div>
############################################################
<div bit="id" class="wrapper">
<h2 class="">Hello World</h2>
<p class="text">Hello,this is a paragraph.</p>
</div>
************************************************************
[<div id="container">
<div bit="id" class="wrapper">
<h2 class="">Hello World</h2>
<p class="text">Hello,this is a paragraph.</p>
</div>
</div>]
************************************************************
[<div id="container">
<div bit="id" class="wrapper">
<h2 class="">Hello World</h2>
<p class="text">Hello,this is a paragraph.</p>
</div>
</div>]
[<div bit="id" class="wrapper">
<h2 class="">Hello World</h2>
<p class="text">Hello,this is a paragraph.</p>
</div>]
############################################################
www.baidu.com
www.baidu.com
百度
百度

笨鸟先飞
原文地址:https://www.cnblogs.com/zoutingrong/p/13826435.html