python爬虫之BeautifulSoup4介绍

CSS 选择器：BeautifulSoup4

例子：

response = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

四大对象种类

1. Tag

注：

soup = BeautifulSoup(response,"html.parser")

print(soup.p) #查找的是第一个符合要求的标签

2. NavigableString

3. BeautifulSoup

4. Comment

遍历文档树

1. 直接子节点：.contents .children 属性

2. 所有子孙节点: .descendants 属性

3. 节点内容: .string 属性

搜索文档树

1.find_all(name, attrs, recursive, text, **kwargs)

注：

print(soup.find_all("a")) #查找所有的<a>标签

print(soup.find_all(["a","b","p"])) #返回所有的<a>标签、<b>标签和<p>标签

print(soup.find_all(id="link1"))

print(soup.find_all(text="Elsie")) #返回文本等于Elsie的内容

2. CSS选择器

（1）通过标签名查找

print(soup.select('a'))

（2）通过类名查找

print(soup.select('.sister'))

#[<a class="sister" href="http://example.com/elsie" id="link1"></a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

（3）通过 id 名查找

print(soup.select("#link1"))

#[<a class="sister" href="http://example.com/elsie" id="link1"></a>]

（4）组合查找

print(soup.select('p #link1')) #p标签下ID等于link1的内容，之间空格分开

print(soup.select("head > title")) #直接子标签查找

（5）属性查找

print(soup.select('a[class="sister"]')) #属性和标签属于同一节点，所以中间不能加空格

print(soup.select('p a[href="http://example.com/elsie"]'))

(6) 获取内容

print (soup.select('title')[0].get_text())

print(soup.select("title")[0].text)

============================================================================================

name = (film.select("[title]")[0].text)    #肖申克的救赎

===========================================================================================

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup

# Demo script: parses a small HTML document and prints the results of several
# BeautifulSoup lookups.
# A BeautifulSoup tree is made of four kinds of objects: Tag | NavigableString | BeautifulSoup | Comment

# Sample document; the same string is bound to both `response` and `html`.
response = html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse" title="标题"><b>The Dormouse's story中文</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(response,"html.parser")
# Tag objects and their attributes: soup.<name> finds the first tag that matches
print("soup.a.attrs:",soup.a.attrs)
print("soup.p:",soup.p)
print('soup.p["class"]:',soup.p["class"])
print('soup.p.get("class"):',soup.p.get("class"))

## NavigableString
print('soup.p.string:',soup.p.string)   # get the text content inside the tag

# BeautifulSoup
# Most of the time it can be treated like a Tag object; its type and name can be retrieved the same way
print(soup.attrs) # the document itself has no attributes, so this is empty

# Comment
# A Comment object is a special kind of NavigableString; its output does not include the comment markers
# The first <a> in the sample document contains only the comment <!-- Elsie -->.
print(soup.a)
print(soup.a.string)

# Traversing the document tree
print(soup.head.contents)  # the direct children, returned as a list
print(soup.head.children)  # an iterator object rather than a list; loop over it to visit every direct child
print("++++++++++++++++")
for child in soup.body.children:
    print(child)
print("===============")
# CSS selectors: "[title]" matches elements that carry a title attribute
# (in the sample document, the first <p>); [0] takes the first match.
print(soup.select("[title]")[0].text)
# print(soup.find_all(["a","b","p"]))
# The <title> tag itself, then its text via .text and via get_text().
print(soup.select('title')[0])
print(soup.select('title')[0].text)
print(soup.select('title')[0].get_text())