Python-beautifulsoup库

 1 #beautifulsoup库的安装
 2 pip install beautifulsoup4
 3 python -m pip install --upgrage pip
 4 from bs4 import BeautifulSoup
 5 
 6 #----------------beautifulsoup库的使用--------------------------------------
 7 import requests
 8 from bs4 import BeautifulSoup
 9 url = "http://python123.io/ws/demo.html"
10 r = requests.get(url)
11 # print(r.text)
12 demo = r.text
13 soup = BeautifulSoup(demo,"html.parser") #熬一锅`粥
14 #print(soup.prettify()) #打印这锅粥
15 
16 #下行遍历函数：.contents()  .children()用于循环 .descendants()
17 soup.head #获取head标签
18 soup.head.contents #获取head的子节点，返回类型是列表
19 soup.body.contents #
20 len(soup.body.contents)  #terurn 5
21 soup.body.contents[2]
22 print('以下输出子节点：')
23 for child in soup.body.children:
24     print('##',child)
25 print('以下输出子孙节点：')
26 for child in soup.body.descendants:
27     print('**',child)
28 
29 #---上行遍历 .parent  .parents(用于循环)
30 soup.title.parent  #return  <head><title>This is a python demo page</title></head>
31 soup.html.parents     #返回 html所有内容
32 soup.parent         #返回为空
33 print('以下输出父节点：')
34 for par in soup.a.parents:
35     if par is None:
36         print('$$$',par)
37     else:
38         print('%',par.name)
39 
40 #----平行遍历----
41 # 向后.next_sibling    向前.previous_sibling    加 s 用于遍历
42 #title 与 p标签 不构成平行关系
43 soup.a.next_sibling #return ' and ' 所以<a>标签的下一个标签不一定是<a>标签，需要判断
44 soup.a.next_sibling.next_sibling #return <a ...</a>
45 
46 soup.a.previous_sibling
47 soup.a.previous_sibling.previous_sibling
48 print('以下输出下行遍历：')
49 for sibling in soup.a.next_siblings:
50     print('##',sibling)
51 print('以下输出上行遍历：')    
52 for sibling in soup.a.previous_siblings:
53     print('**',sibling)
转载仅为学习，不会商用。
欢迎转载原创，附文链接。