pyquery库

一、pyquery库

PyQuery是一个类似于jQuery的网页解析工具,底层使用lxml操作XML和HTML文档,语法与jQuery非常相似。与XPath、Beautiful Soup相比,PyQuery更加灵活,提供了为节点增加class信息、移除某个节点、提取文本信息等功能。PyQuery和Beautiful Soup都是用来解析HTML的库,但PyQuery的CSS选择器更加强大。

安装:

pip install pyquery

1、初始化pyquery对象(下例读取本地HTML文件,以字符串初始化;若要通过网址初始化,可使用pq(url="网址"))

1 from pyquery import PyQuery as pq
2 
3 with open("./html.html","r",encoding="utf-8") as f:
4     html = f.read()
5 
6 # 通过读取到的HTML字符串初始化pyquery对象
7 doc = pq(html)
8 print(type(doc)) #<class 'pyquery.pyquery.PyQuery'>

二、CSS选择器

1、查找元素-子元素

 1 html = """
 2     <div class="collapse navbar-collapse" id="navbar-collapse-2">
 3         <ul class="nav navbar-nav">
 4             <li><a href="/">首页</a></li>
 5             <li><a href="/v2.0/doc/app01.html">系统开发</a></li>
 6             <li><a href="/v2.0/doc/app02.html">课程建设</a></li>
 7             <li><a href="/v2.0/doc/app03.html">资源共享</a></li>
 8             <li><a href="http://www.beian.gov.cn" target="_blank">粤ICP备13085204号-1</a></li>
 9         </ul>
10     </div>"""
11 
12 # 字符串初始化,得到pyquery对象
13 doc = pq(html)
14 items = doc(".nav")
15 list = items.find("li") 
16 child = items.children("li") #子元素

2、查找元素-父元素

 1 html = """
 2     <div class="collapse navbar-collapse" id="navbar-collapse-2">
 3         <ul class="nav navbar-nav">
 4             <li><a href="/">首页</a></li>
 5             <li><a href="/v2.0/doc/app01.html">系统开发</a></li>
 6             <li><a href="/v2.0/doc/app02.html">课程建设</a></li>
 7             <li><a href="/v2.0/doc/app03.html">资源共享</a></li>
 8             <li><a href="http://www.beian.gov.cn" target="_blank">粤ICP备13085204号-1</a></li>
 9         </ul>
10     </div>"""
11 
12 # 查找父元素
13 doc = pq(html)
14 items = doc.find(".nav")
15 parent = items.parent(".collapse")
16 print(parent)

 3、查找元素-兄弟元素

 1 html = """
 2     <div class="collapse navbar-collapse" id="navbar-collapse-2">
 3         <ul class="nav navbar-nav">
 4             <li class="item-0"><a href="/">首页</a></li>
 5             <li class="item-1"><a href="/v2.0/doc/app01.html">系统开发</a></li>
 6             <li class="item-0 active"><a href="/v2.0/doc/app02.html">课程建设</a></li>
 7             <li class="item-1"><a href="/v2.0/doc/app03.html">资源共享</a></li>
 8             <li class="item-2"><a href="http://www.beian.gov.cn" target="_blank">粤ICP备13085204号-1</a></li>
 9         </ul>
10     </div>"""
11 
12 # 查找兄弟元素
13 doc = pq(html)
14 li  = doc(".navbar-nav .item-0.active") #类选择器之间不加空格表示并列,即既满足item-0又满足active
15 print(li.siblings()) #获取除了课程建设标签之外的其他兄弟标签

 4、遍历-多个元素

 1 html = """
 2     <div class="collapse navbar-collapse" id="navbar-collapse-2">
 3         <ul class="nav navbar-nav">
 4             <li class="item-0"><a href="/">首页</a></li>
 5             <li class="item-1"><a href="/v2.0/doc/app01.html">系统开发</a></li>
 6             <li class="item-0 active"><a href="/v2.0/doc/app02.html">课程建设</a></li>
 7             <li class="item-1"><a href="/v2.0/doc/app03.html">资源共享</a></li>
 8             <li class="item-2"><a href="http://www.beian.gov.cn" target="_blank">粤ICP备13085204号-1</a></li>
 9         </ul>
10     </div>"""
11 
12 from pyquery import PyQuery as pq
13 doc = pq(html)
14 element = doc("li").items() #获取多个元素,items()返回一个生成器
15 print(type(element)) #<class 'generator'>
16 for ele in element:
17     print(ele)

三、获取信息

1、获取标签属性--attr()

 1 html = """
 2     <div class="collapse navbar-collapse" id="navbar-collapse-2">
 3         <ul class="nav navbar-nav">
 4             <li class="item-0"><a href="/">首页</a></li>
 5             <li class="item-1"><a href="/v2.0/doc/app01.html">系统开发</a></li>
 6             <li class="item-0 active"><a href="/v2.0/doc/app02.html" name="course">课程建设</a></li>
 7             <li class="item-1"><a href="/v2.0/doc/app03.html">资源共享</a></li>
 8             <li class="item-2"><a href="http://www.beian.gov.cn" target="_blank">粤ICP备13085204号-1</a></li>
 9         </ul>
10     </div>"""
11 
12 from pyquery import PyQuery as pq
13 doc = pq(html)
14 element = doc(".item-0.active a")
15 print(element.attr("name")) #result: course

2、获取文本--text()

 1 html = """
 2     <div class="collapse navbar-collapse" id="navbar-collapse-2">
 3         <ul class="nav navbar-nav">
 4             <li class="item-0"><a href="/">首页</a></li>
 5             <li class="item-1"><a href="/v2.0/doc/app01.html">系统开发</a></li>
 6             <li class="item-0 active"><a href="/v2.0/doc/app02.html" name="course">课程建设</a></li>
 7             <li class="item-1"><a href="/v2.0/doc/app03.html">资源共享</a></li>
 8             <li class="item-2"><a href="http://www.beian.gov.cn" target="_blank">粤ICP备13085204号-1</a></li>
 9         </ul>
10     </div>"""
11 
12 from pyquery import PyQuery as pq
13 doc = pq(html)
14 element = doc(".item-0.active a")
15 print(element.text()) #result:课程建设

3、获取HTML

 1 html = """
 2     <div class="collapse navbar-collapse" id="navbar-collapse-2">
 3         <ul class="nav navbar-nav">
 4             <li class="item-0"><a href="/">首页</a></li>
 5             <li class="item-1"><a href="/v2.0/doc/app01.html">系统开发</a></li>
 6             <li class="item-0 active"><a href="/v2.0/doc/app02.html" name="course">课程建设</a></li>
 7             <li class="item-1"><a href="/v2.0/doc/app03.html">资源共享</a></li>
 8             <li class="item-2"><a href="http://www.beian.gov.cn" target="_blank">粤ICP备13085204号-1</a></li>
 9         </ul>
10     </div>"""
11 
12 from pyquery import PyQuery as pq
13 doc = pq(html)
14 li = doc(".item-0.active")
15 print(li.html()) #result:<a href="/v2.0/doc/app02.html" name="course">课程建设</a>

四、DOM操作

1、增加/删除class---addClass、removeClass

 1 html = """
 2     <div class="collapse navbar-collapse" id="navbar-collapse-2">
 3         <ul class="nav navbar-nav">
 4             <li class="item-0"><a href="/">首页</a></li>
 5             <li class="item-1"><a href="/v2.0/doc/app01.html">系统开发</a></li>
 6             <li class="item-0 active"><a href="/v2.0/doc/app02.html" name="course">课程建设</a></li>
 7             <li class="item-1"><a href="/v2.0/doc/app03.html">资源共享</a></li>
 8             <li class="item-2"><a href="http://www.beian.gov.cn" target="_blank">粤ICP备13085204号-1</a></li>
 9         </ul>
10     </div>"""
11 
12 from pyquery import PyQuery as pq
13 doc = pq(html)
14 li = doc(".item-0.active")
15 print(li) #result:<li class="item-0 active"><a href="/v2.0/doc/app02.html" name="course">课程建设</a></li>
16 
17 remove_attr = li.removeClass("active")
18 print(remove_attr) #result:<li class="item-0"><a href="/v2.0/doc/app02.html" name="course">课程建设</a></li>
19 
20 add_attr = li.addClass("active")
21 print(add_attr) #result:<li class="item-0 active"><a href="/v2.0/doc/app02.html" name="course">课程建设</a></li>

 2、修改标签属性、修改样式---attr、css

 1 html = """
 2     <div class="collapse navbar-collapse" id="navbar-collapse-2">
 3         <ul class="nav navbar-nav">
 4             <li class="item-0"><a href="/">首页</a></li>
 5             <li class="item-1"><a href="/v2.0/doc/app01.html">系统开发</a></li>
 6             <li class="item-0 active"><a href="/v2.0/doc/app02.html" name="course">课程建设</a></li>
 7             <li class="item-1"><a href="/v2.0/doc/app03.html">资源共享</a></li>
 8             <li class="item-2"><a href="http://www.beian.gov.cn" target="_blank">粤ICP备13085204号-1</a></li>
 9         </ul>
10     </div>"""
11 
12 from pyquery import PyQuery as pq
13 doc = pq(html)
14 li = doc(".item-0.active a")
15 
16 update_attr = li.attr("name","cour") #若该标签不存在name属性则新增,若存在则修改该属性
17 print(update_attr) #result:<a href="/v2.0/doc/app02.html" name="cour">课程建设</a>
18 
19 css_tag = li.css("font-size","14px")
20 print(css_tag) #result:<a href="/v2.0/doc/app02.html" name="cour" style="font-size: 14px">课程建设</a>

 3、remove()

 1 html_test = """
 2     <div class="rem">
 3         www.baidu.com,百度一下
 4         <p>《大秦赋》</p>
 5     </div>
 6 """
 7 
 8 from pyquery import PyQuery as pq
 9 doc = pq(html_test)
10 # 只需要获取p标签上面一句话
11 element = doc(".rem")
12 ele = element.find("p").remove() # 找到p标签并移除
13 print(element.text()) #再获取文本
原文地址:https://www.cnblogs.com/yzmPython/p/14103633.html