pyquery解析库

语法和 jQuery 几乎一致

安装

conda install pyquery

一、初始化

标准用法

from pyquery import PyQuery as pq
import requests

#
# Fetch the page with requests first, then hand the HTML text to PyQuery.
# A timeout keeps the script from hanging forever on an unresponsive server.
r = requests.get(url='http://www.baidu.com', timeout=10)
# Fail loudly on a 4xx/5xx answer instead of silently parsing an error page.
r.raise_for_status()
# When the Content-Type header carries no charset, requests falls back to
# ISO-8859-1 for text responses, which garbles non-ASCII pages; in that case
# use the encoding detected from the response body instead.
if 'charset' not in r.headers.get('Content-Type', '').lower():
    r.encoding = r.apparent_encoding

html_doc = pq(r.text)
print(html_doc)
# Select every <a> inside the element whose id is "u1".
print(html_doc('#u1 a'))

1、字符串初始化(最常用)

from pyquery import PyQuery as pq

# Sample markup held in memory; PyQuery is initialised straight from this string.
markup = '''<div>
    <ul id = 'haha'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
     </div>'''

# Build the document once, then show its serialised form followed by the
# Python type of the wrapper object.
parsed = pq(markup)
print(parsed)
print(type(parsed))

2、url初始化

from pyquery import PyQuery as pq

#


# Let PyQuery download the page itself by passing the address as `url`.
page = pq(url='http://www.baidu.com')
print(page)
# Narrow the document down to the <a> elements under the node with id "u1".
nav_links = page('#u1 a')
print(nav_links)

注意:一般先通过 requests 模块或 urllib 获取网页的 html,再交给解析模块去解析

3、文件初始化

from pyquery import PyQuery as pq

#


# Initialise PyQuery from a local file named test.html and print the result.
from_file = pq(filename='test.html')
print(from_file)

二、基本CSS选择器

from pyquery import PyQuery as pq

#

# Selector cheat sheet used below:
#   #name    -> select by id
#   .name    -> select by class
#   tagname  -> select by tag
markup = '''<div>
    <ul id = 'con'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
     </div>'''

tree = pq(markup)
# Combined selector: <li> elements below the node with id "con", which in
# turn sits below a <div>.
items = tree('div #con li')
print(items)

三、查找节点

1、子节点

find() 最常用的方法

from pyquery import PyQuery as pq

#

markup = '''<div>
    <ul id = 'con'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
     </div>'''

tree = pq(markup)
# Start from the outer <div> and look beneath it for <li> elements that carry
# the "active" class (they are nested inside the <ul>).
container = tree('div')
active_items = container.find('li.active')
print(active_items)

children() 查找所有子节点,children('') 查找指定的子节点

from pyquery import PyQuery as pq

#

markup = '''<div>
    <ul id = 'con'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
     </div>'''

tree = pq(markup)
container = tree('div')

# Without an argument: all child nodes of the <div>.
all_children = container.children()
print(all_children)

# With a selector: the intent here is to pick the nodes carrying class "item-0".
item_0_nodes = container.children('#con .item-0')
print(item_0_nodes)

2、父节点

parent() 父节点;parents() 所有祖先节点;parents('') 匹配指定选择器的祖先节点

from pyquery import PyQuery as pq

#

markup = '''<div>
    <ul id = 'con'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
     </div>'''

tree = pq(markup)
# Every <li> below the node with id "con".
items = tree('#con li')

# parent(): the parent node of the <li> elements.
direct_parent = items.parent()
# print(direct_parent)

# parents(): the ancestor nodes (the parent is included).
ancestors = items.parents()
# print(ancestors)

# parents(selector): only the ancestors that match, here the one with id="con".
matching_ancestor = items.parents('#con')
print(matching_ancestor)

3、兄弟节点

siblings() 所有兄弟姊妹节点,siblings('') 含有指定css选择器的兄弟节点

from pyquery import PyQuery as pq

#

markup = '''<div>
    <ul id = 'con'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
     </div>'''

tree = pq(markup)
# The <li> that has both classes "item-0" and "active".
current = tree('#con li.item-0.active')

# siblings() without an argument gives all sibling nodes (the node itself
# is not part of the result):
# print(current.siblings())

# siblings(selector) keeps only the siblings matching the CSS selector.
matching_siblings = current.siblings('.item-1.active')
print(matching_siblings)

四、遍历

1、单个节点

from pyquery import PyQuery as pq

#

markup = '''<div>
    <ul id = 'con'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
     </div>'''

tree = pq(markup)
# This selector matches a single node in the sample markup, so the selection
# can be printed directly without iterating.
single = tree('#con li.item-0.active')
print(single)

2、多个节点

from pyquery import PyQuery as pq

#

markup = '''<div>
    <ul id = 'con'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
     </div>'''

tree = pq(markup)
# The selection holds several nodes; items() hands them out one at a time
# (it is a generator), so each <li> is printed on its own.
all_items = tree('#con li')
for node in all_items.items():
    # end='' suppresses the newline print() would otherwise append.
    print(node, end='')

五、获取信息

1、属性

获取 设置

from pyquery import PyQuery as pq

#

markup = '''<div>
    <ul id = 'con'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
     </div>'''

tree = pq(markup)
link = tree('li.item-0.active a')

# Read: attr(name) fetches the href attribute of the <a> tag.
current_href = link.attr('href')
print(current_href)

# Write: attr(name, value) sets the attribute; read it back to show the change.
link.attr('href', 'oj8k')
updated_href = link.attr('href')
print(updated_href)

2、文本

text() html()

获取 设置

from pyquery import PyQuery as pq

#

markup = '''<div>
    <ul id = 'con'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
     </div>'''

tree = pq(markup)
node = tree('li.item-0.active')

# Getters: called without arguments, text() and html() read the node's
# text and its HTML content.
print(node.text())
print(node.html())

# Setters: called with an argument, they replace the content; each change is
# read back right away to show the effect.
node.text('Hello World')
print(node.text())
node.html('<a>打我</a>')
print(node.html())

注意:与JQuery的区别,pyquery(),  html() 获取的是内部的html,不包含其本身

六、操作DOM节点

1、add_class()和remove_class()

from pyquery import PyQuery as pq

#

html_doc = '''<div>
    <ul id = 'con'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
     </div>'''

doc = pq(html_doc)
# Start from the <li> that carries both classes "item-0" and "active".
li = doc('li.item-0.active')
print(li)

# remove_class() drops the given class from the node's class attribute;
# "item-0" is left untouched.
li.remove_class('active')
print(li)

# add_class() puts the class back, so the node ends up with both classes again.
li.add_class('active')
print(li)

2、remove()

作用:删除节点

from pyquery import PyQuery as pq

#

markup = '''<div>
    <ul id = 'con'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
     </div>'''

tree = pq(markup)
# Show the <li> node before anything is removed.
target = tree('li.item-0.active')
print(target)

# Locate the <a> inside that <li> and delete it, then print the <li> again
# to show what is left.
inner_link = target('a')
inner_link.remove()
print(target)

七、伪类选择器

from pyquery import PyQuery as pq

#

html_doc = '''<div>
    <ul id = 'con'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
     </div>'''

doc = pq(html_doc)

# :first-child -> the <li> that is the first child of its parent.
print(doc('li:first-child'))
# :last-child -> the <li> that is the last child of its parent.
print(doc('li:last-child'))
# :nth-child(2) -> the <li> in the second position.
print(doc('li:nth-child(2)'))
# :gt(2) -> the <li> nodes whose zero-based index is greater than 2.
print(doc('li:gt(2)'))
# :nth-child(2n) -> the <li> nodes in even positions.
print(doc('li:nth-child(2n)'))
# :contains(second) -> the <li> whose text contains "second".
print(doc('li:contains(second)'))
原文地址:https://www.cnblogs.com/wt7018/p/11904944.html