爬虫入门【5】PyQuery简介

PyQuery

本文撰写时PyQuery的最新版本是1.3，以下内容均基于该版本进行介绍。
示例主要依据PyQuery的官方文档整理并更新。

from pyquery import PyQuery as pq
from lxml import etree
import urllib

# Attribute access demo: the id attribute of one <p> element is read and
# rewritten through each spelling that p.attr accepts; every print shows the
# value that was just set.
doc=pq('<p id="hello" class="hello"></p>')# basic way to parse a document
p=doc('p')# select the p tag

print(p.attr('id'))# read the value of the p tag's id attribute
p.attr('id','plop')# call form: change the id attribute to plop
print(p.attr('id'))
p.attr.id='snow'# attribute form: change id the Python way
print(p.attr.id)
p.attr['id']='ola'# item form: change id with subscript syntax
print(p.attr['id'])
# keyword form: several attributes at once; class is a Python keyword, so the
# class attribute is spelled class_ here
p.attr(id='hello',class_='hello2')
print(p)

hello
plop
snow
ola
<p id="hello" class="hello2"/>

CSS内容

# Class and inline-style demo; p is the PyQuery object created in the
# attribute example earlier in this article.
p.add_class('toto')# add a class
# print(p)
p.toggle_class('titi toto')# toggle the listed classes
# print(p)
p.remove_class('titi')# remove a class
# print(p)
# Set an inline style, using font-size as the example; the three lines below
# show the attribute, item and dict-assignment spellings in turn.
p.css.font_size='16px'
p.css['font-size']='16px'
p.css={'font-size':'17px'}

Manipulating

# Append a string to the end of the tag's content
p.append(' check out <a href="http://www.baidu.com">百度一下</a>')
#pp()
# Content can also be added at the start of the tag
p.prepend('hello again! ')
#pp()
# The same kind of insertion can target another document or element
d=pq('<html><body><div id="test"><a href="http://python.org">python</a> !</div></body></html>')
p.prepend_to(d('#test'))# put p inside the element with id test in d, at the very beginning
#print(d('#test').html())# print the content of the element with id test

p.insert_after(d('#test'))# place the p tag after the element with id test
#print(d('body').html())
p.insert_before(d('#test'))# insert the p tag before the element with id test
#print(d('body').html())# shows that p now sits before the element with id test

# Remove an element
# NOTE(review): the markup below has a closing </div> with no opening tag;
# presumably the parser tolerates it - confirm before reusing the string.
d=pq('<html><body><p id="id">Yeah!</p><p>python rocks !</p></div></html>')
d.remove('p#id')# removes the <p> element whose id is "id" (the whole element, not just its id attribute)
#print(d('body').html())

# Remove the content of the tags (the tags themselves stay)
d('p').empty()
#print(d('p'))

# Two PyQuery objects can be concatenated with +
print(pq('<div>Yeah !</div>').add_class('myclass') + pq('<b>cool</b>'))

Traversing

# Traversing demo: two sibling <p> elements, each wrapping one <a>, used by
# the commented-out selections below.
d = pq('<p id="hello1" class="test1"><a>1</a></p><p id="hello2" class="test2"><a>2</a></p>')

#print(d('p').filter('.test1'))# filter by class: the p tag whose class is test1
#print(d('p').filter('#hello2'))# filter by id: the p tag whose id is hello2
#print(d('p').eq(0))# the first p tag
#print(d('p').eq(1))# the second p tag

# The string below is the help text of eq(), quoted for reference; it is a
# bare string expression and has no effect when executed.
"""Return PyQuery of only the element with the provided index:
    >>> d = PyQuery('<p class="hello">Hi</p><p>Bye</p><div></div>')
    >>> d('p').eq(0)
    [<p.hello>]
    >>> d('p').eq(1)
    [<p>]
    >>> d('p').eq(2)
    []
"""

# Query nested elements
#print(d('p').find('a'))
#print(d('p').eq(1).find('a'))

API

http://pythonhosted.org/pyquery/api.html
请参考官方文档的介绍。

Scraping

# NOTE: every line below performs a real network request when executed.
# PyQuery can load an HTML document from a URL; by default it uses Python's urllib
print(pq('http://www.baidu.com'))
# If the requests library is installed it can be used instead, and most of
# the requests parameters can be passed through
pq('http://duckduckgo.com/', headers={'user-agent': 'pyquery'})
pq('https://duckduckgo.com/', {'q': 'foo'}, method='post', verify=True)

如果您觉得感兴趣的话，可以添加我的微信公众号：一步一步学Python