pyquery的使用
pyquery的使用
1.安装pyquery
使用pip3 install pyquery
2. 初始化
在使用pyquery库解析HTML文本的时候,需要先将其初始化为一个PyQuery对象
- 字符串初始化
from pyquery import PyQuery as pq
html = '''
<div>
<ul>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
# Build a PyQuery object from the HTML string, then select every <li> node
# with a CSS selector and print the selection.
doc = pq(html)
print(doc('li'))
- URL初始化
from pyquery import PyQuery as pq
# Initialise the PyQuery object from a URL (the `url` keyword) and print the
# <title> node of that page.
doc = pq(url='https://www.baidu.com')
print(doc('title'))
- 文件初始化
from pyquery import PyQuery as pq
# Initialise the PyQuery object from a local HTML file (the `filename`
# keyword) and print its <title> node.
doc = pq(filename='./baidu.html')
print(doc('title'))
3.基本CSS选择器
from pyquery import PyQuery as pq
html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
doc = pq(html)
# Descendant selector: every <li> under a node with class "list" that is
# itself under the node whose id is "container".
lis = doc('#container .list li')
print(lis)
print(type(lis))
# Walk the matched nodes one by one and print each node's text.
# The loop variable must not be called `doc`, otherwise the document object
# is rebound to the last <li> once the loop finishes.
for item in lis.items():
    print(item.text())
上述的代码的作用是寻找id为container的节点下class属性值为list的节点下所有的li节点
然后通过遍历的方式将获取到的li节点中的文本输出
4.查找节点
- 子节点
查找子节点时,需要用到find方法,其参数是CSS选择器
from pyquery import PyQuery as pq
html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
doc = pq(html)
# Select the node whose class is "list".
items = doc('.list')
print(items)
# find() takes a CSS selector and searches all descendants of the selection.
li = items.find('li')
print(li)
上述的代码先选取class属性值为list的节点,然后用find方法找出该节点下的所有li节点
find方法查找的是所有的子孙节点,如果只想查找子节点,使用children
# children() returns only the direct child nodes; `items` is the `.list`
# selection created in the find() example.
li = items.children()
print(li)
如果要对筛选出来的子节点根据属性值进行筛选
# Passing a CSS selector to children() keeps only the child nodes that match
# it (here: children carrying the "active" class).
li = items.children('.active')
print(li)
- 父节点
使用parent方法来获取某个节点的父节点
from pyquery import PyQuery as pq
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
doc = pq(html)
item = doc('.list')
# parent() returns the direct parent of the selected node.
parent = item.parent()
print(parent)
上述代码就是查找class属性值为list的节点的父节点
parent获得节点是直接父节点
如果要获取某个祖先节点,使用parents
from pyquery import PyQuery as pq
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
doc = pq(html)
item = doc('.list')
# parents() returns the ancestor nodes of the selection, not just the
# direct parent.
parent = item.parents()
print(parent)
如果想要筛选某个祖先节点,可以向parents方法传入CSS选择器
from pyquery import PyQuery as pq
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
doc = pq(html)
item = doc('.list')
# Passing a CSS selector to parents() keeps only the ancestors that match it
# (here: the ancestor with class "wrap").
parent = item.parents('.wrap')
print(parent)
- 兄弟节点
from pyquery import PyQuery as pq
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
doc = pq(html)
item = doc('.list .item-0.active') # select the li node whose class value is "item-0 active"
print(item.siblings()) # look up the sibling nodes of the node selected above
------
doc = pq(html)
item = doc('.list .item-0.active')
# NOTE(review): '.item-0 a' targets <a> elements, which are not siblings of
# the selected <li>; a selector such as '.active' may have been intended -
# confirm against the expected output.
print(item.siblings('.item-0 a')) # to pick specific siblings, pass a CSS selector to siblings()
5.遍历节点
- pyquery库的选择结果可能是多个节点,也可能是单个节点,类型都是PyQuery类型的,但是返回的结果不是列表形式的
- 如果是单个节点,就直接可以输出,也可以转变成字符串格式
from pyquery import PyQuery as pq
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
doc = pq(html)
# Only one node carries both the "item-0" and "active" classes, so the
# selection can be printed directly.
li = doc('.item-0.active')
print(type(li))
print(li)
# Several li nodes have "item-0" in their class attribute.
li = doc('.item-0')
print(type(li))
print(li)
# The selection holds more than one node, so call items() to visit them one
# at a time. A separate loop name keeps `li` bound to the full selection
# instead of being overwritten by the last node.
for item in li.items():
    print(item)
- 获取信息
获取信息主要是获取属性和获取文本
获取属性:使用attr方法获取
from pyquery import PyQuery as pq
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
doc = pq(html)
a = doc('.item-0.active a')
# Read the href attribute, first with a method call and then with
# attribute-style access.
print(a.attr('href'))
print(a.attr.href)
b = doc('.list li')
# When several nodes are selected, attr() returns a single result: the
# attribute value of the first node. Iterate to read every node's value.
print(b.attr('class'))
for bb in b.items():
    # `class_` is the attribute-style spelling used for the `class` attribute.
    print(bb.attr.class_)
获取文本:获取节点后的主要操作就是获取文本,使用text方法来获取
from pyquery import PyQuery as pq
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
doc = pq(html)
c = doc('.item-1.active a')
print(c.text())  # text of the <a> under the node with class "item-1 active"
d = doc('.item-0.active')
print(d.html())  # inner HTML of the node with class "item-0 active"
e = doc('li a')
print(e.text())  # with several nodes selected, text() outputs the text of all of them at once
f = doc('ul')
# html() is printed per node here: when the selection holds several nodes,
# a loop is needed to output the inner HTML of each one.
for ff in f.items():
    print(ff.html())
6.节点操作
- addClass和removeClass
from pyquery import PyQuery as pq
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
doc = pq(html)
# Select the li node that has both the "item-0" and "active" classes.
li = doc('.item-0.active')
print(li)
print(li.removeClass('active')) # remove the "active" class value
print(li.addClass('active')) # add the "active" class value back
- attr, text,html
除了使用addClass和removeClass对属性进行操作外,使用attr方法也可以对属性进行操作,此外text和html方法可以改变节点内部的内容
from pyquery import PyQuery as pq
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
# Each call below modifies the selected node; the node is printed after every
# step so the effect of that step is visible.
doc = pq(html)
li = doc('.item-1.active') # select one li node
print(li)
li.attr('class', 'aaaa') # change the value of the class attribute
print(li)
li.attr('name', 'bbbb') # add a new attribute called "name" together with its value
print(li)
li.text('cccc') # after passing text, the li node's inner text is replaced by the given string
print(li)
li.html('<span>dddd</span>') # after passing HTML, the li node's content becomes the given HTML
print(li)
attr方法只传入一个参数,表示获取这个属性的值
attr方法传入两个参数,则表示修改属性值,如果没有这个属性会新增这个属性
text和html方法如果不传入参数,表示的是获取节点内的纯文本和HTML文本,如果传入参数,则表示进行赋值
- remove
remove方法的作用是移除
from pyquery import PyQuery as pq
html = '''
<div class="wrap">
Hello, World
<p>This is a paragraph.</p>
</div>
'''
doc = pq(html)
wrap = doc('.wrap')
print(wrap.text()) # the printed text includes the text inside the p node
wrap.find('p').remove() # select the p node first, then call remove() to delete it
# Printed again after the removal to show the text without the p node.
print(wrap.text())
7.伪类选择器
使用的是css3伪类选择器
from pyquery import PyQuery as pq
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
# Each selection below uses a different pseudo-class selector on the same
# document and prints what it matched.
doc = pq(html)
li = doc('li:first-child') # select the first li node
print(li)
li = doc('li:last-child') # select the last li node
print(li)
li = doc('li:nth-child(2)') # select the second li node
print(li)
li = doc('li:gt(1)') # select all li nodes after the second one
print(li)
li = doc('li:nth-child(2n)') # select the li nodes at even positions
print(li)
li = doc('li:contains(second)') # select the li nodes containing the text "second"
print(li)