pyquery解析库

pyquery简介

  如果你对CSS选择器很熟悉,则可以使用pyquery。PyQuery与Beautiful Soup一样,需要传入一个HTML文本来初始化PyQuery对象,而且它的初始化有多种方式,例如直接传入字符串、URL、文件名等。

pyquery基本用法

字符串初始化

html = '''
<div>
<ul>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0" active><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1" active><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq

doc = pq(html)
print(doc('li'))

结果
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0" active=""><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1" active=""><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
View Code

URL 初始化

from pyquery import PyQuery as pq

doc = pq(url='https://cuiqingcai.com')
print(doc('title'))

结果
<title>静觅丨崔庆才的个人博客</title>&#13;
View Code

文件初始化

from pyquery import PyQuery as pq
#本地文件,解析内容为HTML字符串
doc = pq(filename='demo.html')
print(doc('li'))
View Code

CSS选择器用法

基础用法

html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''

from pyquery import PyQuery as pq

#初始化pyquery对象
doc = pq(html)
#选取id为container的节点,再取其内部class为list的节点内部所有的li节点
print(doc('#container .list li'))
print(type(doc('#container .list li')))

结果
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<class 'pyquery.pyquery.PyQuery'>
View Code

节点查找

查找子节点

html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
#查找所有子孙节点
from pyquery import PyQuery as pq

doc = pq(html)
#选取class为list的节点
items = doc('.list')
print(type(items))
print(items)
#调用find()方法(查找范围为所有子孙节点)传入CSS选择器,选取内部的所有li的节点
lis = items.find('li')
print(type(lis))
print(lis)

结果
<class 'pyquery.pyquery.PyQuery'>
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
<class 'pyquery.pyquery.PyQuery'>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>

#查找子节点
from pyquery import PyQuery as pq

doc = pq(html)
items = doc('.list')
lis = items.children('.active')
print(lis)

结果
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
View Code

查找父节点

父节点:parent()方法获取某个节点的父节点
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''

#获取直接父节点即不会再查找父节点的父节点
from pyquery import PyQuery as pq

doc = pq(html)
#选取class为list的节点
items = doc('.list')
#获取其父节点且其类型依然为PyQuery类型
container = items.parent()
print(type(container))
print(container)

结果
<class 'pyquery.pyquery.PyQuery'>
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>

#获取祖先节点即所有的父类节点
from pyquery import PyQuery as pq

doc = pq(html)
#选取class为list的节点
items = doc('.list')
#获取所有祖先节点且其类型依然为PyQuery类型,若要筛选具体的节点,例如class为wrap的节点,可传入CSS选择器:items.parents('.wrap')
parents = items.parents()
print(type(parents))
print(parents)

结果
<class 'pyquery.pyquery.PyQuery'>
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div><div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
View Code

兄弟节点(同级节点)

#siblings()方法获取兄弟节点
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''

from pyquery import PyQuery as pq

doc = pq(html)
#选取class为list的节点内部class为item-0和active的节点,即第三个li节点
li = doc('.list .item-0.active')
print(li.siblings())

结果
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0">first item</li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>

选取某个具体的兄弟节点,向siblings方法传入CSS选择器

from pyquery import PyQuery as pq

doc = pq(html)
#选取class为list的节点内部class为item-0和active的节点,即第三个li节点
li = doc('.list .item-0.active')
#筛选class为active的节点
print(li.siblings('.active'))

结果
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
View Code

遍历(pyquery选择的节点可能是单个节点也可能是多个节点,但是其类型都是PyQuery类型)

html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''


单个节点直接打印输出或转成字符串
from pyquery import PyQuery as pq

doc = pq(html)
li = doc('.item-0.active')
#直接打印输出
print(li)
#节点转成字符串
print(str(li))

结果
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>

<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>

多个节点需要通过items()方法遍历来获取
from pyquery import PyQuery as pq

doc = pq(html)
#通过调用items()方法循环打印输出
lis = doc('li').items()
print(type(lis))
for li in lis:
    print(li,type(li))

结果
<class 'generator'>
<li class="item-0">first item</li>
 <class 'pyquery.pyquery.PyQuery'>
<li class="item-1"><a href="link2.html">second item</a></li>
 <class 'pyquery.pyquery.PyQuery'>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
 <class 'pyquery.pyquery.PyQuery'>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
 <class 'pyquery.pyquery.PyQuery'>
<li class="item-0"><a href="link5.html">fifth item</a></li>
 <class 'pyquery.pyquery.PyQuery'>
View Code

信息获取

html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''

#属性获取
from pyquery import PyQuery as pq

doc = pq(html)
#获取class为item-0和active的li节点内的a节点,类型为PyQuery
a = doc('.item-0.active a')
print(a,type(a))
#调用attr()方法传入属性名称获取属性
print(a.attr('href'))
#通过调用attr属性来获取属性
print(a.attr.href)

结果
<a href="link3.html"><span class="bold">third item</span></a> <class 'pyquery.pyquery.PyQuery'>
link3.html
link3.html

#获取文本
from pyquery import PyQuery as pq

doc = pq(html)
#获取class为item-0和active的li节点内的a节点,类型为PyQuery
a = doc('.item-0.active a')
print('text:')
print(a,type(a))
#调用text()方法获取a节点内部文本信息即只返回纯文本内容
print(a.text())
#选中第三个li节点
li = doc('.item-0.active')
print('HTML:')
print(li)
#获取li节点内部的HTML文本,调用html()方法,返回结果为li节点内部的所有HTML文本
print(li.html())

结果
text:
<a href="link3.html"><span class="bold">third item</span></a> <class 'pyquery.pyquery.PyQuery'>
third item
HTML:
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>

<a href="link3.html"><span class="bold">third item</span></a>

*当选取多个节点时html()与text()方法的差异
from pyquery import PyQuery as pq

doc = pq(html)
li = doc('li')
print(li.html())
print(li.text())
print(type(li.text()))

结果
first item
first item second item third item fourth item fifth item
<class 'str'>

*选中结果有多个li节点时,html()方法只返回第一个li节点内部的HTML文本,text()方法返回所有li节点内部的纯文本(以空格分隔合并成一个字符串)
View Code

节点操作(addClass()与removeClass()动态改变节点class属性)

addClass()与removeClass()动态改变节点class属性

html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''



#addClass()与removeClass()动态改变节点class属性
from pyquery import PyQuery as pq

doc = pq(html)
li = doc('.item-0.active')
print(li)
#调用removeClass()方法将li节点的active这个class移除
li.removeClass('active')
print(li)
#调用addClass()方法将class添加回来
li.addClass('active')
print(li)

结果
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
View Code

attr、text、html改变节点属性以及节点内部的内容

html='''
<ul class='list'>
<li class="item-0 active"><a href="link3.html"><span class="blod">third item</span></a></li>
</ul>'''

from pyquery import PyQuery as pq

doc = pq(html)
li = doc('.item-0.active')
print(li)
#attr()方法修改属性,第一个参数为属性名第二个参数为属性值
li.attr('name','link')
print(li)
#text()改变节点内容
li.text('changed item')
print(li)
#html()方法改变节点内容
li.html('<span>changed item</span>')
print(li)

结果
<li class="item-0 active"><a href="link3.html"><span class="blod">third item</span></a></li>
<li class="item-0 active" name="link"><a href="link3.html"><span class="blod">third item</span></a></li>
<li class="item-0 active" name="link">changed item</li>
<li class="item-0 active" name="link"><span>changed item</span></li>
View Code

节点文本内容移除(remove()方法)

html = '''
<div class="warp">
Hello World
<p>This is a paragraph.</p>
</div>
'''

from pyquery import PyQuery as pq

doc = pq(html)
warp = doc('.warp')
print(warp.text())
#移除p节点(连同其内部的文本)
warp.find('p').remove()
print(warp.text())

结果
Hello World
This is a paragraph.
Hello World
View Code

 

 

 

 

posted @ 2018-12-25 21:38  Coolc  阅读(93)  评论(0)  编辑  收藏  举报