pyquery解析库
pyquery简介
如果你对CSS选择器很熟悉,则可以使用pyquery。pyquery同Beautiful Soup一样,需要传入一个HTML文本来初始化PyQuery对象,而且它的初始化有多种方式,例如直接传入字符串、URL、文件名等。
pyquery基本用法
字符串初始化
html = ''' <div> <ul> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0" active><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1" active><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> ''' from pyquery import PyQuery as pq doc = pq(html) print(doc('li')) 结果 <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0" active=""><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1" active=""><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li>
URL 初始化
from pyquery import PyQuery as pq doc = pq(url='https://cuiqingcai.com') print(doc('title')) 结果 <title>静觅丨崔庆才的个人博客</title>
文件初始化
from pyquery import PyQuery as pq #本地文件,解析内容为HTML字符串 doc = pq(filename='demo.html') print(doc('li'))
CSS选择器用法
基础用法
html = ''' <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> ''' from pyquery import PyQuery as pq #初始化pyquery对象 doc = pq(html) #选取id为container的节点,再取其内部class为list的节点内部所有的li节点 print(doc('#container .list li')) print(type(doc('#container .list li'))) 结果 <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> <class 'pyquery.pyquery.PyQuery'>
节点查找
查找子节点
html = ''' <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> ''' #查找所有子孙节点 from pyquery import PyQuery as pq doc = pq(html) #选取class为list的节点 items = doc('.list') print(type(items)) print(items) #调用find()方法(查找范围为所有子孙节点)传入CSS选择器,选取内部的所有li的节点 lis = items.find('li') print(type(lis)) print(lis) 结果 <class 'pyquery.pyquery.PyQuery'> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> <class 'pyquery.pyquery.PyQuery'> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> #查找子节点 from pyquery import PyQuery as pq doc = pq(html) items = doc('.list') lis = items.children('.active') print(lis) 结果 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li>
查找父节点
父节点:parent()方法获取某个节点的父节点 html = ''' <div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div> ''' #获取直接父节点即不会再查找父节点的父节点 from pyquery import PyQuery as pq doc = pq(html) #选取class为list的节点 items = doc('.list') #获取其父节点且其类型依然为PyQuery类型 container = items.parent() print(type(container)) print(container) 结果 <class 'pyquery.pyquery.PyQuery'> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> #获取祖先节点即所有的父类节点 from pyquery import PyQuery as pq doc = pq(html) #选取class为list的节点 items = doc('.list') #获取所有祖先节点且其类型依然为PyQuery类型,若要筛选具体的节点例如 wrap节点 items.parents('.wrap') parents = items.parents() print(type(parents)) print(parents) 结果 <class 'pyquery.pyquery.PyQuery'> <div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div><div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> 
<li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div>
兄弟节点(同级节点)
#siblings()方法获取兄弟节点 html = ''' <div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div> ''' from pyquery import PyQuery as pq doc = pq(html) #选取class为list的节点内部class为item-0和active的节点,即第三个li节点 li = doc('.list .item-0.active') print(li.siblings()) 结果 <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0">first item</li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> 选取某个具体的兄弟节点,向siblings方法传入CSS选择器 from pyquery import PyQuery as pq doc = pq(html) #选取class为list的节点内部class为item-0和active的节点,即第三个li节点 li = doc('.list .item-0.active') #筛选class为active的节点 print(li.siblings('.active')) 结果 <li class="item-1 active"><a href="link4.html">fourth item</a></li>
遍历(pyquery选择的节点可能是单个节点也可能是多个节点,但是其类型都是PyQuery类型)
html = ''' <div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div> ''' 单个节点直接打印输出或转成字符串 from pyquery import PyQuery as pq doc = pq(html) li = doc('.item-0.active') #直接打印输出 print(li) #节点转成字符串 print(str(li)) 结果 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> 多个节点需要通过items()方法遍历来获取 from pyquery import PyQuery as pq doc = pq(html) #通过调用items()方法循环打印输出 lis = doc('li').items() print(type(lis)) for li in lis: print(li,type(li)) 结果 <class 'generator'> <li class="item-0">first item</li> <class 'pyquery.pyquery.PyQuery'> <li class="item-1"><a href="link2.html">second item</a></li> <class 'pyquery.pyquery.PyQuery'> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <class 'pyquery.pyquery.PyQuery'> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <class 'pyquery.pyquery.PyQuery'> <li class="item-0"><a href="link5.html">fifth item</a></li> <class 'pyquery.pyquery.PyQuery'>
信息获取
html = ''' <div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div> ''' #属性获取 from pyquery import PyQuery as pq doc = pq(html) #获取class为item-0和active的li节点内的a节点,类型为PyQuery a = doc('.item-0.active a') print(a,type(a)) #调用attr()方法传入属性名称获取属性 print(a.attr('href')) #通过调用attr属性来获取属性 print(a.attr.href) 结果 <a href="link3.html"><span class="bold">third item</span></a> <class 'pyquery.pyquery.PyQuery'> link3.html link3.html #获取文本 from pyquery import PyQuery as pq doc = pq(html) #获取class为item-0和active的li节点内的a节点,类型为PyQuery a = doc('.item-0.active a') print('text:') print(a,type(a)) #调用text()方法获取a节点内部文本信息即只返回纯文本内容 print(a.text()) #选中第三个li节点 li = doc('.item-0.active') print('HTML:') print(li) #获取li节点内部的HTML文本,调用html()方法,返回结果为li节点内部的所有HTML文本 print(li.html()) 结果 text: <a href="link3.html"><span class="bold">third item</span></a> <class 'pyquery.pyquery.PyQuery'> third item HTML: <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <a href="link3.html"><span class="bold">third item</span></a> *当选取多个节点时html()与text()方法的差异 from pyquery import PyQuery as pq doc = pq(html) li = doc('li') print(li.html()) print(li.text()) print(type(li.text())) 结果 first item first item second item third item fourth item fifth item <class 'str'> *选中结果有多个li节点时,html()方法只返回第一个li节点内部的HTML文本,text()方法返回所有li节点内部的纯文本(以空格分隔合并为一个字符串)
节点操作(addClass()与removeClass()动态改变节点class属性)
addClass()与removeClass()动态改变节点class属性 html = ''' <div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div> ''' #addClass()与removeClass()动态改变节点class属性 from pyquery import PyQuery as pq doc = pq(html) li = doc('.item-0.active') print(li) #调用removeClass()方法将li节点的active这个class移除 li.removeClass('active') print(li) #调用addClass()方法将class添加回来 li.addClass('active') print(li) 结果 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
attr、text、html改变节点属性以及节点内部的内容
html=''' <ul class='list'> <li class="item-0 active"><a href="link3.html"><span class="blod">third item</span></a></li> </ul>''' from pyquery import PyQuery as pq doc = pq(html) li = doc('.item-0.active') print(li) #attr()方法修改属性,第一个参数为属性名第二个参数为属性值 li.attr('name','link') print(li) #text()改变节点内容 li.text('changed item') print(li) #html()方法改变节点内容 li.html('<span>changed item</span>') print(li) 结果 <li class="item-0 active"><a href="link3.html"><span class="blod">third item</span></a></li> <li class="item-0 active" name="link"><a href="link3.html"><span class="blod">third item</span></a></li> <li class="item-0 active" name="link">changed item</li> <li class="item-0 active" name="link"><span>changed item</span></li>
节点文本内容移除(remove()方法)
html = ''' <div class="warp"> Hello World <p>This is a paragraph.</p> </div> ''' from pyquery import PyQuery as pq doc = pq(html) warp = doc('.warp') print(warp.text()) #移除p节点本身(连同其内部文本),这样再取文本时就只剩下Hello World warp.find('p').remove() print(warp.text()) 结果 Hello World This is a paragraph. Hello World