pyquery解析库
语法和 jQuery 几乎一致
安装
conda install pyquery
一、初始化
标准用法
# Standard usage: download the page with requests, then parse it with pyquery.
from pyquery import PyQuery as pq
import requests

# An explicit timeout keeps the example from hanging on a stalled connection.
r = requests.get(url='http://www.baidu.com', timeout=10)
html_doc = pq(r.text)
print(html_doc)
# Select every <a> element inside the element whose id is "u1".
print(html_doc('#u1 a'))
1、字符串初始化(最常用)
# Initialise a PyQuery document directly from an HTML string.
from pyquery import PyQuery as pq

html_doc = '''<div>
    <ul id = 'haha'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>'''

doc = pq(html_doc)
print(doc)
print(type(doc))
2、url初始化
# Initialise a PyQuery document from a URL; pyquery fetches the page itself.
from pyquery import PyQuery as pq

html_doc = pq(url='http://www.baidu.com')
print(html_doc)
# Select every <a> element inside the element whose id is "u1".
print(html_doc('#u1 a'))
注意:一般先通过 requests 模块或 urllib 获取网页的 HTML,再交给解析模块去解析
3、文件初始化
# Initialise a PyQuery document from a local HTML file.
from pyquery import PyQuery as pq

doc = pq(filename='test.html')
print(doc)
二、基本CSS选择器
# Basic CSS selectors: call the document with a selector string.
from pyquery import PyQuery as pq

html_doc = '''<div>
    <ul id = 'con'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>'''

doc = pq(html_doc)
li_list = doc('div #con li')
print(li_list)

# Selector prefixes:
#   id    -> #
#   class -> .
#   tag   -> tagname
三、查找节点
1、子节点
find() 最常用的方法
# find(): search the descendants of the current selection with a CSS selector.
from pyquery import PyQuery as pq

html_doc = '''<div>
    <ul id = 'con'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>'''

doc = pq(html_doc)
div = doc('div')
li_list = div.find('li.active')
print(li_list)
children() 查找所有子节点,children('') 查找指定的子节点
# children(): all child nodes, or only those selected by the given selector.
from pyquery import PyQuery as pq

html_doc = '''<div>
    <ul id = 'con'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>'''

doc = pq(html_doc)
div = doc('div')

# Find all child nodes.
children = div.children()
print(children)

# Find the nodes that carry the item-0 class.
li_item_0 = div.children('#con .item-0')
print(li_item_0)
2、父节点
parent() 父节点 parents() 祖节点 parents('') 含有某些选择器祖节点
# parent() / parents(): walk upwards from the current selection.
from pyquery import PyQuery as pq

html_doc = '''<div>
    <ul id = 'con'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>'''

doc = pq(html_doc)

# All li nodes.
li_list = doc('#con li')

# Direct parent of the li nodes.
ul = li_list.parent()
# print(ul)

# All ancestors (the direct parent included).
divs = li_list.parents()
# print(divs)

# Only the ancestors matching id="con".
con_ancestor = li_list.parents('#con')
print(con_ancestor)
3、兄弟节点
siblings() 所有兄弟姊妹节点,siblings('') 含有指定css选择器的兄弟节点
# siblings(): sibling nodes of the current selection, optionally filtered.
from pyquery import PyQuery as pq

html_doc = '''<div>
    <ul id = 'con'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>'''

doc = pq(html_doc)

# The node with class="item-0 active".
li = doc('#con li.item-0.active')

# All siblings (the node itself excluded).
# print(li.siblings())

# Only the siblings matching the given CSS selector.
print(li.siblings('.item-1.active'))
四、遍历
1、单个节点
# A selection that matches a single node can be printed directly.
from pyquery import PyQuery as pq

html_doc = '''<div>
    <ul id = 'con'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>'''

doc = pq(html_doc)

# Single node.
li = doc('#con li.item-0.active')
print(li)
2、多个节点
# A selection that matches several nodes is traversed with items().
from pyquery import PyQuery as pq

html_doc = '''<div>
    <ul id = 'con'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>'''

doc = pq(html_doc)

# Multiple nodes: items() returns a generator over the matched nodes.
li_lst = doc('#con li')
for li in li_lst.items():
    print(li, end='')
五、获取信息
1、属性
获取 设置
# attr(): read an attribute with one argument, set it with two.
from pyquery import PyQuery as pq

html_doc = '''<div>
    <ul id = 'con'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>'''

doc = pq(html_doc)

# Get the href attribute of the <a> tag.
a = doc('li.item-0.active a')
print(a.attr('href'))

# Set the attribute, then read it back.
a.attr('href', 'oj8k')
print(a.attr('href'))
2、文本
text() html()
获取 设置
# text() / html(): read the content without arguments, set it with one argument.
from pyquery import PyQuery as pq

html_doc = '''<div>
    <ul id = 'con'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>'''

doc = pq(html_doc)

# Get the text.
li = doc('li.item-0.active')
print(li.text())

# Get the HTML.
print(li.html())

# Set the text.
li.text('Hello World')
print(li.text())

# Set the HTML.
li.html('<a>打我</a>')
print(li.html())
注意:与JQuery的区别,pyquery(), html() 获取的是内部的html,不包含其本身
六、操作DOM节点
1、add_class()和remove_class()
# add_class() / remove_class(): change the class attribute of the selected node.
from pyquery import PyQuery as pq

html_doc = '''<div>
    <ul id = 'con'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>'''

doc = pq(html_doc)

# Select the li node with class="item-0 active".
li = doc('li.item-0.active')
print(li)

# Remove the "active" class from the node.
li.remove_class('active')
print(li)

# Add the "active" class back.
li.add_class('active')
print(li)
2、remove()
作用:删除节点
# remove(): delete the selected nodes from the document.
from pyquery import PyQuery as pq

html_doc = '''<div>
    <ul id = 'con'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>'''

doc = pq(html_doc)

# Get the li node.
li = doc('li.item-0.active')
print(li)

# Find the <a> node inside it and remove that node.
a = li('a')
a.remove()
print(li)
七、伪类选择器
# Pseudo-class selectors: pick nodes by position or by contained text.
from pyquery import PyQuery as pq

html_doc = '''<div>
    <ul id = 'con'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>'''

doc = pq(html_doc)

# First li node.
print(doc('li:first-child'))

# Last li node.
print(doc('li:last-child'))

# Second li node.
print(doc('li:nth-child(2)'))

# li nodes at even positions.
print(doc('li:nth-child(2n)'))

# li nodes whose zero-based index is greater than 2.
print(doc('li:gt(2)'))

# li nodes whose text contains "second".
print(doc('li:contains(second)'))