【Python爬虫】PyQuery解析库
PyQuery 是 Python 仿照 jQuery 的严格实现。语法与 jQuery 几乎完全相同。
官方文档:http://pyquery.readthedocs.io/
安装
pip install pyquery
初始化
字符串初始化
html = ''' <div> <ul> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> ''' from pyquery import PyQuery as pq doc = pq(html) print(doc('li'))
<li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li>
URL初始化
from pyquery import PyQuery as pq doc = pq(url='http://www.baidu.com') print(doc('head'))
<head><meta http-equiv="content-type" content="text/html;charset=utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=Edge"/><meta content="always" name="referrer"/><link rel="stylesheet" type="text/css" href="http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css"/><title>ç¾åº¦ä¸ä¸ï¼ä½ å°±ç¥é</title></head>
文件初始化
from pyquery import PyQuery as pq doc = pq(filename='demo.html') print(doc('li'))
<li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li>
基本CSS选择器
html = ''' <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> ''' from pyquery import PyQuery as pq doc = pq(html) print(doc('#container .list li'))
<li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li>
查找元素
子元素
html = ''' <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> ''' from pyquery import PyQuery as pq doc = pq(html) items = doc('.list') print(type(items)) print(items) lis = items.find('li') print(type(lis)) print(lis)
<class 'pyquery.pyquery.PyQuery'> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> <class 'pyquery.pyquery.PyQuery'> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li>
html = ''' <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> ''' from pyquery import PyQuery as pq doc = pq(html) items = doc('.list') lis = items.children() print(type(lis)) print(lis)
<class 'pyquery.pyquery.PyQuery'> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li>
html = ''' <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> ''' from pyquery import PyQuery as pq doc = pq(html) items = doc('.list') lis = items.children('.active') print(lis)
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li>
父元素
html = ''' <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> ''' from pyquery import PyQuery as pq doc = pq(html) items = doc('.list') container = items.parent() print(type(container)) print(container)
<class 'pyquery.pyquery.PyQuery'> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div>
html = ''' <div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div> ''' from pyquery import PyQuery as pq doc = pq(html) items = doc('.list') parents = items.parents() print(type(parents)) print(parents)
<class 'pyquery.pyquery.PyQuery'> <div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div><div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div>
parent = items.parents('.wrap') print(parent)
<div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div>
兄弟元素
html = ''' <div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div> ''' from pyquery import PyQuery as pq doc = pq(html) li = doc('.list .item-0.active') print(li.siblings())
<li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0">first item</li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li>
html = ''' <div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div> ''' from pyquery import PyQuery as pq doc = pq(html) li = doc('.list .item-0.active') print(li.siblings('.active'))
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
遍历
单个元素
html = ''' <div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div> ''' from pyquery import PyQuery as pq doc = pq(html) li = doc('.item-0.active') print(li)
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
html = ''' <div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div> ''' from pyquery import PyQuery as pq doc = pq(html) lis = doc('li').items() print(type(lis)) for li in lis: print(li)
<class 'generator'> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li>
获取信息
获取属性
html = ''' <div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div> ''' from pyquery import PyQuery as pq doc = pq(html) a = doc('.item-0.active a') print(a) print(a.attr('href')) print(a.attr.href)
<a href="link3.html"><span class="bold">third item</span></a> link3.html link3.html
获取文本
html = ''' <div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div> ''' from pyquery import PyQuery as pq doc = pq(html) a = doc('.item-0.active a') print(a) print(a.text())
<a href="link3.html"><span class="bold">third item</span></a> third item
获取HTML
html = ''' <div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div> ''' from pyquery import PyQuery as pq doc = pq(html) li = doc('.item-0.active') print(li) print(li.html())
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <a href="link3.html"><span class="bold">third item</span></a>
DOM操作
addClass、removeClass
html = ''' <div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div> ''' from pyquery import PyQuery as pq doc = pq(html) li = doc('.item-0.active') print(li) li.removeClass('active') print(li) li.addClass('active') print(li)
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
attr、css
html = ''' <div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div> ''' from pyquery import PyQuery as pq doc = pq(html) li = doc('.item-0.active') print(li) li.attr('name', 'link') print(li) li.css('font-size', '14px') print(li)
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-0 active" name="link" style="font-size: 14px"><a href="link3.html"><span class="bold">third item</span></a></li>
remove
html = ''' <div class="wrap"> Hello, World <p>This is a paragraph.</p> </div> ''' from pyquery import PyQuery as pq doc = pq(html) wrap = doc('.wrap') print(wrap.text()) wrap.find('p').remove() print(wrap.text())
Hello, World This is a paragraph. Hello, World
其他DOM方法 http://pyquery.readthedocs.io/en/latest/api.html
伪类选择器
html = ''' <div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div> ''' from pyquery import PyQuery as pq doc = pq(html) li = doc('li:first-child') print(li) li = doc('li:last-child') print(li) li = doc('li:nth-child(2)') print(li) li = doc('li:gt(2)') print(li) li = doc('li:nth-child(2n)') print(li) li = doc('li:contains(second)') print(li)
<li class="item-0">first item</li> <li class="item-0"><a href="link5.html">fifth item</a></li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-1"><a href="link2.html">second item</a></li>
更多CSS选择器可以查看 http://www.w3school.com.cn/css/index.asp