爬虫--PyQuery
什么是PyQuery?
PyQuery 是一个 Python 的 HTML/XML 解析库，提供类似 jQuery 的 API，可以用 CSS 选择器查找、遍历和修改文档中的元素。
PyQuery
初始化
字符串初始化
from pyquery import PyQuery as pq

# Sample document that is parsed straight from an in-memory string.
# NOTE(review): the href of the third <a> tag is missing its closing quote,
# which is why the printed result shows a mangled <span> attribute for that
# item.
markup = """ <div> <ul> <li class="item-0">first item</li> <li class="item-1"> <a href="link2.html">second item</a> </li> <li class="item-0 active"> <a href="link3.html> <span class="bold">third item</span> </a> </li> <li class="item-1 active"> <a href="link4.html">f ourth item </a> </li> <li class="item-0"> <a href="link5.html"> fifth item </a> </li> </ul> </div> """

# Build a PyQuery document from the string, then call it with a CSS selector
# to pick every <li> element.
document = pq(markup)
print(document("li"))
<li class="item-0">first item</li> <li class="item-1"> <a href="link2.html">second item</a> </li> <li class="item-0 active"> <a href="link3.html> <span class=" bold="">third item </a> </li> <li class="item-1 active"> <a href="link4.html">f ourth item </a> </li> <li class="item-0"> <a href="link5.html"> fifth item </a> </li>
URL初始化
from pyquery import PyQuery as pq

# The url= keyword makes PyQuery fetch the page and parse the response.
# NOTE(review): no encoding is passed here; presumably that is why the
# <title> text in the printed result is garbled - confirm.
page = pq(url="http://www.baidu.com")

# Select the <head> element of the downloaded page.
print(page("head"))
<head><meta http-equiv="content-type" content="text/html;charset=utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=Edge"/><meta content="always" name="referrer"/><link rel="stylesheet" type="text/css" href="http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css"/><title>ç¾åº¦ä¸ä¸ï¼ä½ å°±ç¥é</title></head>
文件初始化
from pyquery import PyQuery as pq

# The filename= keyword loads the HTML from a local file; "demo.html" is a
# relative path, so the file has to be reachable from the working directory.
local_doc = pq(filename="demo.html")

# Select every <li> element in the file.
print(local_doc("li"))
基本CSS选择器
from pyquery import PyQuery as pq

markup = """ <div id = "container"> <ul> <li class="item-0"> first item </li> <li class="item-1"> <a href="link2.html">second item</a> </li> <li class="item-2 active"> <a href="link3.html> <span class="bold">third item</span> </a> </li> <li class="item-3 active"> <a href="link4.html">f ourth item </a> </li> <li class="item-4"> <a href="link5.html"> fifth item </a> </li> </ul> </div> """

document = pq(markup)

# Descendant selector: elements with class "item-0" that live somewhere
# inside the element whose id is "container".
print(document("#container .item-0"))
<li class="item-0"> first item </li>
查找元素
子元素
from pyquery import PyQuery as pq

# NOTE(review): class="list> on the <ul> is missing its closing quote. The
# printed result accordingly lacks the item-0 <li>. The search below also
# starts from the whole document rather than from the ".list" selection,
# which is presumably what this example meant to demonstrate - confirm
# before fixing, because the recorded output depends on both details.
markup = """ <div id = "container"> <ul class="list> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-2 active"><a href="link3.html><span class="bold">third item</span></a></li> <li class="item-3 active"><a href="link4.html">fourth item</a></li> <li class="item-4"><a href="link5.html">fifth item</a></li> </ul> </div> """

document = pq(markup)

# A selection is itself a PyQuery object.
list_node = document(".list")
print(type(list_node))

# find() searches all descendants for elements matching the selector and
# again returns a PyQuery object.
li_nodes = document.find("li")
print(type(li_nodes))
print(li_nodes)
<class 'pyquery.pyquery.PyQuery'> <class 'pyquery.pyquery.PyQuery'> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-2 active"><a href="link3.html><span class=" bold="">third item</a></li> <li class="item-3 active"><a href="link4.html">fourth item</a></li> <li class="item-4"><a href="link5.html">fifth item</a></li>
from pyquery import PyQuery as pq

markup = """ <div id = "container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-2 active"><a href="link3.html><span class="bold">third item</span></a></li> <li class="item-3 active"><a href="link4.html">fourth item</a></li> <li class="item-4"><a href="link5.html">fifth item</a></li> </ul> </div> """

document = pq(markup)
list_node = document(".list")

# children() returns every direct child of the selection.
direct_children = list_node.children()

# Passing a selector keeps only the direct children that match it.
active_children = list_node.children(".active")

print(direct_children)
print(active_children)
<li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-2 active"><a href="link3.html><span class=" bold="">third item</a></li> <li class="item-3 active"><a href="link4.html">fourth item</a></li> <li class="item-4"><a href="link5.html">fifth item</a></li> <li class="item-2 active"><a href="link3.html><span class=" bold="">third item</a></li> <li class="item-3 active"><a href="link4.html">fourth item</a></li>
父元素
from pyquery import PyQuery as pq

markup = """ <div id = "container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-2 active"><a href="link3.html><span class="bold">third item</span></a></li> <li class="item-3 active"><a href="link4.html">fourth item</a></li> <li class="item-4"><a href="link5.html">fifth item</a></li> </ul> </div> """

document = pq(markup)
list_node = document(".list")

# parent() returns the direct parent of the selection, again as a PyQuery
# object.
direct_parent = list_node.parent()
print(direct_parent)
print(type(direct_parent))
<div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-2 active"><a href="link3.html><span class=" bold="">third item</a></li> <li class="item-3 active"><a href="link4.html">fourth item</a></li> <li class="item-4"><a href="link5.html">fifth item</a></li> </ul> </div> <class 'pyquery.pyquery.PyQuery'>
from pyquery import PyQuery as pq

markup = """ <div class = "wrap"> <div id = "container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-2 active"><a href="link3.html><span class="bold">third item</span></a></li> <li class="item-3 active"><a href="link4.html">fourth item</a></li> <li class="item-4"><a href="link5.html">fifth item</a></li> </ul> </div> <div> """

document = pq(markup)
list_node = document(".list")

# parents() collects every ancestor of the selection, not just the direct
# parent; the printed result therefore contains each enclosing element in
# turn.
ancestors = list_node.parents()
print(type(ancestors))
print(ancestors)
<class 'pyquery.pyquery.PyQuery'> <html><body><div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-2 active"><a href="link3.html><span class=" bold="">third item</a></li> <li class="item-3 active"><a href="link4.html">fourth item</a></li> <li class="item-4"><a href="link5.html">fifth item</a></li> </ul> </div> <div> </div></div></body></html><body><div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-2 active"><a href="link3.html><span class=" bold="">third item</a></li> <li class="item-3 active"><a href="link4.html">fourth item</a></li> <li class="item-4"><a href="link5.html">fifth item</a></li> </ul> </div> <div> </div></div></body><div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-2 active"><a href="link3.html><span class=" bold="">third item</a></li> <li class="item-3 active"><a href="link4.html">fourth item</a></li> <li class="item-4"><a href="link5.html">fifth item</a></li> </ul> </div> <div> </div></div><div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-2 active"><a href="link3.html><span class=" bold="">third item</a></li> <li class="item-3 active"><a href="link4.html">fourth item</a></li> <li class="item-4"><a href="link5.html">fifth item</a></li> </ul> </div>
兄弟元素
from pyquery import PyQuery as pq

markup = """ <div class = "wrap"> <div id = "container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-2 active"><a href="link3.html><span class="bold">third item</span></a></li> <li class="item-3 active"><a href="link4.html">fourth item</a></li> <li class="item-4"><a href="link5.html">fifth item</a></li> </ul> </div> <div> """

document = pq(markup)
first_item = document(".item-0")

# siblings() returns the elements that share a parent with the selection,
# excluding the selection itself.
neighbours = first_item.siblings()
print(type(neighbours))
print(neighbours)
<class 'pyquery.pyquery.PyQuery'> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-2 active"><a href="link3.html><span class=" bold="">third item</a></li> <li class="item-3 active"><a href="link4.html">fourth item</a></li> <li class="item-4"><a href="link5.html">fifth item</a></li>
from pyquery import PyQuery as pq

html = """ <div class = "wrap"> <div id = "container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-2 active"><a href="link3.html><span class="bold">third item</span></a></li> <li class="item-3 active"><a href="link4.html">fourth item</a></li> <li class="item-4"><a href="link5.html">fifth item</a></li> </ul> </div> <div> """

doc = pq(html)

# Compound class selector: matches only elements that carry BOTH the
# "item-0" and the "active" class. The two classes must be joined with a
# dot; without it the selector would look for a single class named
# "item-0active". No <li> in this markup has both classes, so the selection
# is empty and nothing is printed for it.
items = doc(".item-0.active")

# Plain class selector: matches the "item-0" element.
item = doc(".item-0")

# Siblings of the matched "item-0" element.
print(item.siblings())

# Calling siblings() on an empty selection still yields a PyQuery object,
# which prints as an empty line.
print(type(items.siblings()))
print(items.siblings())
<li class="item-1"><a href="link2.html">second item</a></li> <li class="item-2 active"><a href="link3.html><span class=" bold="">third item</a></li> <li class="item-3 active"><a href="link4.html">fourth item</a></li> <li class="item-4"><a href="link5.html">fifth item</a></li> <class 'pyquery.pyquery.PyQuery'>
遍历
单个元素
from pyquery import PyQuery as pq

markup = """ <div class = "wrap"> <div id = "container"> <ul class="list"> <li class="item-0 active">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-2 active"><a href="link3.html><span class="bold">third item</span></a></li> <li class="item-3 active"><a href="link4.html">fourth item</a></li> <li class="item-4"><a href="link5.html">fifth item</a></li> </ul> </div> <div> """

document = pq(markup)

# Compound class selector: an element is selected only when it carries both
# the "item-0" and the "active" class.
matched = document(".item-0.active")
print(type(matched))
print(matched)
<class 'pyquery.pyquery.PyQuery'> <li class="item-0 active">first item</li>
from pyquery import PyQuery as pq

markup = """ <div class = "wrap"> <div id = "container"> <ul class="list"> <li class="item-0 active">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-2 active"><a href="link3.html><span class="bold">third item</span></a></li> <li class="item-3 active"><a href="link4.html">fourth item</a></li> <li class="item-4"><a href="link5.html">fifth item</a></li> </ul> </div> <div> """

document = pq(markup)

# items() returns a generator, so the matched elements are produced one at a
# time while looping.
li_iter = document("li").items()
print(type(li_iter))

for element in li_iter:
    print(element)
<class 'generator'> <li class="item-0 active">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-2 active"><a href="link3.html><span class=" bold="">third item</a></li> <li class="item-3 active"><a href="link4.html">fourth item</a></li> <li class="item-4"><a href="link5.html">fifth item</a></li>
获取信息
获取属性
from pyquery import PyQuery as pq

markup = """ <div class = "wrap"> <div id = "container"> <ul class="list"> <li class="item-0 active">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-2 active"><a href="link3.html><span class="bold">third item</span></a></li> <li class="item-3 active"><a href="link4.html">fourth item</a></li> <li class="item-4"><a href="link5.html">fifth item</a></li> </ul> </div> <div> """

document = pq(markup)

# The <a> element nested inside the element with class "item-1".
link = document(".item-1 a")

# Two equivalent ways to read an attribute: call attr() with the attribute
# name, or use attribute-style access on attr.
print(link.attr("href"))
print(link.attr.href)
link2.html
link2.html
获取文本
from pyquery import PyQuery as pq

markup = """ <div class = "wrap"> <div id = "container"> <ul class="list"> <li class="item-0 active">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-2 active"><a href="link3.html><span class="bold">third item</span></a></li> <li class="item-3 active"><a href="link4.html">fourth item</a></li> <li class="item-4"><a href="link5.html">fifth item</a></li> </ul> </div> <div> """

document = pq(markup)
second_item = document(".item-1")

# text() returns the text content of the selection with the tags stripped.
print(second_item.text())
second item
获取html
from pyquery import PyQuery as pq

markup = """ <div class = "wrap"> <div id = "container"> <ul class="list"> <li class="item-0 active">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-2 active"><a href="link3.html><span class="bold">third item</span></a></li> <li class="item-3 active"><a href="link4.html">fourth item</a></li> <li class="item-4"><a href="link5.html">fifth item</a></li> </ul> </div> <div> """

document = pq(markup)
second_item = document(".item-1")

# Printing the selection shows the element itself, including its own tag.
print(second_item)

# html() returns only the markup inside the element.
print(second_item.html())
<li class="item-1"><a href="link2.html">second item</a></li> <a href="link2.html">second item</a>
DOM操作
addClass , removeClass
from pyquery import PyQuery as pq

markup = """ <div class = "wrap"> <div id = "container"> <ul class="list"> <li class="item-0 active">first item</li> <li class="item-1 active"><a href="link2.html">second item</a></li> <li class="item-2 active"><a href="link3.html><span class="bold">third item</span></a></li> <li class="item-3 active"><a href="link4.html">fourth item</a></li> <li class="item-4"><a href="link5.html">fifth item</a></li> </ul> </div> <div> """

document = pq(markup)
second_item = document(".item-1")

# Drop the "active" class from the element and show the result.
without_active = second_item.remove_class("active")
print(without_active)

# Add a new class "actives" to the same element and show the result.
with_actives = second_item.add_class("actives")
print(with_actives)
<li class="item-1"><a href="link2.html">second item</a></li> <li class="item-1 actives"><a href="link2.html">second item</a></li>
attr , css
from pyquery import PyQuery as pq

markup = """ <div class = "wrap"> <div id = "container"> <ul class="list"> <li class="item-0 active">first item</li> <li class="item-1 active"><a href="link2.html">second item</a></li> <li class="item-2 active"><a href="link3.html><span class="bold">third item</span></a></li> <li class="item-3 active"><a href="link4.html">fourth item</a></li> <li class="item-4"><a href="link5.html">fifth item</a></li> </ul> </div> <div> """

document = pq(markup)
second_item = document(".item-1")

# attr() with two arguments sets the attribute (name="names") on the element.
with_name = second_item.attr("name", "names")
print(with_name)

# css() with two arguments writes the property into the element's style
# attribute.
with_style = second_item.css("font-size", "14px")
print(with_style)
<li class="item-1 active" name="names"><a href="link2.html">second item</a></li> <li class="item-1 active" name="names" style="font-size: 14px"><a href="link2.html">second item</a></li>
remove
from pyquery import PyQuery as pq

markup = """ <div class = "wrap"> <div id = "container"> <ul class="list"> <li class="item-0 active">first item</li> <li class="item-1 active"><a href="link2.html">second item<p>Third times</p></a></li> <li class="item-2 active"><a href="link3.html><span class="bold">third item</span></a></li> <li class="item-3 active"><a href="link4.html">fourth item</a></li> <li class="item-4"><a href="link5.html">fifth item</a></li> </ul> </div> <div> """

document = pq(markup)
second_item = document(".item-1")

# Text before the removal: it still includes the nested <p> content.
print(second_item.text())
print("---------------")

# Remove the nested <p> element from the document, then show the text again.
nested_paragraph = second_item.find("p")
nested_paragraph.remove()
print(second_item.text())
second item Third times --------------- second item
其他DOM方法
https://pythonhosted.org/pyquery/api.html
伪类选择器
from pyquery import PyQuery as pq

markup = """ <div class = "wrap"> <div id = "container"> <ul class="list"> <li class="item-0 active">first item</li> <li class="item-1 active"><a href="link2.html">second item<p>Third times</p></a></li> <li class="item-2 active"><a href="link3.html><span class="bold">third item</span></a></li> <li class="item-3 active"><a href="link4.html">fourth item</a></li> <li class="item-4"><a href="link5.html">fifth item</a></li> </ul> </div> <div> """

document = pq(markup)

# Line printed between the individual results to keep them apart.
separator = "------------------------------------------------------------"

# :first-child - the <li> that is the first child of its parent.
print(document("li:first-child"))
print(separator)

# :last-child - the <li> that is the last child of its parent.
print(document("li:last-child"))
print(separator)

# :gt(2) - <li> elements whose zero-based index is greater than 2.
print(document("li:gt(2)"))
print(separator)

# :nth-child(2) - the <li> in second position.
print(document("li:nth-child(2)"))
print(separator)

# :nth-child(2n) - the <li> elements in even positions.
print(document("li:nth-child(2n)"))
print(separator)

# :contains(second) - <li> elements whose text contains "second".
print(document("li:contains(second)"))
<li class="item-0 active">first item</li> ------------------------------------------------------------ <li class="item-4"><a href="link5.html">fifth item</a></li> ------------------------------------------------------------ <li class="item-3 active"><a href="link4.html">fourth item</a></li> <li class="item-4"><a href="link5.html">fifth item</a></li> ------------------------------------------------------------ <li class="item-1 active"><a href="link2.html">second item<p>Third times</p></a></li> ------------------------------------------------------------ <li class="item-1 active"><a href="link2.html">second item<p>Third times</p></a></li> <li class="item-3 active"><a href="link4.html">fourth item</a></li> ------------------------------------------------------------ <li class="item-1 active"><a href="link2.html">second item<p>Third times</p></a></li>
更多的CSS选择器可以查看:http://www.w3school.com.cn/css/index.asp
PyQuery的官方文档可以查看：https://pythonhosted.org/pyquery/api.html