PYTHON 爬虫笔记六:PyQuery库基础用法
知识点一:PyQuery库详解及其基本使用
-
初始化
-
字符串初始化
from pyquery import PyQuery as pq

# Sample markup used to build a document directly from a string.
# NOTE: the second <li> is closed with a malformed `<>/li`; it is kept
# verbatim because it is part of the input being parsed.
html = (
    ' <div> <ul>'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a><>/li'
    ' <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a></li>'
    ' </ul> </div> '
)

# Initialise from a string.
doc = pq(html)

# Calling the document with a selector queries it. Selectors are ordinary
# CSS selectors: prefix an id with "#" and a class with ".".
print(doc("li"))
<li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a><>/li </li><li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li>
-
URL初始化
from pyquery import PyQuery as pq

# Initialise from a URL: pyquery downloads the page and parses it.
# The encoding is passed explicitly. If the response does not declare a
# charset, the body can be decoded with the wrong codec, and non-ASCII
# text such as the page <title> then prints as mojibake.
doc1 = pq(url="http://www.baidu.com", encoding="utf-8")

# Print the <head> element of the downloaded page.
print(doc1("head"))
<head><meta http-equiv="content-type" content="text/html;charset=utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=Edge"/><meta content="always" name="referrer"/><link rel="stylesheet" type="text/css" href="http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css"/><title>百度一下,你就知道</title></head>
-
文件初始化
from pyquery import PyQuery as pq

# Initialise from a local file. "demo.html" is not provided here:
# download or save any HTML file under that name first.
doc2 = pq(filename="demo.html")

# Print every <li> element found in the file.
print(doc2("li"))
-
基本CSS选择器
-
实例
from pyquery import PyQuery as pq

# Sample markup with an id on the outer <div> and a class on the <ul>.
# The literal must be bound to `html`, because that is the name handed to
# pq() below (it was previously misspelled `tml`).
# NOTE: the malformed `<>/li` closing tag is kept verbatim as parser input.
html = (
    ' <div id="container"> <ul class="list">'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a><>/li'
    ' <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a></li>'
    ' </ul> </div> '
)

doc = pq(html)

# Mind the spaces in the selector: a space expresses nesting, so this
# selects the <li> elements inside ".list" inside "#container".
print(doc("#container .list li"))
<li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a><>/li </li><li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li>
-
查询元素
-
子元素
from pyquery import PyQuery as pq

# Sample markup (the malformed `<>/li` is kept verbatim as parser input).
html = (
    ' <div id="container"> <ul class="list">'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a><>/li'
    ' <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a></li>'
    ' </ul> </div> '
)

doc = pq(html)

# Start by selecting the <ul class="list"> element.
items = doc(".list")
print(type(items))
print(items)

# find() also takes a CSS selector; it searches everything nested inside
# the current selection, so all <li> elements are returned.
lis = items.find("li")
print(type(lis))
print(lis)

# children() returns direct children only.
lis2 = items.children()
print(type(lis2))
print(lis2)

# children() accepts a selector to filter the direct children.
lis3 = items.children(".active")
print(lis3)
<class 'pyquery.pyquery.PyQuery'> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a><>/li </li><li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> <class 'pyquery.pyquery.PyQuery'> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a><>/li </li><li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> <class 'pyquery.pyquery.PyQuery'> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a><>/li </li><li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li>
-
父元素
from pyquery import PyQuery as pq

# Parent element example.
html = (
    ' <div id="container"> <ul class="list">'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a></li>'
    ' <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a></li>'
    ' </ul> </div> '
)

doc = pq(html)

# Start by selecting the <ul class="list"> element.
items = doc(".list")

# An element is wrapped by exactly one direct parent; parent() returns it.
container = items.parent()
print(type(container))
print(container)
<class 'pyquery.pyquery.PyQuery'> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div>
另一种方法:
from pyquery import PyQuery as pq

# Sample markup with an extra ".wrap" <div> around the container
# (the malformed `<>/li` is kept verbatim as parser input).
html = (
    ' <div class="wrap"> <div id="container"> <ul class="list">'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a><>/li'
    ' <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a></li>'
    ' </ul> </div> </div> '
)

doc = pq(html)

# Start by selecting the <ul class="list"> element.
items = doc(".list")

# parents() returns every ancestor node, which includes the two
# enclosing <div> elements.
parents = items.parents()
print(parents)
print(type(parents))
<html><body><div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a><>/li </li><li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div> </body></html><body><div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a><>/li </li><li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div> </body><div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a><>/li </li><li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a><>/li </li><li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> <class 'pyquery.pyquery.PyQuery'>
from pyquery import PyQuery as pq

# Sample markup (the malformed `<>/li` is kept verbatim as parser input).
html = (
    ' <div class="wrap"> <div id="container"> <ul class="list">'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a><>/li'
    ' <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a></li>'
    ' </ul> </div> </div> '
)

doc = pq(html)

# Start by selecting the <ul class="list"> element.
items = doc(".list")

# Pass a selector to parents() to filter the ancestors; only the
# <div class="wrap"> remains.
parents1 = items.parents(".wrap")
print(parents1)
<div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a><>/li </li><li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div>
-
兄弟元素
from pyquery import PyQuery as pq

# Sample markup (the malformed `<>/li` is kept verbatim as parser input).
html = (
    ' <div class="wrap"> <div id="container"> <ul class="list">'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a><>/li'
    ' <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a></li>'
    ' </ul> </div> </div> '
)

doc = pq(html)

# ".list" picks the list; the space descends into it; ".item-0.active"
# (no space) requires both classes on the same element.
li = doc(".list .item-0.active")
print(li)

# siblings() returns all sibling elements of the selection.
print(li.siblings())
<li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li> <li class="item-1"><a href="link2.html">second item</a><>/li </li><li class="item-0">first item</li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li>
另一种方式:
from pyquery import PyQuery as pq

# Sample markup (the malformed `<>/li` is kept verbatim as parser input).
html = (
    ' <div class="wrap"> <div id="container"> <ul class="list">'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a><>/li'
    ' <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a></li>'
    ' </ul> </div> </div> '
)

doc = pq(html)

# ".list" picks the list; the space descends into it; ".item-0.active"
# (no space) requires both classes on the same element.
li = doc(".list .item-0.active")

# Filter the siblings with a selector: keep only those with class "active".
print(li.siblings(".active"))
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
-
-
遍历
-
单个元素
from pyquery import PyQuery as pq

# Sample markup: a five-item list nested in two <div> elements.
html = (
    ' <div class="wrap"> <div id="container"> <ul class="list">'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a></li>'
    ' <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a></li>'
    ' </ul> </div> </div> '
)

doc = pq(html)

# The selector matches a single element, which can be used directly.
li = doc(".item-0.active")
print(li)
<li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
另一种方式
from pyquery import PyQuery as pq

# Sample markup: a five-item list nested in two <div> elements.
html = (
    ' <div class="wrap"> <div id="container"> <ul class="list">'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a></li>'
    ' <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a></li>'
    ' </ul> </div> </div> '
)

doc = pq(html)

# With several matches, items() gives a generator to iterate over them.
lis = doc("li").items()
print(type(lis))

for li in lis:
    print(li)
<class 'generator'> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li>
-
获取信息
-
获取属性
from pyquery import PyQuery as pq

# Sample markup: a five-item list nested in two <div> elements.
html = (
    ' <div class="wrap"> <div id="container"> <ul class="list">'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a></li>'
    ' <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a></li>'
    ' </ul> </div> </div> '
)

doc = pq(html)

# Select the element carrying both "item-0" and "active", then the <a>
# inside it. The space before "a" is what expresses the nesting.
a = doc(".item-0.active a")
print(a)

# Two equivalent ways to read an attribute.
print(a.attr("href"))
print(a.attr.href)
<a href="link3.html"><span class="boid">third item</span></a> link3.html link3.html
-
获取文本
from pyquery import PyQuery as pq

# Sample markup: a five-item list nested in two <div> elements.
html = (
    ' <div class="wrap"> <div id="container"> <ul class="list">'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a></li>'
    ' <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a></li>'
    ' </ul> </div> </div> '
)

doc = pq(html)

a = doc(".item-0.active a")
print(a)

# text() returns the text enclosed by the selected element.
print(a.text())
<a href="link3.html"><span class="boid">third item</span></a> third item
-
获取HTML
from pyquery import PyQuery as pq

# Sample markup: a five-item list nested in two <div> elements.
html = (
    ' <div class="wrap"> <div id="container"> <ul class="list">'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a></li>'
    ' <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a></li>'
    ' </ul> </div> </div> '
)

doc = pq(html)

a = doc(".item-0.active")
print(a)

# html() returns the markup inside the selected element.
print(a.html())
<li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li> <a href="link3.html"><span class="boid">third item</span></a>
-
DOM操作
-
addClass,removeClass
from pyquery import PyQuery as pq

# Sample markup: a five-item list nested in two <div> elements.
html = (
    ' <div class="wrap"> <div id="container"> <ul class="list">'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a></li>'
    ' <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a></li>'
    ' </ul> </div> </div> '
)

doc = pq(html)

li = doc(".item-0.active")
print(li)

# Remove the "active" class from the selected element.
li.removeClass("active")
print(li)

# Add the "active" class back.
li.addClass("active")
print(li)
<li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li> <li class="item-0"><a href="link3.html"><span class="boid">third item</span></a></li> <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
-
attr,css
from pyquery import PyQuery as pq

# Sample markup: a five-item list nested in two <div> elements.
html = (
    ' <div class="wrap"> <div id="container"> <ul class="list">'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a></li>'
    ' <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a></li>'
    ' </ul> </div> </div> '
)

doc = pq(html)

li = doc(".item-0.active")
print(li)

# Set an attribute; an existing attribute of the same name is overwritten.
li.attr("name", "link")
print(li)

# css() adds the property to the element's style attribute.
li.css("font-size", "14px")
print(li)
<li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li> <li class="item-0 active" name="link"><a href="link3.html"><span class="boid">third item</span></a></li> <li class="item-0 active" name="link" style="font-size: 14px"><a href="link3.html"><span class="boid">third item</span></a></li>
-
remove
from pyquery import PyQuery as pq

# Sample markup: a wrapper holding bare text plus one paragraph.
html1 = (
    ' <div class="wrap">'
    ' Hello,World'
    ' <p>This is a paragraph.</p>'
    ' </div> '
)

doc = pq(html1)
wrap = doc(".wrap")

# Text of the wrapper, paragraph included.
print(wrap.text())

# Drop the <p> from the wrapper, then print the remaining text.
wrap.find("p").remove()
print(wrap.text())
Hello,World This is a paragraph. Hello,World
-
其他DOM操作
- 其他DOM方法: http://pythonhosted.org/pyquery/
-
伪类选择器
from pyquery import PyQuery as pq

# Sample markup: a five-item list nested in two <div> elements.
html = (
    ' <div class="wrap"> <div id="container"> <ul class="list">'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a></li>'
    ' <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a></li>'
    ' </ul> </div> </div> '
)

doc = pq(html)

# First <li>.
li = doc("li:first-child")
print(li)

# Last <li>.
li1 = doc("li:last-child")
print(li1)

# The <li> at a given position: here the second one.
li2 = doc("li:nth-child(2)")
print(li2)

# <li> elements whose index is greater than 2 (indices start at 0).
li3 = doc("li:gt(2)")
print(li3)

# <li> elements at even positions.
li4 = doc("li:nth-child(2n)")
print(li4)

# <li> elements whose content contains "second".
li5 = doc("li:contains(second)")
print(li5)
<li class="item-0">first item</li> <li class="item-0"><a href="link5.html">fifth item</a></li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-1"><a href="link2.html">second item</a></li>
-
更多CSS选择器可以查看:http://www.w3school.com.cn/css/index.asp
-
-
官方文档
这都是我对自己学习过程的理解,如有错误请指出!我算一个小白了。