爬虫--PyQuery

什么是PyQuery?

PyQuery 是一个仿照 jQuery 设计的 Python 库,可以使用 CSS 选择器语法来解析和操作 HTML/XML 文档。

初始化

字符串初始化

from pyquery import PyQuery as pq

# Sample markup used to build a PyQuery document directly from a string.
# NOTE: the link3 anchor's href is missing its closing quote; the markup is
# kept exactly as written.
html = """
    <div>
        <ul>
            <li class="item-0">first item</li>
            <li class="item-1">
                <a href="link2.html">second item</a>
            </li>
            <li class="item-0 active">
                <a href="link3.html>
                    <span class="bold">third item</span>
                </a>
            </li>
            <li class="item-1 active">
                <a href="link4.html">f
                    ourth item
                </a>
            </li>
            <li class="item-0">
                <a href="link5.html">
                    fifth item
                </a>
            </li>
        </ul>
    </div>
"""

# Parse the string, then select every <li> element with a CSS selector.
document = pq(html)
list_items = document("li")
print(list_items)
<li class="item-0">first item</li>
<li class="item-1">
<a href="link2.html">second item</a>
</li>
 <li class="item-0 active">
 <a href="link3.html&gt;&#10;&lt;span class=" bold="">third item
</a>
</li>
<li class="item-1 active">
<a href="link4.html">f
ourth item
</a>
</li>
<li class="item-0">
<a href="link5.html">
 fifth item
 </a>
 </li>
以上为打印后的结果。

URL初始化

from pyquery import PyQuery as pq

# Build the document from a URL, then pick out the <head> element.
page = pq(url="http://www.baidu.com")
head = page("head")
print(head)
<head><meta http-equiv="content-type" content="text/html;charset=utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=Edge"/><meta content="always" name="referrer"/><link rel="stylesheet" type="text/css" href="http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css"/><title>百度一下,你就知道</title></head> 
以上为打印后的结果。

文件初始化

from pyquery import PyQuery as pq

# Build the document from a local HTML file, then select all <li> elements.
page = pq(filename="demo.html")
list_items = page("li")
print(list_items)

基本CSS选择器

from pyquery import PyQuery as pq

# Sample markup with an id on the outer <div>, used for the selector demo.
html = """
    <div id = "container">
        <ul>
            <li class="item-0">
                first item
            </li>
            <li class="item-1">
                <a href="link2.html">second item</a>
            </li>
            <li class="item-2 active">
                <a href="link3.html>
                    <span class="bold">third item</span>
                </a>
            </li>
            <li class="item-3 active">
                <a href="link4.html">f
                    ourth item
                </a>
            </li>
            <li class="item-4">
                <a href="link5.html">
                    fifth item
                </a>
            </li>
        </ul>
    </div>
"""
document = pq(html)
# Descendant selector: elements with class "item-0" inside the element whose
# id is "container".
matched = document("#container .item-0")
print(matched)
<li class="item-0">
 first item
 </li>
以上为打印后的结果。

查找元素

子元素

from pyquery import PyQuery as pq

# NOTE: the <ul> class attribute below is missing its closing quote; the
# markup is kept exactly as written.
html = """
    <div id = "container">
        <ul class="list>
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-2 active"><a href="link3.html><span class="bold">third item</span></a></li>
            <li class="item-3 active"><a href="link4.html">fourth item</a></li>
            <li class="item-4"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
"""
document = pq(html)

# A selector call returns another PyQuery object.
list_node = document(".list")
print(type(list_node))

# find() searches the document for <li> elements and also returns a PyQuery
# object.
found = document.find("li")
print(type(found))
print(found)
<class 'pyquery.pyquery.PyQuery'>
<class 'pyquery.pyquery.PyQuery'>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-2 active"><a href="link3.html&gt;&lt;span class=" bold="">third item</a></li>
<li class="item-3 active"><a href="link4.html">fourth item</a></li>
<li class="item-4"><a href="link5.html">fifth item</a></li>
以上为打印后的结果。
from pyquery import PyQuery as pq

html = """
    <div id = "container">
        <ul class="list">
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-2 active"><a href="link3.html><span class="bold">third item</span></a></li>
            <li class="item-3 active"><a href="link4.html">fourth item</a></li>
            <li class="item-4"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
"""
document = pq(html)
list_node = document(".list")

# children() with no argument returns every direct child element.
all_children = list_node.children()
# Passing a selector keeps only the direct children that match it.
active_children = list_node.children(".active")

print(all_children)
print(active_children)
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-2 active"><a href="link3.html&gt;&lt;span class=" bold="">third item</a></li>
<li class="item-3 active"><a href="link4.html">fourth item</a></li>
<li class="item-4"><a href="link5.html">fifth item</a></li>   
<li class="item-2 active"><a href="link3.html&gt;&lt;span class=" bold="">third item</a></li>
<li class="item-3 active"><a href="link4.html">fourth item</a></li>
以上为打印后的结果。

父元素

from pyquery import PyQuery as pq

html = """
    <div id = "container">
        <ul class="list">
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-2 active"><a href="link3.html><span class="bold">third item</span></a></li>
            <li class="item-3 active"><a href="link4.html">fourth item</a></li>
            <li class="item-4"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
"""
document = pq(html)
list_node = document(".list")

# parent() returns the direct parent of the selection.
direct_parent = list_node.parent()
print(direct_parent)
print(type(direct_parent))
<div id="container">
        <ul class="list">
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-2 active"><a href="link3.html&gt;&lt;span class=" bold="">third item</a></li>
            <li class="item-3 active"><a href="link4.html">fourth item</a></li>
            <li class="item-4"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>

<class 'pyquery.pyquery.PyQuery'>
以上为打印后的结果。
from pyquery import PyQuery as pq

html = """
    <div class = "wrap">
        <div id = "container">
            <ul class="list">
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-2 active"><a href="link3.html><span class="bold">third item</span></a></li>
                <li class="item-3 active"><a href="link4.html">fourth item</a></li>
                <li class="item-4"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    <div>
"""

document = pq(html)
list_node = document(".list")

# parents() returns ALL ancestors of the selection (html, body, div.wrap,
# div#container), not just the direct parent -- and not child elements, as the
# original comment claimed.
ancestors = list_node.parents()
print(type(ancestors))
print(ancestors)
<class 'pyquery.pyquery.PyQuery'>
<html><body><div class="wrap">
        <div id="container">
            <ul class="list">
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-2 active"><a href="link3.html&gt;&lt;span class=" bold="">third item</a></li>
                <li class="item-3 active"><a href="link4.html">fourth item</a></li>
                <li class="item-4"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    <div>
</div></div></body></html><body><div class="wrap">
        <div id="container">
            <ul class="list">
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-2 active"><a href="link3.html&gt;&lt;span class=" bold="">third item</a></li>
                <li class="item-3 active"><a href="link4.html">fourth item</a></li>
                <li class="item-4"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    <div>
</div></div></body><div class="wrap">
        <div id="container">
            <ul class="list">
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-2 active"><a href="link3.html&gt;&lt;span class=" bold="">third item</a></li>
                <li class="item-3 active"><a href="link4.html">fourth item</a></li>
                <li class="item-4"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    <div>
</div></div><div id="container">
            <ul class="list">
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-2 active"><a href="link3.html&gt;&lt;span class=" bold="">third item</a></li>
                <li class="item-3 active"><a href="link4.html">fourth item</a></li>
                <li class="item-4"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
以上为打印后的结果。

兄弟元素

from pyquery import PyQuery as pq

html = """
    <div class = "wrap">
        <div id = "container">
            <ul class="list">
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-2 active"><a href="link3.html><span class="bold">third item</span></a></li>
                <li class="item-3 active"><a href="link4.html">fourth item</a></li>
                <li class="item-4"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    <div>
"""

document = pq(html)
first_item = document(".item-0")

# siblings() returns the other elements that share the selection's parent.
print(type(first_item.siblings()))
print(first_item.siblings())
<class 'pyquery.pyquery.PyQuery'>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-2 active"><a href="link3.html&gt;&lt;span class=" bold="">third item</a></li>
<li class="item-3 active"><a href="link4.html">fourth item</a></li>
<li class="item-4"><a href="link5.html">fifth item</a></li>
以上为打印后的结果。
from pyquery import PyQuery as pq

html = """
    <div class = "wrap">
        <div id = "container">
            <ul class="list">
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-2 active"><a href="link3.html><span class="bold">third item</span></a></li>
                <li class="item-3 active"><a href="link4.html">fourth item</a></li>
                <li class="item-4"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    <div>
"""

document = pq(html)

# Compound selector: matches only elements that carry BOTH the "item-0" and
# the "active" class. The previous ".item-0active" lacked the second dot and
# therefore looked for a single class literally named "item-0active".
both_classes = document(".item-0.active")
first_item = document(".item-0")

print(first_item.siblings())
# No <li> in this markup has both classes, so the compound selection is empty
# and its siblings() prints as an empty line.
print(type(both_classes.siblings()))
print(both_classes.siblings())
<li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-2 active"><a href="link3.html&gt;&lt;span class=" bold="">third item</a></li>
                <li class="item-3 active"><a href="link4.html">fourth item</a></li>
                <li class="item-4"><a href="link5.html">fifth item</a></li>
            
<class 'pyquery.pyquery.PyQuery'>
以上为打印后的结果。

遍历

单个元素

from pyquery import PyQuery as pq

html = """
    <div class = "wrap">
        <div id = "container">
            <ul class="list">
                <li class="item-0 active">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-2 active"><a href="link3.html><span class="bold">third item</span></a></li>
                <li class="item-3 active"><a href="link4.html">fourth item</a></li>
                <li class="item-4"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    <div>
"""

document = pq(html)

# Compound class selector: only elements having both "item-0" and "active"
# are selected; if nothing carries both classes, nothing is printed.
selected = document(".item-0.active")
print(type(selected))
print(selected)
<class 'pyquery.pyquery.PyQuery'>
<li class="item-0 active">first item</li>
                
以上为打印后的结果。
from pyquery import PyQuery as pq

html = """
    <div class = "wrap">
        <div id = "container">
            <ul class="list">
                <li class="item-0 active">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-2 active"><a href="link3.html><span class="bold">third item</span></a></li>
                <li class="item-3 active"><a href="link4.html">fourth item</a></li>
                <li class="item-4"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    <div>
"""

document = pq(html)

# items() gives a generator over the matched <li> elements, so they can be
# walked one at a time.
li_iter = document("li").items()
print(type(li_iter))
for element in li_iter:
    print(element)
<class 'generator'>
<li class="item-0 active">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>       
<li class="item-2 active"><a href="link3.html&gt;&lt;span class=" bold="">third item</a></li>         
<li class="item-3 active"><a href="link4.html">fourth item</a></li>      
<li class="item-4"><a href="link5.html">fifth item</a></li>
以上为打印后的结果。

获取信息

获取属性

from pyquery import PyQuery as pq

html = """
    <div class = "wrap">
        <div id = "container">
            <ul class="list">
                <li class="item-0 active">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-2 active"><a href="link3.html><span class="bold">third item</span></a></li>
                <li class="item-3 active"><a href="link4.html">fourth item</a></li>
                <li class="item-4"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    <div>
"""

document = pq(html)
anchor = document(".item-1 a")

# Two equivalent ways to read an attribute: the attr() call and the
# attribute-style accessor.
href_via_call = anchor.attr("href")
print(href_via_call)
href_via_accessor = anchor.attr.href
print(href_via_accessor)
link2.html
link2.html
以上为打印后的结果。

获取文本

from pyquery import PyQuery as pq

html = """
    <div class = "wrap">
        <div id = "container">
            <ul class="list">
                <li class="item-0 active">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-2 active"><a href="link3.html><span class="bold">third item</span></a></li>
                <li class="item-3 active"><a href="link4.html">fourth item</a></li>
                <li class="item-4"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    <div>
"""

document = pq(html)
second_item = document(".item-1")

# text() returns the text content of the selection without any tags.
content = second_item.text()
print(content)
second item
以上为打印后的结果。

获取html

from pyquery import PyQuery as pq

html = """
    <div class = "wrap">
        <div id = "container">
            <ul class="list">
                <li class="item-0 active">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-2 active"><a href="link3.html><span class="bold">third item</span></a></li>
                <li class="item-3 active"><a href="link4.html">fourth item</a></li>
                <li class="item-4"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    <div>
"""

document = pq(html)
second_item = document(".item-1")

# Printing the selection shows the element itself; html() shows only the
# markup inside it.
print(second_item)
inner_markup = second_item.html()
print(inner_markup)
<li class="item-1"><a href="link2.html">second item</a></li>
                
<a href="link2.html">second item</a>
以上为打印后的结果。

DOM操作

addClass , removeClass

from pyquery import PyQuery as pq

html = """
    <div class = "wrap">
        <div id = "container">
            <ul class="list">
                <li class="item-0 active">first item</li>
                <li class="item-1 active"><a href="link2.html">second item</a></li>
                <li class="item-2 active"><a href="link3.html><span class="bold">third item</span></a></li>
                <li class="item-3 active"><a href="link4.html">fourth item</a></li>
                <li class="item-4"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    <div>
"""

document = pq(html)
second_item = document(".item-1")

# Drop the "active" class and show the element afterwards.
without_active = second_item.remove_class("active")
print(without_active)

# Add a new class named "actives" and show the element again.
with_actives = second_item.add_class("actives")
print(with_actives)
<li class="item-1"><a href="link2.html">second item</a></li>
                
<li class="item-1 actives"><a href="link2.html">second item</a></li>
以上为打印后的结果。

attr , css

from pyquery import PyQuery as pq

html = """
    <div class = "wrap">
        <div id = "container">
            <ul class="list">
                <li class="item-0 active">first item</li>
                <li class="item-1 active"><a href="link2.html">second item</a></li>
                <li class="item-2 active"><a href="link3.html><span class="bold">third item</span></a></li>
                <li class="item-3 active"><a href="link4.html">fourth item</a></li>
                <li class="item-4"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    <div>
"""

document = pq(html)
second_item = document(".item-1")

# attr(name, value) sets an attribute on the element.
after_attr = second_item.attr("name", "names")
print(after_attr)

# css(property, value) writes the declaration into the element's style
# attribute.
after_css = second_item.css("font-size", "14px")
print(after_css)
<li class="item-1 active" name="names"><a href="link2.html">second item</a></li>
                
<li class="item-1 active" name="names" style="font-size: 14px"><a href="link2.html">second item</a></li>
以上为打印后的结果。

remove

from pyquery import PyQuery as pq

html = """
    <div class = "wrap">
        <div id = "container">
            <ul class="list">
                <li class="item-0 active">first item</li>
                <li class="item-1 active"><a href="link2.html">second item<p>Third times</p></a></li>
                <li class="item-2 active"><a href="link3.html><span class="bold">third item</span></a></li>
                <li class="item-3 active"><a href="link4.html">fourth item</a></li>
                <li class="item-4"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    <div>
"""

document = pq(html)
second_item = document(".item-1")

# Text before the nested <p> is removed.
print(second_item.text())
print("---------------")

# Remove the nested <p> so only the anchor's own text is left.
nested_paragraph = second_item.find('p')
nested_paragraph.remove()
print(second_item.text())
second item
Third times
---------------
second item
以上为打印后的结果。

其他DOM方法

https://pythonhosted.org/pyquery/api.html

 

伪类选择器

from pyquery import PyQuery as pq

html = """
    <div class = "wrap">
        <div id = "container">
            <ul class="list">
                <li class="item-0 active">first item</li>
                <li class="item-1 active"><a href="link2.html">second item<p>Third times</p></a></li>
                <li class="item-2 active"><a href="link3.html><span class="bold">third item</span></a></li>
                <li class="item-3 active"><a href="link4.html">fourth item</a></li>
                <li class="item-4"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    <div>
"""

document = pq(html)
divider = "------------------------------------------------------------"

# Pseudo-class selectors demonstrated, in the order they are printed.
pseudo_selectors = (
    "li:first-child",       # the first <li> child
    "li:last-child",        # the last <li> child
    "li:gt(2)",             # <li> elements whose index is greater than 2
    "li:nth-child(2)",      # the second <li>
    "li:nth-child(2n)",     # every even-numbered <li>
    "li:contains(second)",  # <li> elements whose text contains "second"
)

for position, selector in enumerate(pseudo_selectors):
    # A divider line goes between results, not before the first one.
    if position:
        print(divider)
    li = document(selector)
    print(li)
<li class="item-0 active">first item</li>         
------------------------------------------------------------
<li class="item-4"><a href="link5.html">fifth item</a></li>           
------------------------------------------------------------
<li class="item-3 active"><a href="link4.html">fourth item</a></li>
<li class="item-4"><a href="link5.html">fifth item</a></li>    
------------------------------------------------------------
<li class="item-1 active"><a href="link2.html">second item<p>Third times</p></a></li>                
------------------------------------------------------------
<li class="item-1 active"><a href="link2.html">second item<p>Third times</p></a></li>
 <li class="item-3 active"><a href="link4.html">fourth item</a></li>            
------------------------------------------------------------
<li class="item-1 active"><a href="link2.html">second item<p>Third times</p></a></li>
以上为打印后的结果。

 

更多的CSS选择器可以查看:http://www.w3school.com.cn/css/index.asp

 

Pyquery的官方文档可以查看:https://pythonhosted.org/pyquery/api.html

 

posted @ 2018-09-21 19:35  追风的小蚂蚁  阅读(420)  评论(0)  编辑  收藏  举报