Python爬虫系列-PyQuery详解

强大又灵活的网页解析库。如果你觉得正则写起来太麻烦,如果你觉得BeautifulSoup语法太难记,如果你熟悉jQuery的语法,那么PyQuery就是你的最佳选择。

安装

pip3 install pyquery

用法讲解

字符串初始化

html='''
 <div>
   <ul>
     <li class="item-0">first item</li>
     <li class="item-1"><a href="link2.html">second item</a></li>
     <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
     <li class="item-1 active"><a href="link4.html">fourth item</a></li>
     <li class="item-0"><a href="link5.html">fifth item</a></li>
   </ul>
 </div>
 '''
from pyquery import PyQuery as pq
doc = pq(html)
print(doc('li'))

显示效果如下:

<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>

URL初始化

from pyquery import PyQuery as pq
doc = pq(url='http://www.baidu.com',encoding='utf-8')
print(doc('head'))

直接输入网址,显示效果如下:

<head><meta http-equiv="content-type" content="text/html;charset=utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=Edge"/><meta content="always" name="referrer"/><link rel="stylesheet" type="text/css" href="http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css"/><title>百度一下,你就知道</title></head> 

文件初始化

from pyquery import PyQuery as pq
doc = pq(filename='demo.html')
print(doc('li'))

基本CSS选择器

 html = '''<div id="container">\n  <ul class="list">\n    <li class="item-0">first item</li>\n    <li class="item-1"><a href="link2.html">second item</a></li>\n    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>\n    <li class="item-1 active"><a href="link4.html">fourth item</a></li>\n    <li class="item-0"><a href="link5.html">fifth item</a></li>\n  </ul>\n</div>
'''
 from pyquery import PyQuery as pq
 doc = pq(html)
 print(doc('#container .list li'))

输出效果:

    <li class="item-0">first item</li>
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    <li class="item-0"><a href="link5.html">fifth item</a></li>

查找元素

子元素

 html = '''
<div id="container">\n  <ul class="list">\n    <li class="item-0">first item</li>\n    <li class="item-1"><a href="link2.html">second item</a></li>\n    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>\n    <li class="item-1 active"><a href="link4.html">fourth item</a></li>\n    <li class="item-0"><a href="link5.html">fifth item</a></li>\n  </ul>\n</div>\n'''
from pyquery import PyQuery as pq
 doc = pq(html)
 items = doc('.list')
 print(type(items))
 print(items)

显示如下结果:
<class 'pyquery.pyquery.PyQuery'>

  <ul class="list">
    <li class="item-0">first item</li>
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    <li class="item-0"><a href="link5.html">fifth item</a></li>
  </ul>
 lis = items.find('li')
 print(type(lis))
 print(lis)

显示结果:
<class 'pyquery.pyquery.PyQuery'>

    <li class="item-0">first item</li>
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    <li class="item-0"><a href="link5.html">fifth item</a></li>
 lis = items.children()
 print(type(lis))
 print(lis)

显示输出结果:
<class 'pyquery.pyquery.PyQuery'>

   <li class="item-0">first item</li>
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    <li class="item-0"><a href="link5.html">fifth item</a></li>
lis = items.children('.active')
print(lis)

显示如下:

<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>

父元素

 from pyquery import PyQuery as pq
 doc = pq(html)
 items = doc('.list')
 container = items.parent()
 print(type(container))
 print(container)

显示内容
<class 'pyquery.pyquery.PyQuery'>

<div id="container">
  <ul class="list">
    <li class="item-0">first item</li>
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    <li class="item-0"><a href="link5.html">fifth item</a></li>
  </ul>
</div>

显示所有父节点

from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.item-0')
parents = items.parents()
print(type(parents))
print(parents)

显示如下:
<class 'pyquery.pyquery.PyQuery'>

<div id="container">
  <ul class="list">
    <li class="item-0">first item</li>
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    <li class="item-0"><a href="link5.html">fifth item</a></li>
  </ul>
</div><ul class="list">
    <li class="item-0">first item</li>
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    <li class="item-0"><a href="link5.html">fifth item</a></li>
  </ul>

查找指定父元素(传入CSS选择器进行筛选):parents = items.parents('#container')

兄弟元素

li = doc('.list .item-0.active')
print(li.siblings())

遍历

单个元素

html = '''<div id="container">\n  <ul class="list">\n    <li class="item-0">first item</li>\n    <li class="item-1"><a href="link2.html">second item</a></li>\n    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>\n    <li class="item-1 active"><a href="link4.html">fourth item</a></li>\n    <li class="item-0"><a href="link5.html">fifth item</a></li>\n  </ul>\n</div>'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)

只能输出一个元素:
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>

遍历方法:

 from pyquery import PyQuery as pq
 doc = pq(html)
 lis = doc('li').items()
 print(type(lis))
 for li in lis:
    print(li)

<class 'generator'>

<li class="item-0">first item</li>
    
<li class="item-1"><a href="link2.html">second item</a></li>
    
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
    
<li class="item-0"><a href="link5.html">fifth item</a></li>

获取信息

获取属性

from pyquery import PyQuery as pq
doc = pq(html)
d = doc('.item-0.active a')
a = doc('.item-0.active a')
 print(a)
print(a.attr['href'])
print(a.attr.href)

显示结果:
<a href="link3.html"><span class="bold">third item</span></a>
link3.html
link3.html

获取文本

 from pyquery import PyQuery as pq
 doc = pq(html)
 a = doc('.item-0.active a')
 print(a.text())

显示结果:

third item

获取HTML

print(a)
print(a.html())
<a href="link3.html"><span class="bold">third item</span></a>
<span class="bold">third item</span>

DOM操作

addClass、removeClass

html = '''<div id="container">\n  <ul class="list">\n    <li class="item-0">first item</li>\n    <li class="item-1"><a href="link2.html">second item</a></li>\n    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>\n    <li class="item-1 active"><a href="link4.html">fourth item</a></li>\n    <li class="item-0"><a href="link5.html">fifth item</a></li>\n  </ul>\n</div>'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)

<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>

li.removeClass('active')
print(li)

<li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>

li.addClass('active')
print(li)

<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>

attr、css

from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)

<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>

li.attr('name','link')
print(li)

显示结果:
<li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>

li.css('font-size','14px')
print(li)

<li class="item-0 active" name="link" style="font-size: 14px"><a href="link3.html"><span class="bold">third item</span></a></li>

remove

 html = '''
 <div class="wrap">
   Hello,World
   <p>This is a paragraph</p>
 </div>
'''
 from pyquery import PyQuery as pq
 doc = pq(html)
 wrap = doc('.wrap')
 wrap.find('p').remove()
 print(wrap.text())

Hello,World

伪类选择器

html = '''
<div>
   <ul>
     <li class="item-0">first item</li>
     <li class="item-1"><a href="link2.html">second item</a></li>
     <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
     <li class="item-1 active"><a href="link4.html">fourth item</a></li>
     <li class="item-0"><a href="link5.html">fifth item</a></li>
   </ul>
 </div>
 '''
from pyquery import PyQuery as pq
 doc = pq(html)
 li = doc('li:first-child')
 print(li)

<li class="item-0">first item</li>

 li = doc('li:last-child')
 print(li)

<li class="item-0"><a href="link5.html">fifth item</a></li>

 li = doc('li:nth-child(2)')
 print(li)

<li class="item-1"><a href="link2.html">second item</a></li>

 li = doc('li:gt(2)')
 print(li)
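
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>

 li = doc('li:nth-child(2n)')
 print(li)

<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>

 li = doc('li:contains(second)')
 print(li)

<li class="item-1"><a href="link2.html">second item</a></li>
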
    posted @ 2018-12-03 13:21  cicarius  阅读(525)  评论(0)  编辑  收藏  举报