PYTHON 爬虫笔记六:PyQuery库基础用法

知识点一:PyQuery库详解及其基本使用

  • 初始化

  1. 字符串初始化

    html = '''
    <div>
        <ul>
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a><>/li
            <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
     
    print(doc('li'))#选择器实际上就是CSS选择器,即:选id就加“#”,选class前面加“.”
    <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a>&lt;&gt;/li
            </li><li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
    获得的结果
  2. URL初始化

    from pyquery import PyQuery as pq
    doc1 = pq(url = "http://www.baidu.com")
    
    print(doc1("head"))
    <head><meta http-equiv="content-type" content="text/html;charset=utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=Edge"/><meta content="always" name="referrer"/><link rel="stylesheet" type="text/css" href="http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css"/><title>ç™¾åº¦ä¸€ä¸‹ï¼Œä½ å°±çŸ¥é“</title></head> 
    获得的结果(注:title 中的乱码是因为页面内容为 UTF-8 编码,却被按其他编码解码;可尝试 pq(url=..., encoding="utf-8") 指定编码)
  3. 文件初始化

    from pyquery import PyQuery as pq
    doc2 = pq(filename = "demo.html")#自己下载一个HTML文件
     
    print(doc2('li'))
  • 基本CSS选择器

  1. 实例

    html = '''
    <div id="container">
        <ul class="list">
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a><>/li
            <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    
    print(doc("#container .list li"))#注意空格,空格代表嵌套关系
    <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a>&lt;&gt;/li
            </li><li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
    获得的结果
  2. 查询元素

    1. 子元素

      html = '''
      <div id="container">
          <ul class="list">
              <li class="item-0">first item</li>
              <li class="item-1"><a href="link2.html">second item</a><>/li
              <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
              <li class="item-1 active"><a href="link4.html">fourth item</a></li>
              <li class="item-0"><a href="link5.html">fifth item</a></li>
          </ul>
      </div>
      '''
      from pyquery import PyQuery as pq
      doc = pq(html)
      items = doc(".list")#首先选中ul标签
      
      print(type(items))
      print(items)
       
      lis = items.find('li')#实际上也是一个CSS选择器,将里面所有的li标签都打印出来;只要在它里面的标签都可以找到
      print(type(lis))
      print(lis)
      
      #查找直接子元素
      lis2 = items.children()
      print(type(lis2))
      print(lis2)
       
      lis3 = items.children('.active')
      print(lis3)
      <class 'pyquery.pyquery.PyQuery'>
      <ul class="list">
              <li class="item-0">first item</li>
              <li class="item-1"><a href="link2.html">second item</a>&lt;&gt;/li
              </li><li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
              <li class="item-1 active"><a href="link4.html">fourth item</a></li>
              <li class="item-0"><a href="link5.html">fifth item</a></li>
          </ul>
      
      <class 'pyquery.pyquery.PyQuery'>
      <li class="item-0">first item</li>
              <li class="item-1"><a href="link2.html">second item</a>&lt;&gt;/li
              </li><li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
              <li class="item-1 active"><a href="link4.html">fourth item</a></li>
              <li class="item-0"><a href="link5.html">fifth item</a></li>
          
      <class 'pyquery.pyquery.PyQuery'>
      <li class="item-0">first item</li>
              <li class="item-1"><a href="link2.html">second item</a>&lt;&gt;/li
              </li><li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
              <li class="item-1 active"><a href="link4.html">fourth item</a></li>
              <li class="item-0"><a href="link5.html">fifth item</a></li>
          
      <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
              <li class="item-1 active"><a href="link4.html">fourth item</a></li>
              
      获得的结果
    2. 父元素

      #父元素
      html = '''
      <div id="container">
          <ul class="list">
              <li class="item-0">first item</li>
              <li class="item-1"><a href="link2.html">second item</a></li>
              <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
              <li class="item-1 active"><a href="link4.html">fourth item</a></li>
              <li class="item-0"><a href="link5.html">fifth item</a></li>
          </ul>
      </div>
      '''
      from pyquery import PyQuery as pq
      doc = pq(html)
       
      items = doc(".list")#首先选中ul标签
      #每个标签外面肯定只能套一个父元素
      container = items.parent()
      
      print(type(container))
      print(container)
      <class 'pyquery.pyquery.PyQuery'>
      <div id="container">
          <ul class="list">
              <li class="item-0">first item</li>
              <li class="item-1"><a href="link2.html">second item</a></li>
              <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
              <li class="item-1 active"><a href="link4.html">fourth item</a></li>
              <li class="item-0"><a href="link5.html">fifth item</a></li>
          </ul>
      </div>
      获得的结果

      另一种方法:

      html = '''
      <div class="wrap">
          <div id="container">
              <ul class="list">
                  <li class="item-0">first item</li>
                  <li class="item-1"><a href="link2.html">second item</a><>/li
                  <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
                  <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                  <li class="item-0"><a href="link5.html">fifth item</a></li>
              </ul>
          </div>
      </div>
      '''
       
      from pyquery import PyQuery as pq
      doc = pq(html)
      items = doc(".list")#首先选中ul标签
      #将所有祖先节点返回
      parents = items.parents()
      
      print(parents)
      print(type(parents))#parents中包含两个div(以及解析时自动补全的html和body节点)
      另一种方法
      <html><body><div class="wrap">
          <div id="container">
              <ul class="list">
                  <li class="item-0">first item</li>
                  <li class="item-1"><a href="link2.html">second item</a>&lt;&gt;/li
                  </li><li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
                  <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                  <li class="item-0"><a href="link5.html">fifth item</a></li>
              </ul>
          </div>
      </div>
      </body></html><body><div class="wrap">
          <div id="container">
              <ul class="list">
                  <li class="item-0">first item</li>
                  <li class="item-1"><a href="link2.html">second item</a>&lt;&gt;/li
                  </li><li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
                  <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                  <li class="item-0"><a href="link5.html">fifth item</a></li>
              </ul>
          </div>
      </div>
      </body><div class="wrap">
          <div id="container">
              <ul class="list">
                  <li class="item-0">first item</li>
                  <li class="item-1"><a href="link2.html">second item</a>&lt;&gt;/li
                  </li><li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
                  <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                  <li class="item-0"><a href="link5.html">fifth item</a></li>
              </ul>
          </div>
      </div>
      <div id="container">
              <ul class="list">
                  <li class="item-0">first item</li>
                  <li class="item-1"><a href="link2.html">second item</a>&lt;&gt;/li
                  </li><li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
                  <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                  <li class="item-0"><a href="link5.html">fifth item</a></li>
              </ul>
          </div>
      
      <class 'pyquery.pyquery.PyQuery'>
      --->获得的结果
      html = '''
      <div class="wrap">
          <div id="container">
              <ul class="list">
                  <li class="item-0">first item</li>
                  <li class="item-1"><a href="link2.html">second item</a><>/li
                  <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
                  <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                  <li class="item-0"><a href="link5.html">fifth item</a></li>
              </ul>
          </div>
      </div>
      '''
       
      from pyquery import PyQuery as pq
      doc = pq(html)
      items = doc(".list")#首先选中ul标签
      
      #在其中进行搜索
      parents1 = items.parents(".wrap")
      
      print(parents1)#通过筛选,只剩下一个div
      获取单一内容
      <div class="wrap">
          <div id="container">
              <ul class="list">
                  <li class="item-0">first item</li>
                  <li class="item-1"><a href="link2.html">second item</a>&lt;&gt;/li
                  </li><li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
                  <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                  <li class="item-0"><a href="link5.html">fifth item</a></li>
              </ul>
          </div>
      </div>
      --->获得的结果
    3. 兄弟元素

      html = '''
      <div class="wrap">
          <div id="container">
              <ul class="list">
                  <li class="item-0">first item</li>
                  <li class="item-1"><a href="link2.html">second item</a><>/li
                  <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
                  <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                  <li class="item-0"><a href="link5.html">fifth item</a></li>
              </ul>
          </div>
      </div>
      '''
      from pyquery import PyQuery as pq
      doc = pq(html)
      li = doc('.list .item-0.active')#首先选class为list的元素,空格表示选择其内部的标签,再选class同时为item-0和active的元素(.item-0.active中间没有空格,是一个整体)
      print(li)
      print(li.siblings())#获取所有的兄弟元素
      <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
                  
      <li class="item-1"><a href="link2.html">second item</a>&lt;&gt;/li
                  </li><li class="item-0">first item</li>
                  <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                  <li class="item-0"><a href="link5.html">fifth item</a></li>
              
      获得的结果

      另一种方式:

      html = '''
      <div class="wrap">
          <div id="container">
              <ul class="list">
                  <li class="item-0">first item</li>
                  <li class="item-1"><a href="link2.html">second item</a><>/li
                  <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
                  <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                  <li class="item-0"><a href="link5.html">fifth item</a></li>
              </ul>
          </div>
      </div>
      '''
      from pyquery import PyQuery as pq 
      doc = pq(html)
       
      li = doc('.list .item-0.active')#首先选class为list的元素,空格表示选择其内部的标签,再选class同时为item-0和active的元素(.item-0.active中间没有空格,是一个整体)
      #在向其中筛选
      print(li.siblings('.active'))
      另一种方式
      <li class="item-1 active"><a href="link4.html">fourth item</a></li>
      --->获得的结果
  •  遍历

  1. 单个元素

    html = '''
    <div class="wrap">
        <div id="container">
            <ul class="list">
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
                <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                <li class="item-0"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    </div>
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
     
    li = doc(".item-0.active")
    print(li)
    <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
    获得的结果

    另一种方式

    html = '''
    <div class="wrap">
        <div id="container">
            <ul class="list">
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
                <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                <li class="item-0"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    </div>
    '''
    from pyquery import PyQuery as pq 
    doc = pq(html)
    
    lis = doc('li').items()#多个元素,需要进行遍历;items()返回一个生成器
    
    print(type(lis))
    for li in lis:
        print(li)
    另一种方式
    <class 'generator'>
    <li class="item-0">first item</li>
                
    <li class="item-1"><a href="link2.html">second item</a></li>
                
    <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
                
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                
    <li class="item-0"><a href="link5.html">fifth item</a></li>
            
    --->获得的结果
  • 获取信息

  1. 获取属性

    html = '''
    <div class="wrap">
        <div id="container">
            <ul class="list">
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
                <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                <li class="item-0"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    </div>
    '''
     
    from pyquery import PyQuery as pq
    doc = pq(html)
    a = doc(".item-0.active a")#先选择class同时为item-0和active的元素,再选择其内部的a标签,中间注意空格
    print(a)
    print(a.attr("href"))
    print(a.attr.href)#结果同上
    <a href="link3.html"><span class="boid">third item</span></a>
    link3.html
    link3.html
    获得的结果
  2. 获取文本

    html = '''
    <div class="wrap">
        <div id="container">
            <ul class="list">
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
                <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                <li class="item-0"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    </div>
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    a = doc(".item-0.active a")
    
    print(a)
    print(a.text())#获取选中元素所包含的文本
    <a href="link3.html"><span class="boid">third item</span></a>
    third item
    获得的结果
  3. 获取HTML

    html = '''
    <div class="wrap">
        <div id="container">
            <ul class="list">
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
                <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                <li class="item-0"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    </div>
    '''
     
    from pyquery import PyQuery as pq
    doc = pq(html)
    a = doc(".item-0.active")
    
    print(a)
    print(a.html())
    <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
                
    <a href="link3.html"><span class="boid">third item</span></a>
    获得的结果
  • DOM操作

  1. addClass,removeClass

    html = '''
    <div class="wrap">
        <div id="container">
            <ul class="list">
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
                <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                <li class="item-0"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    </div>
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    
    li = doc(".item-0.active")
    print(li)
    
    li.removeClass("active")#移除active
    print(li)
    
    li.addClass("active")#增加active
    print(li)
    <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
                
    <li class="item-0"><a href="link3.html"><span class="boid">third item</span></a></li>
                
    <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
                
    获得的结果
  2. attr,css

    html = '''
    <div class="wrap">
        <div id="container">
            <ul class="list">
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
                <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                <li class="item-0"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    </div>
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
     
    li = doc(".item-0.active")
    print(li)
    
    li.attr("name","link")#若存在,就会覆盖
    print(li)
    
    li.css("font-size","14px")#增加style属性
    print(li)
    <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
                
    <li class="item-0 active" name="link"><a href="link3.html"><span class="boid">third item</span></a></li>
                
    <li class="item-0 active" name="link" style="font-size: 14px"><a href="link3.html"><span class="boid">third item</span></a></li>
    获得的结果
  3. remove

    html1 = '''
    <div class="wrap">
        Hello,World
        <p>This is a paragraph.</p>
    </div>
    '''
    from pyquery import PyQuery as pq
    doc = pq(html1)
     
    wrap = doc(".wrap")
    print(wrap.text())
     
    wrap.find('p').remove()
     
    print(wrap.text())
    Hello,World
    This is a paragraph.
    Hello,World
    获得的结果
  4. 其他DOM操作

    1. 其他DOM方法: http://pythonhosted.org/pyquery/
  • 伪类选择器

    html = '''
    <div class="wrap">
        <div id="container">
            <ul class="list">
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
                <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                <li class="item-0"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    </div>
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
     
    li = doc("li:first-child")#第一个
    print(li)            
     
    li1 = doc('li:last-child')#最后一个
    print(li1)        
     
    li2 = doc('li:nth-child(2)')#指定位置,第二个
    print(li2)
     
    li3 = doc("li:gt(2)")#大于2的(从0开始)
    print(li3)
     
    li4 = doc("li:nth-child(2n)")#偶数
    print(li4)
     
    li5 = doc("li:contains(second)")#内容包含second
    print(li5)
    <li class="item-0">first item</li>
                
    <li class="item-0"><a href="link5.html">fifth item</a></li>
            
    <li class="item-1"><a href="link2.html">second item</a></li>
                
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                <li class="item-0"><a href="link5.html">fifth item</a></li>
            
    <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                
    <li class="item-1"><a href="link2.html">second item</a></li>
                
    获得的结果
  • 官方文档

 

posted @ 2018-08-09 10:05  达尔文在思考  阅读(920)  评论(0)  编辑  收藏  举报