pyquery解析库的介绍和使用

 

 

### pyquery的介绍和使用

## Sample HTML used as the input of the examples below (each one calls pq(text)).
## The <body> and <html> tags are not closed in this sample; the output
## listings further down show closing tags in the parsed result.
text = '''
<html><head><title>there is money</title></head>
<body>
<p class="title" name="dmr"><b>there is money</b>contents</p>
<p class="money">good good study, day day up
<a href="https://www.baidu.com/1" class="error" id="l1"><span><!-- 1 --></span></a>,
<a href="https://www.baidu.com/2" class="error ed2" id="l2"><span>2</span></a> and 
<a href="https://www.baidu.com/3" class="error" id="l3">3</a>;
66666666666
</p>
<p class='body'>...</p>
'''

 

1. pyquery对象初始化,html字符串,url,file皆可

## pyquery对象初始化,html字符串,url,file皆可
from pyquery import PyQuery as pq
import requests

# Initialise from an HTML string.
doc = pq(text)
anchors = doc('a')
print(anchors)
# Initialise from a URL: download the page with requests, then parse its body.
response = requests.get('https://www.baidu.com')
doc = pq(response.text)
print(doc('title'))
# Initialise from a file. Per the author's note the file is read as GBK, so an
# unrecognised character raises an error; opening the file yourself with
# encoding utf-8 works around that.
# doc = pq(filename='text')
# print(doc('li'))

 

2. 基本CSS选择器

## 基本CSS选择器
from pyquery import PyQuery as pq

# Parse the sample HTML and show the type of the resulting object.
doc = pq(text)
print(type(doc))
# Descendant selectors: every <a> under .money, then only the node with id "l1".
for selector in ('.money a', '.money #l1'):
    print(doc(selector))
'''
输出内容:
<class 'pyquery.pyquery.PyQuery'>
<a href="https://www.baidu.com/1" class="error" id="l1"><span><!-- 1 --></span></a>,
<a href="https://www.baidu.com/2" class="error ed2" id="l2"><span>2</span></a> and 
<a href="https://www.baidu.com/3" class="error" id="l3">3</a>;
66666666666

<a href="https://www.baidu.com/1" class="error" id="l1"><span><!-- 1 --></span></a>,
'''
输出内容

 

3. 查找节点,返回一个PyQuery对象,当匹配到多个节点时,PyQuery对象值为多个节点的字符串整合

## 查找节点,返回一个PyQuery对象,当匹配到多个节点时,PyQuery对象值为多个节点的字符串整合
from pyquery import PyQuery as pq

# Divider printed between the individual traversal results.
SEPARATOR = '--------------------分隔符------------------'

doc = pq(text)
items = doc('p')
print(items)
print(type(items))
# Child nodes, narrowed by a selector.
matching_children = items.children('#l2')
print(matching_children)
print(SEPARATOR)
# Direct parent node.
direct_parent = items.parent()
print(direct_parent)
print(SEPARATOR)
# Ancestor nodes, narrowed by a selector.
html_ancestors = items.parents('html')
print(html_ancestors)
print(SEPARATOR)
# Sibling nodes of the #l2 node.
target = items('#l2')
print(target.siblings())
'''
输出内容:
<p class="title" name="dmr"><b>there is money</b>contents</p>
<p class="money">good good study, day day up
<a href="https://www.baidu.com/1" class="error" id="l1"><span><!-- 1 --></span></a>,
<a href="https://www.baidu.com/2" class="error ed2" id="l2"><span>2</span></a> and 
<a href="https://www.baidu.com/3" class="error" id="l3">3</a>;
66666666666
</p>
<p class="body">...</p>

<class 'pyquery.pyquery.PyQuery'>
<a href="https://www.baidu.com/2" class="error ed2" id="l2"><span>2</span></a> and 

--------------------分隔符------------------
<body>
<p class="title" name="dmr"><b>there is money</b>contents</p>
<p class="money">good good study, day day up
<a href="https://www.baidu.com/1" class="error" id="l1"><span><!-- 1 --></span></a>,
<a href="https://www.baidu.com/2" class="error ed2" id="l2"><span>2</span></a> and 
<a href="https://www.baidu.com/3" class="error" id="l3">3</a>;
66666666666
</p>
<p class="body">...</p>
</body>
--------------------分隔符------------------
<html><head><title>there is money</title></head>
<body>
<p class="title" name="dmr"><b>there is money</b>contents</p>
<p class="money">good good study, day day up
<a href="https://www.baidu.com/1" class="error" id="l1"><span><!-- 1 --></span></a>,
<a href="https://www.baidu.com/2" class="error ed2" id="l2"><span>2</span></a> and 
<a href="https://www.baidu.com/3" class="error" id="l3">3</a>;
66666666666
</p>
<p class="body">...</p>
</body></html>
--------------------分隔符------------------
<a href="https://www.baidu.com/1" class="error" id="l1"><span><!-- 1 --></span></a>,
<a href="https://www.baidu.com/3" class="error" id="l3">3</a>;
66666666666
'''
输出内容

 

4. 遍历,通过PyQuery对象的items方法可以把匹配到多个节点的PyQuery对象构造成一个生成器

## 遍历,通过PyQuery对象的items方法可以把匹配到多个节点的PyQuery对象构造成一个生成器
from pyquery import PyQuery as pq

# Parse the sample HTML directly, as the other sections do, so this snippet
# does not depend on a `doc` object left behind by an earlier snippet.
doc = pq(text)
print(doc('a'))
# .items() yields every matched node as its own PyQuery object.
items = doc('a').items()
print(type(items))
for i, item in enumerate(items):
    print(i, item, type(item))
'''
输出内容:
<a href="https://www.baidu.com/1" class="error" id="l1"><span><!-- 1 --></span></a>,
<a href="https://www.baidu.com/2" class="error ed2" id="l2"><span>2</span></a> and 
<a href="https://www.baidu.com/3" class="error" id="l3">3</a>;
66666666666

<class 'generator'>
0 <a href="https://www.baidu.com/1" class="error" id="l1"><span><!-- 1 --></span></a>,
 <class 'pyquery.pyquery.PyQuery'>
1 <a href="https://www.baidu.com/2" class="error ed2" id="l2"><span>2</span></a> and 
 <class 'pyquery.pyquery.PyQuery'>
2 <a href="https://www.baidu.com/3" class="error" id="l3">3</a>;
66666666666
 <class 'pyquery.pyquery.PyQuery'>
'''
输出内容

 

5. 获取属性和文本

## 获取属性和文本
from pyquery import PyQuery as pq

# Parse the sample HTML directly, as the other sections do, so this snippet
# does not depend on a `doc` object left behind by an earlier snippet.
doc = pq(text)
# Read an attribute. When several nodes match, build a generator with
# .items() as in the iteration example and read the attribute of each node.
a = doc('.error.ed2')
print(a, type(a))
# Two equivalent spellings: call style and attribute style.
print(a.attr('href'))
print(a.attr.href)

'''
输出结果:
<a href="https://www.baidu.com/2" class="error ed2" id="l2"><span>2</span></a> and 
 <class 'pyquery.pyquery.PyQuery'>
https://www.baidu.com/2
https://www.baidu.com/2
'''

## 获取文本
from pyquery import PyQuery as pq

doc = pq(text)
anchors = doc('a.error')
print(anchors)
# The first <a> holds only an HTML comment, so it contributes no text.
print(anchors.text())  # text content of the matched nodes only
print(anchors.html())  # inner content with its tags
# Walk the matches one by one and show text and inner markup for each.
for index, anchor in enumerate(anchors.items()):
    node_type = type(anchor)
    print(index, node_type, anchor.text())
    print(index, node_type, anchor.html())

'''
输出内容:
<a href="https://www.baidu.com/1" class="error" id="l1"><span><!-- 1 --></span></a>,
<a href="https://www.baidu.com/2" class="error ed2" id="l2"><span>2</span></a> and 
<a href="https://www.baidu.com/3" class="error" id="l3">3</a>;
66666666666

 2 3
<span><!-- 1 --></span>
0 <class 'pyquery.pyquery.PyQuery'> 
0 <class 'pyquery.pyquery.PyQuery'> <span><!-- 1 --></span>
1 <class 'pyquery.pyquery.PyQuery'> 2
1 <class 'pyquery.pyquery.PyQuery'> <span>2</span>
2 <class 'pyquery.pyquery.PyQuery'> 3
2 <class 'pyquery.pyquery.PyQuery'> 3
'''

 

6. 节点操作

## 节点操作
from pyquery import PyQuery as pq

doc = pq(text)
p = doc('.title')

# addClass / removeClass: add a value to, or drop a value from, the class attribute.
p.add_class('admin')
print(p.attr.class_)
p.removeClass('title')
print(p.attr('class'))

# attr / text / html: read, then overwrite, an attribute, the text and the inner HTML.
print(p.attr.name)
print(p.text())
print(p.html())
p.attr('name', 'test')
print(p.attr.name)
p.text('change text')
print(p.text())
p.html('<span>change html</span>')
print(p.html())

# remove: delete matching child nodes. Re-parse first so this part starts
# from the unmodified sample rather than the node edited above.
doc = pq(text)
p = doc('.title')
print(p.html())
p.remove('b')
print(p.html())
'''
输出内容:
title admin
admin
dmr
there is money
<b>there is money</b>
test
change text
<span>change html</span>
<b>there is money</b>contents
 contents
'''
输出内容

 

7. 伪类选择器

## 伪类选择器
from pyquery import PyQuery as pq

doc = pq(text)
# Pseudo-class selectors, printed in this order.
pseudo_selectors = (
    'a:first-child',     # the first <a> node
    'a:last-child',      # the last <a> node
    'a:nth-child(2)',    # the second <a> node
    'a:gt(0)',           # nodes after index 0
    'a:nth-child(2n)',   # nodes at even positions
    'a:contains("3")',   # nodes whose text contains "3"
)
for selector in pseudo_selectors:
    a = doc(selector)
    print(a)

'''
输出内容:
<a href="https://www.baidu.com/1" class="error" id="l1"><span><!-- 1 --></span></a>,

<a href="https://www.baidu.com/3" class="error" id="l3">3</a>;
66666666666

<a href="https://www.baidu.com/2" class="error ed2" id="l2"><span>2</span></a> and 

<a href="https://www.baidu.com/2" class="error ed2" id="l2"><span>2</span></a> and 
<a href="https://www.baidu.com/3" class="error" id="l3">3</a>;
66666666666

<a href="https://www.baidu.com/2" class="error ed2" id="l2"><span>2</span></a> and 

<a href="https://www.baidu.com/3" class="error" id="l3">3</a>;
66666666666
'''

 

posted @ 2020-03-16 22:09  糕事情  阅读(631)  评论(0)  编辑  收藏  举报