XPath 的简单使用

XPath

XPath是一门在XML文档中查找信息的语言，它提供了非常简洁明了的路径选择表达式。

表达式	描述
nodename	选取此节点的所有子节点
/	从当前节点选取直接子节点
//	从当前节点选取子孙节点
.	选取当前节点
..	选取当前节点的父节点
@	选取属性

举例1：

//title[@lang='eng']    它代表选择所有名称为title，同时属性lang的值为eng的节点

举例2：处理HTML文本

from lxml import etree
 
html = etree.parse('./test.html', etree.HTMLParser())               # 直接对html文本进行解析
result = etree.tostring(html)
print(result.decode('utf-8'))
 
 
# 输出：
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><body><div>
<ul>
<li class="item-O"><a href="linkl.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</li></ul>
</div>
</body></html>

举例3：获取所有节点

用 // 开头的XPath规则来选取所有符合要求的节点

from lxml import etree
html = etree.parse('./test.html', etree.HTMLParser())
result = html.xpath('//*')
print(result)
 
# 输出：
[<Element html at 0x112829bc8>, <Element body at 0x112829d08>, <Element div at 0x112829d48>, <Element ul at 0x112829d88>, <Element li at 0x112829dc8>, <Element a at 0x112829e48>, <Element li at 0x112829e88>, <Element a at 0x112829ec8>, <Element li at 0x112829f08>, <Element a at 0x112829e08>, <Element li at 0x112fc21c8>, <Element a at 0x112fc2208>, <Element li at 0x112fc2248>, <Element a at 0x112fc2288>]

XPath 获取父结点

from lxml import etree
html = etree.parse('./test.html', etree.HTMLParser())
result = html.xpath('//a[@href="link4.html"]/../@class')
# result = html.xpath('//a[@href="link4.html"]/parent::*/@class')
print(result)
 
# 输出：
['item-1']

XPath 属性匹配

from lxml import etree
html = etree.parse('./test.html', etree.HTMLParser())
result = html.xpath('//li[@class="item-0"]')
print(result)
 
# 输出：
[<Element li at 0x115357d08>]

XPath 文本获取

方式一

from lxml import etree
html = etree.parse('./test.html', etree.HTMLParser())
result = html.xpath('//li[@class="item-0"]/a/text()')
print(result)
 
# 输出：
['fifth item']

方式二

from lxml import etree
html = etree.parse('./test.html', etree.HTMLParser())
result = html.xpath('//li[@class="item-0"]//text()')
print(result)
 
# 输出：
['fifth item', '\n']

XPath 属性获取

from lxml import etree
html = etree.parse('./test.html', etree.HTMLParser())
# 用@可以获取属性
result = html.xpath('//li/a/@href')
print(result)
 
# 输出：
['linkl.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html']

XPath 属性多值匹配

有些属性可能有多个值，那么要匹配这些属性，则需要用到contains()函数

from lxml import etree
 
# 这里的HTML文本中的li节点的class属性有两个值li和li-first
text = '''
<li class="li li-first"><a href="link.html">first item</a></li>
'''
html = etree.HTML(text)
# 获取text中所有class属性包含li的li节点下a节点的文本
result = html.xpath('//li[contains(@class, "li")]/a/text()')
print(result)
 
# 输出：
['first item']

XPath 多属性值匹配

from lxml import etree
text = '''
<li class="li li-first" name="item"><a href="link.html">first item</a></li>
'''
html = etree.HTML(text)
result = html.xpath('//li[contains(@class, "li") and @name="item"]/a/text()')
print(result)
 
# 输出：
['first item']

XPath 顺序选择

from lxml import etree
text = '''
<div>
	<ul>
		<li class="item-O"><a href="linkl.html">first item</a></li>
		<li class="item-1"><a href="link2.html">second item</a></li>
		<li class="item-inactive"><a href="link3.html">third item</a></li>
		<li class="item-1"><a href="link4.html">fourth item</a></li>
		<li class="item-0"><a href="link5.html">fifth item</a>
	</ul>
</div>
'''
html = etree.HTML(text)
result1 = html.xpath('//li[1]/a/text()')                    # 选取第一个li节点
result2 = html.xpath('//li[last()]/a/text()')               # 选取最后一个li节点
result3 = html.xpath('//li[position()<3]/a/text()')         # 选取位置小于3的li节点
result4 = html.xpath('//li[last()-2]/a/text()')             # 选取倒数第3个li节点
 
print(result1, result2, result3, result4, sep='\n')
 
 
# 输出：
['first item']
['fifth item']
['first item', 'second item']
['third item']

XPath 结点轴选择

结点轴有：ancestor轴、attribute轴、child轴、descendant轴、following轴、following-sibling轴等

from lxml import etree
 
text = '''
<div>
<ul>
<li class="item-O"><a href="linkl.html"><span>first item</span></a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
html = etree.HTML(text)
result1 = html.xpath('//li[1]/ancestor::*')                 # 获取第1个li节点的所有祖先节点
result2 = html.xpath('//li[1]/ancestor::div')               # 获取第1个li节点的div祖先节点
result3 = html.xpath('//li[1]/attribute::*')                # 获取第1个li节点的所有属性值
result4 = html.xpath('//li[1]/child::a[@href="link.html"]')             # 获取所有（href属性值为link.html的a节点）直接子节点
result5 = html.xpath('//li[1]/descendant::span')            # 获取所有子孙节点（获取span节点）
result6 = html.xpath('//li[1]/following::*[2]')             # 获取当前节点之后的第2个节点
result7 = html.xpath('//li[1]/following-sibling::*')          # 获取当前节点之后的所有同级节点
 
print(result1, result2, result3, result4, result5, result6, result7, sep='\n')
 
 
# 输出：
[<Element html at 0x102e9f088>, <Element body at 0x10350fe08>, <Element div at 0x10350fd88>, <Element ul at 0x10350fd08>]
[<Element div at 0x10350fd88>]
['item-O']
[]
[<Element span at 0x10350fec8>]
[<Element a at 0x10350fe88>]
[<Element li at 0x10350ff48>, <Element li at 0x10350ff88>, <Element li at 0x10350ffc8>, <Element li at 0x111ba0048>]

运算符及其介绍

运算符	描述	实例	返回值
or	或	price=9.80 or price=9.70	如果 price 是 9.80，则返回 true。如果 price 是 9.50，则返回 false。
and	与	price>9.00 and price<9.90	如果 price 是 9.80，则返回 true。如果 price 是 8.50，则返回 false。
mod	计算除法的余数	5 mod 2	1
|	计算两个节点集	//book | //cd	返回所有拥有 book 和 cd 元素的节点集
+	加法	6 + 6	12
-	减法	6 - 6	0
*	乘法	6 * 6	36
div	除法	6 div 6	1
=	等于	price=9.80	如果 price 是 9.80，则返回 true。如果 price 是 9.90，则返回 false。
!=	不等于	price!=9.80	如果 price 是 9.90，则返回 true。如果 price 是 9.80，则返回 false。
<	小于	age<20	如果 age 小于 20，则返回 true。如果 age 不小于 20，则返回 false。
<=	小于等于	age<=20	如果 age 小于等于 20，则返回 true。如果 age 大于 20，则返回 false
>	大于	age>20	如果 age 大于 20，则返回 true。如果 age 不大于 20，则返回 false
>=	大于等于	age>=20	如果 age 大于等于 20，则返回 true。如果 age 小于 20，则返回 false

posted @ 2021-03-04 12:29 LeeHua 阅读(332) 评论(0) 编辑收藏举报

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

阅读排行：
· 震惊！C++程序真的从main开始吗？99%的程序员都答错了
· winform 绘制太阳，地球，月球运作规律
· 【硬核科普】Trae如何「偷看」你的代码？零基础破解AI编程运行原理
· 超详细：普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人
· 上周热点回顾（3.3-3.9）

Lee Hua's Blog

热爱编程 -- 写Bug

XPath 的简单使用

XPath

XPath 获取父结点

XPath 属性匹配

XPath 文本获取

XPath 属性获取

XPath 属性多值匹配

XPath 多属性值匹配

XPath 顺序选择

XPath 结点轴选择

运算符及其介绍

公告

我的标签

积分与排名

随笔分类 (240)

随笔档案 (226)

阅读排行榜

推荐排行榜

目录导航

	from lxml import etree

	html = etree.parse('./test.html', etree.HTMLParser()) # 直接对html文本进行解析
	result = etree.tostring(html)
	print(result.decode('utf-8'))


	# 输出：
	<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
	<html><body><div>
	<ul>
	<li class="item-O"><a href="linkl.html">first item</a></li>
	<li class="item-1"><a href="link2.html">second item</a></li>
	<li class="item-inactive"><a href="link3.html">third item</a></li>
	<li class="item-1"><a href="link4.html">fourth item</a></li>
	<li class="item-0"><a href="link5.html">fifth item</a>
	</li></ul>
	</div>
	</body></html>

	from lxml import etree
	html = etree.parse('./test.html', etree.HTMLParser())
	result = html.xpath('//*')
	print(result)

	# 输出：
	[<Element html at 0x112829bc8>, <Element body at 0x112829d08>, <Element div at 0x112829d48>, <Element ul at 0x112829d88>, <Element li at 0x112829dc8>, <Element a at 0x112829e48>, <Element li at 0x112829e88>, <Element a at 0x112829ec8>, <Element li at 0x112829f08>, <Element a at 0x112829e08>, <Element li at 0x112fc21c8>, <Element a at 0x112fc2208>, <Element li at 0x112fc2248>, <Element a at 0x112fc2288>]

	from lxml import etree
	html = etree.parse('./test.html', etree.HTMLParser())
	result = html.xpath('//a[@href="link4.html"]/../@class')
	# result = html.xpath('//a[@href="link4.html"]/parent::*/@class')
	print(result)

	# 输出：
	['item-1']

	from lxml import etree
	html = etree.parse('./test.html', etree.HTMLParser())
	result = html.xpath('//li[@class="item-0"]')
	print(result)

	# 输出：
	[<Element li at 0x115357d08>]

	from lxml import etree
	html = etree.parse('./test.html', etree.HTMLParser())
	# 用@可以获取属性
	result = html.xpath('//li/a/@href')
	print(result)

	# 输出：
	['linkl.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html']

	from lxml import etree

	# 这里的HTML文本中的li节点的class属性有两个值li和li-first
	text = '''
	<li class="li li-first"><a href="link.html">first item</a></li>
	'''
	html = etree.HTML(text)
	# 获取text中所有class属性包含li的li节点下a节点的文本
	result = html.xpath('//li[contains(@class, "li")]/a/text()')
	print(result)

	# 输出：
	['first item']

	from lxml import etree
	text = '''
	<li class="li li-first" name="item"><a href="link.html">first item</a></li>
	'''
	html = etree.HTML(text)
	result = html.xpath('//li[contains(@class, "li") and @name="item"]/a/text()')
	print(result)

	# 输出：
	['first item']

	from lxml import etree
	text = '''
	<div>
	<ul>
	<li class="item-O"><a href="linkl.html">first item</a></li>
	<li class="item-1"><a href="link2.html">second item</a></li>
	<li class="item-inactive"><a href="link3.html">third item</a></li>
	<li class="item-1"><a href="link4.html">fourth item</a></li>
	<li class="item-0"><a href="link5.html">fifth item</a>
	</ul>
	</div>
	'''
	html = etree.HTML(text)
	result1 = html.xpath('//li[1]/a/text()') # 选取第一个li节点
	result2 = html.xpath('//li[last()]/a/text()') # 选取最后一个li节点
	result3 = html.xpath('//li[position()<3]/a/text()') # 选取位置小于3的li节点
	result4 = html.xpath('//li[last()-2]/a/text()') # 选取倒数第3个li节点

	print(result1, result2, result3, result4, sep='\n')


	# 输出：
	['first item']
	['fifth item']
	['first item', 'second item']
	['third item']