xpath

from lxml import etree

# Build the lxml tree object that the XPath queries below run against.
# Local document: parse the file from disk.
# NOTE(review): no parser argument is passed, so lxml's default parser is
# used — presumably html_prac.html has to be well-formed markup; confirm.
tree = etree.parse('html_prac.html')
# Document fetched from the network: parse the markup string instead.
# html = etree.HTML(html)

# XPath examples.
#
# Each helper takes the parsed document (for instance the module-level
# ``tree``) as ``doc`` and returns the query result instead of printing it.
# None of them is called automatically; to try one, print its return value,
# e.g. ``print(example_attribute_predicate(tree))``.  Because the helpers never
# rebind the document, any number of them can be used in the same run.


def example_child_and_descendant_steps(doc):
    """Locate <div> elements with child (/) and descendant (//) steps.

    Returns:
        A 3-tuple of result lists, in this order:
        the ``/html/body/div`` query (every step is a direct child, starting
        at the root), the ``/html//div`` query (``//`` skips the levels in
        between), and the ``//div`` query (match anywhere in the document).
    """
    from_root = doc.xpath('/html/body/div')
    skipping_levels = doc.xpath('/html//div')
    anywhere = doc.xpath('//div')
    return from_root, skipping_levels, anywhere


def example_attribute_predicate(doc):
    """Locate by attribute value: <div class="song">."""
    return doc.xpath('//div[@class="song"]')


def example_child_tag(doc):
    """Locate by tag: the <p> children of <div class="song">."""
    return doc.xpath('//div[@class="song"]/p')


def example_positional_index(doc):
    """Locate the first <p> child of <div class="song">.

    XPath positions start at 1, not 0.  The original note says this is the
    paragraph for Li Qingzhao in the sample page.
    """
    return doc.xpath('//div[@class="song"]/p[1]')


def example_text_nodes(doc):
    """Read the text of the first <p> child of <div class="song">.

    Returns:
        A 2-tuple ``(texts, first)``: the full result list of the ``text()``
        query and its first item, i.e. the text itself rather than a list.

    Raises:
        IndexError: if the query matches nothing.
    """
    # Evaluate the query once and reuse the result for both values.
    texts = doc.xpath('//div[@class="song"]/p[1]/text()')
    return texts, texts[0]


def example_all_li_text(doc):
    """Collect every text node found anywhere below an <li> element.

    Returns the whole result list; index or slice it (e.g. ``[:4]``) to pick
    individual items.
    """
    return doc.xpath('//li//text()')


def example_attribute_values(doc):
    """Read attribute values instead of elements.

    The original note gives these sample elements:
    ``<li><a href="http://www.haha.com" id="feng">`` and
    ``<img src="http://www.baidu.com/meinv.jpg" alt="" />``.

    Returns:
        A 2-tuple: the last ``href`` found under any <a>, and the first
        ``src`` found under any <img>.

    Raises:
        IndexError: if either query matches nothing.
    """
    last_href = doc.xpath('//a//@href')[-1]
    first_src = doc.xpath('//img//@src')[0]
    return last_href, first_src

# Print the third text node found below any <a> element, with surrounding
# whitespace stripped.  The original note labels the expected output as the
# sentence "The Song dynasty is the most powerful dynasty" (Chinese text,
# presumably taken from html_prac.html — confirm against the sample page).
#
# The result gets its own name so that `tree` keeps referring to the parsed
# document and can still be queried afterwards.
# Indexing with [2] raises IndexError if fewer than three text nodes match.
link_texts = tree.xpath('//a//text()')
third_link_text = str(link_texts[2]).strip()
print(third_link_text)

posted @ 2021-06-16 11:14  布都御魂  阅读(61)  评论(0编辑  收藏  举报