xpath解析
from lxml import etree

# Parse a local HTML file into an element tree.
# NOTE: etree.parse() uses the XML parser by default, so 'bendi.html' must be
# well-formed (every tag closed); otherwise pass parser=etree.HTMLParser().
tree = etree.parse('bendi.html')
print(tree)

# In XPath, "/" selects direct children and "//" selects descendants at any depth.
li = tree.xpath('//body/ul/li')
print(li)
print(len(li))
# Select the text of every <li> that carries an id attribute.
liid = tree.xpath('//body/ul/li[@id]/text()')
for text in liid:
    print(text)
# Select the text of the <li> whose id attribute equals "bj".
libj = tree.xpath('//body/ul/li[@id="bj"]/text()')
print(libj)
获取属性:
属性查询
//@class
获取百度首页“百度一下”按钮的 value 属性:
# Fetch the Baidu home page and read the value attribute of the search button.
import urllib.request

url = 'http://www.baidu.com'

# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(url) as response:
    content = response.read().decode('utf-8')

# etree.HTML() parses a string with the lenient HTML parser, so real-world
# markup that is not well-formed XML is still accepted.
tree1 = etree.HTML(content)

# xpath() returns a list of matching attribute values; the button is
# <input id="su">. The list is empty if the page has no such element,
# in which case val[0] raises IndexError.
val = tree1.xpath('//input[@id="su"]//@value')
print(val[0])