12 lxml&XPath结合使用(提取数据详解)
实现:
# 1、获取所有tr标签
# 2、获取第2个tr标签
# 3、获取所有class等于even的tr标签
# 4、获取所有a标签及其属性值
# 5、获取所有的职位信息(纯文本)
"""Using lxml together with XPath."""


from lxml import etree

# Parse the local HTML file with a parser that decodes the input as UTF-8.
parser = etree.HTMLParser(encoding='utf-8')
html = etree.parse('test.html', parser=parser)

# 1. Fetch every <tr> element in the document.
# xpath() hands back a list, so the result can be iterated directly.
trs = html.xpath("//tr")
for element in trs:
    # Printing an element shows the object's repr, not its markup.
    print(element)
# 2. Fetch the second <tr> element.
# "//tr[2]" matches each <tr> that is the second <tr> child of its parent;
# [0] keeps the first such match in document order.
tr = html.xpath("//tr[2]")[0]
# tostring() is asked for UTF-8 bytes, so they must be decoded as UTF-8;
# decoding them as GBK garbles (or fails on) any non-ASCII text.
print(etree.tostring(tr, encoding='utf-8').decode('utf-8'))
# 3. Fetch every <tr> element whose class attribute equals "even".
trs = html.xpath("//tr[@class='even']")
for tr in trs:
    # tostring() is asked for UTF-8 bytes, so they must be decoded as UTF-8;
    # decoding them as GBK garbles (or fails on) any non-ASCII text.
    print(etree.tostring(tr, encoding='utf-8').decode('utf-8'))
# 4_1. Print the value of the href attribute of every <a> element.
# Selecting "@href" yields the attribute values themselves, not elements.
ah = html.xpath("//a/@href")
for a in ah:
    print(a)

# 4_2. Print every <a> element that carries an href attribute.
# The "[@href]" predicate filters elements, so each one is serialized.
al = html.xpath("//a[@href]")
for a in al:
    # tostring() is asked for UTF-8 bytes, so they must be decoded as UTF-8;
    # decoding them as GBK garbles (or fails on) any non-ASCII text.
    print(etree.tostring(a, encoding='utf-8').decode('utf-8'))
# 5. Collect every job posting as plain text.

# XPath expression for each stored field, listed in extraction order.
# Each path starts with "." so it is evaluated relative to the current row
# instead of the whole document.
FIELD_PATHS = {
    'href': ".//a/@href",          # link of the <a> nested anywhere in the row
    'title': "./td[1]//text()",    # text is not a direct child of the cell
    'category': "./td[2]/text()",
    'num': "./td[3]/text()",
    'address': "./td[4]/text()",
}

# Start from the second <tr>; keep the first match of every
# field and store each row as a dictionary.
positions = [
    {field: row.xpath(path)[0] for field, path in FIELD_PATHS.items()}
    for row in html.xpath("//tr[position()>1]")
]

# Output
for pos in positions:
    print(pos)
# NOTE(review): rebinds `parser` to an HTMLParser that decodes input as GBK.
# Nothing visible after this line uses it; presumably this is the variant to
# pass to etree.parse() when test.html is stored as GBK rather than UTF-8 —
# confirm against the actual file encoding.
parser = etree.HTMLParser(encoding='GBK')