# 爬虫08-xpath语法练习 (Web scraping 08: XPath syntax practice)

from  lxml import  etree
# One HTML parser, configured to decode its input as UTF-8, shared by both parses below.
parser=etree.HTMLParser(encoding="utf-8")
# Parse the two local sample pages into element trees.
html=etree.parse("test.html",parser=parser)
html2=etree.parse("lagou.html",parser=parser)
# html.xpath() returns a list; most of the time it holds just a single element.

# 1.提取所有tr标签
# trs=html.xpath("//tr")
# for tr in trs:
#     print(etree.tostring(tr,encoding="utf-8").decode("utf-8"))

#2. Extract the first tr tag (XPath indices start at 1; use //tr[2] for the second)
# tr=html.xpath("//tr[1]")[0]
# print(etree.tostring(tr,encoding="utf-8").decode("utf-8"))

#3. Extract the table tag whose border attribute is '2px'
# border=html.xpath("//table[@border='2px']")[0]
# print(etree.tostring(border,encoding="utf-8").decode("utf-8"))

#4.获取a标签下href的值
# aList=html2.xpath("//a/@href")
# for a in aList:
#     print(a)

#5.获取所有的标签内信息
# Every <tr> except the first one in its parent (position()>1 skips that row).
trs = html.xpath("//tr[position()>1]")

# Output key -> XPath evaluated relative to the current <tr>.
# A leading "./" or ".//" searches from the current node, whereas a bare "//"
# would search the whole document again.
# The "herf" spelling is kept as-is because the key appears in the printed output.
# (To grab the text of all cells in one query, "./td//text()" would do it.)
field_queries = {
    "herf": ".//a/@href",
    "text": "./td[1]//text()",
    "language": "./td[2]//text()",
    "price": "./td[3]//text()",
    "num": "./td[4]//text()",
    "name": "./td[5]//text()",
}

positions = []
for row in trs:
    record = {key: row.xpath(query) for key, query in field_queries.items()}
    # Each query yields a list; drop rows for which every list came back empty.
    if any(record.values()):
        positions.append(record)

print(positions)

  

# posted @ 2020-03-11 19:42  胡辣汤王子  阅读(292)  评论(0)  编辑  收藏  举报