python3爬虫 xpath实操备忘

#!/usr/bin/env python3
# -*- coding:utf-8 -*-
from lxml import etree
import requests
import logging
# Redirect warnings issued through the `warnings` module into the logging
# system instead of having them printed directly.
# NOTE(review): presumably here to quiet the insecure-request warning caused by
# verify=False in get_html() -- confirm.
logging.captureWarnings(True)

def get_html(url):
    """Download *url* with an HTTP GET and return the response body as text.

    The request headers are hard-coded for www.zhihu.com (see ``Host`` and
    ``Referer``), so this helper is only suitable for that site.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response body (``Response.text``).
    """
    # Headers tailored to Zhihu.
    # NOTE(review): the original note says these include user information;
    # no cookie or token is visible here -- confirm.
    zhihu_headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
        "Referer": "https://www.zhihu.com/signup?next=%2F",
        "Host": "www.zhihu.com",
        "Upgrade-Insecure-Requests": "1",
    }
    # verify=False turns off TLS certificate verification; timeout=10 is passed
    # so the request does not wait indefinitely.
    response = requests.get(
        url,
        headers=zhihu_headers,
        verify=False,
        timeout=10,
    )
    return response.text

def parse_html(text, url):
    """Run a catalogue of XPath example queries against an HTML document.

    This is a cheat sheet of common XPath patterns evaluated with lxml.

    Args:
        text: HTML source code to parse.
        url: Address the HTML came from. Not used by the active code; it is
            only relevant to the alternative ``etree.parse`` approach noted
            below and is kept so existing callers keep working.

    Returns:
        A dict mapping ``"result1"`` ... ``"result19"`` to whatever
        ``html.xpath()`` returned for the corresponding query.
    """
    # Alternative: parse straight from a file path instead of source text:
    #     html = etree.parse(url, etree.HTMLParser())
    #     print(etree.tostring(html).decode("utf-8"))

    # Parse the source text; lxml repairs malformed HTML automatically.
    html = etree.HTML(text)
    # etree.tostring(html) gives the repaired markup as bytes; call
    # .decode("utf-8") on it to obtain a str for printing.

    queries = {
        # Every element in the document.
        "result1": "//*",
        # Every div element (items look like <Element div at 0x...>).
        "result2": "//div",
        # a elements that are direct children of a div.
        "result3": "//div/a",
        # a elements that are descendants (any depth) of an h2.
        "result4": "//h2//a",
        # itemprop attribute values of the parents of a elements.
        "result5": "//a/../@itemprop",
        # Same as above, reaching the parent through the parent axis.
        "result6": "//a/parent::*/@itemprop",
        # Attribute match: a elements with the given attribute value.
        "result7": '//a[@data-za-detail-view-element_name="Title"]',
        # /text() selects text directly under the node; //text() would
        # select the text of all of its descendants as well.
        "result8": '//a[@data-za-detail-view-element_name="Title"]/text()',
        # href attribute values of all a elements.
        "result9": "//a/@href",
        # Multi-valued attribute match: contains(a, b) is true when string a
        # contains string b.
        "result10": "//div[contains(@class,'ContentItem')]//a/text()",
        # Several attribute conditions: both tests must sit INSIDE the
        # predicate brackets, otherwise the expression is a top-level boolean
        # and xpath() returns True/False rather than elements.
        "result11": '//div[contains(@class,"ContentItem") and @name="532363164"]',
        # Positional selection (indices start at 1): [1] first, last() last,
        # position()<3 the first two, last()-2 the third from the end.
        "result12": "//li[1]/li[last()]/li[position()<3]/li[last()-2]",
        # ancestor axis: all ancestors.
        "result13": "//li[1]/ancestor::*",
        # ancestor axis restricted to div ancestors.
        "result14": "//li[1]/ancestor::div",
        # attribute axis: all attribute values.
        "result15": "//li[1]/attribute::*",
        # child axis: all direct children.
        "result16": "//li[1]/child::*",
        # descendant axis: all descendants.
        "result17": "//li[1]/descendant::*",
        # following axis: nodes after the current one; [2] picks the second.
        "result18": "//li[1]/following::*[2]",
        # following-sibling axis: all later siblings.
        "result19": "//li[1]/following-sibling::*",
    }

    # Evaluate every query in order and hand the results back to the caller
    # instead of discarding them.
    return {name: html.xpath(expr) for name, expr in queries.items()}

if __name__ == "__main__":
    # Script entry point: download the Zhihu front page and run the XPath
    # demonstrations against its source.
    url = "https://www.zhihu.com/"
    parse_html(get_html(url), url)

posted @ 2019-01-18 11:57 挖坑达人 阅读(3) 评论(0) 编辑 收藏 举报

会员力量，点亮园子希望

刷新页面返回顶部

挖坑达人

python3爬虫 xpath实操备忘

公告