python 基础 6 解析之xpath

解析
xpath使用
安装lxml库
pip install lxml -i https://pypi.douban.com/simple
pip install lxml -i https://mirrors.aliyun.com/pypi/simple
导入lxml etree
from lxml import etree
etree.parse() 解析本地文件
html_tree = etree.parse('XX.html')
etree.parse() 解析本地文件
html_tree = etree.parse('XX.html')
etree.HTML() 服务器响应文件
html_tree = etree.HTML(response.read().decode('utf-8'))
html_tree.xpath(xpath路径)

xpath基本语法
 1、路径查询
    //:查找所有子孙节点,不考虑层级关系
    /:查找子节点
 2、谓词查询
    //div[@id]
 3、属性查询
    //@class
 4、模糊查询
    //div[contains(@id,"he")]
    //div[starts-with(@id,"he")]
 5、内容查询
    //div/h1/text()
 6、逻辑查询
    //div[@id="head" and @class="s_down"]
tree = etree.HTML(context)
result = tree.xpath("//input[@id='su']/@value")[0]
print(result)

抓取素材图片案例


# https://sc.chinaz.com/tupian/shuaigetupian.html

import urllib.request
import lxml.etree



# HTTP request headers; the User-Agent is a desktop Chrome 97 string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/97.0.4692.71 Safari/537.36"
    ),
}


def create_request(page):
    """Build the request for one page of the picture listing.

    Page 1 is ``shuaigetupian.html``; any other page number is appended
    as ``shuaigetupian_<page>.html``. The chosen URL is printed, then a
    ``urllib.request.Request`` carrying the module-level ``headers`` is
    returned.
    """
    suffix = "" if page == 1 else "_" + str(page)
    url = "https://sc.chinaz.com/tupian/shuaigetupian" + suffix + ".html"

    print(url)
    return urllib.request.Request(url=url, headers=headers)


def get_context(requestObj):
    """Open *requestObj* and return the response body decoded as UTF-8.

    Args:
        requestObj: A ``urllib.request.Request`` (or URL string) accepted
            by ``urllib.request.urlopen``.

    Returns:
        The full response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the response even if read/decode raises,
    # so the underlying connection is not leaked.
    with urllib.request.urlopen(requestObj) as response:
        return response.read().decode('utf-8')


def down_load(context):
    """Download every picture referenced in a listing page.

    The page is parsed with lxml; image names come from the ``alt``
    attribute and image addresses from the ``src2`` attribute of the
    ``<img>`` elements inside ``<div id="container">``. Each picture is
    saved as ``./img/<alt>.jpg``.

    Args:
        context: HTML source of the listing page.
    """
    import os  # local import: only needed to prepare the output directory

    tree = lxml.etree.HTML(context)
    name_list = tree.xpath("//div[@id='container']//a/img/@alt")
    pic_url_list = tree.xpath("//div[@id='container']//a/img/@src2")

    # Pair names with URLs; zip stops at the shorter list, so a page with
    # mismatched attribute counts no longer raises IndexError.
    pictures = list(zip(name_list, pic_url_list))
    if not pictures:
        return

    # urlretrieve does not create missing directories.
    os.makedirs("./img", exist_ok=True)
    for name, pic_url in pictures:
        # The extracted address has no scheme, so "https:" is prepended.
        urllib.request.urlretrieve(url="https:" + pic_url, filename="./img/" + name + '.jpg')


if __name__ == '__main__':
    # Read the inclusive page range from the user.
    start_page = int(input("请输入起始页"))
    end_page = int(input("请输入结束页"))

    # For each page: build the request, fetch the HTML, download its pictures.
    for page in range(start_page, end_page + 1):
        down_load(get_context(create_request(page)))

import  urllib.request
import  urllib.parse
from lxml import etree
def create_request(page):
    """Return a ``urllib.request.Request`` for one page of the listing.

    The first page is ``<base>.html``; every other page number is
    addressed as ``<base>_<page>.html``.
    """
    base_url = 'https://sc.chinaz.com/tupian/meinvtupian'
    page_suffix = '' if page == 1 else '_' + str(page)

    # Build the request object for the selected page.
    return urllib.request.Request(base_url + page_suffix + '.html')
def get_response(request):
    """Fetch a listing page and extract picture names and addresses.

    Args:
        request: A ``urllib.request.Request`` (or URL string) accepted by
            ``urllib.request.urlopen``.

    Returns:
        A ``(result_name, result_url)`` tuple of two lists: the ``alt``
        and ``data-original`` attribute values of every ``<img>`` that is
        a direct child of a ``<div class="item">``.
    """
    # The context manager closes the response even if read/decode raises.
    with urllib.request.urlopen(request) as response:
        context = response.read().decode('utf-8')

    # Parse the HTML and pull out the image attributes.
    tree = etree.HTML(context)
    result_url = tree.xpath('//div[@class="item"]/img/@data-original')
    result_name = tree.xpath('//div[@class="item"]/img/@alt')
    print(result_url, result_name)
    return result_name, result_url
def downlaod(result_name, result_url):
    """Download each picture to ``./pic/<name>.jpg``.

    Args:
        result_name: Picture names, used as file names.
        result_url: Scheme-less picture addresses, in the same order as
            *result_name*.

    Names and addresses are paired positionally; entries without a
    counterpart in the other list are skipped. Nothing is created or
    downloaded when there are no pairs.
    """
    import os  # local import: only needed to prepare the output directory

    # zip stops at the shorter list, so mismatched lengths no longer
    # raise IndexError part-way through.
    pictures = list(zip(result_name, result_url))
    if not pictures:
        return

    # urlretrieve does not create missing directories.
    os.makedirs("./pic", exist_ok=True)
    for name, url in pictures:
        # The extracted address has no scheme, so "https:" is prepended.
        urllib.request.urlretrieve(url="https:" + url, filename="./pic/" + name + '.jpg')


if __name__ == '__main__':
    # Read the inclusive page range from the user.
    start_page = int(input('请输入起始页'))
    end_page = int(input('请输入结束页'))

    # For each page: build the request, extract names/URLs, download them.
    for page in range(start_page, end_page + 1):
        downlaod(*get_response(create_request(page)))

posted @ 2024-07-11 20:11  donghongchao  阅读(2)  评论(0)  编辑  收藏  举报