Introduction
XPath is a language for finding information in XML documents. It can be used to traverse the elements and attributes of an XML document. XPath is a major element of the W3C XSLT standard, and both XQuery and XPointer are built on XPath expressions.
Installation
pip install lxml
The sample HTML code used throughout this article:
| <div> |
|     <div> |
|         <ul> |
|             <li class="item-0"> |
|                 <a href="link1.html">first item</a> |
|             </li> |
|             <li class="item-1"> |
|                 <a href="link2.html">second item</a> |
|             </li> |
|         </ul> |
|     </div> |
|     <div id="111"> |
|         <div class="item-1"> |
|             <a href="www.qq.com">qq.com</a> |
|             <p>this is p label</p> |
|             <ul> |
|                 <li class="item-2"> |
|                     <a href="link1.html">first item1</a> |
|                 </li> |
|                 <li class="item-3"> |
|                     <a href="link2.html">second item2</a> |
|                 </li> |
|             </ul> |
|         </div> |
|         <a href="www.baidu.com">baidu.com</a> |
|     </div> |
| </div> |
Usage
Instantiating etree
| from lxml import etree |
| |
| # Parse from a local file (etree.parse expects well-formed XML by default; |
| # pass parser=etree.HTMLParser() when the file is ordinary HTML) |
| html = etree.parse("file_path") |
| |
| # Parse from a string, e.g. the body of a requests response |
| html = etree.HTML(resp.text) |
| # Serialize the tree back to bytes |
| result = etree.tostring(html) |
XPath expressions
Locating
Locating by hierarchy
- / : selects from the root node
- // : matches any number of levels, so nodes can be selected from anywhere in the document
- ./ : selects relative to the current node (see the sketch after this list)
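A minimal sketch of the three path forms (variable names are just for illustration), assuming the sample HTML above has been parsed with etree.HTML. Note that etree.HTML wraps the fragment in <html><body>, so absolute paths start at /html:
| # assumes: html = etree.HTML(sample)  -- the sample HTML shown above |
| lis = html.xpath("/html/body/div/div[1]/ul/li")   # / : absolute path from the root |
| all_lis = html.xpath("//li")                      # // : <li> at any depth |
| first_a = lis[0].xpath("./a/text()")              # ./ : relative to the current node |
| print(len(lis), len(all_lis), first_a)            # expected: 2 4 ['first item'] |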
Locating by attribute
| text = html.xpath("/div[1]//li[@class='item-0']") |
Locating by id
| text = html.xpath("//div[@id='111']") |
Locating by index (XPath indices start at 1)
| text = html.xpath("/div/div[1]/ul/li[2]/a/text()") |
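For reference, a rough sketch of what the three locators above should return against the sample HTML, assuming the same etree.HTML-parsed document; element matches come back as a list of Element objects, text() matches as a list of strings:
| print(html.xpath("/html/body/div[1]//li[@class='item-0']"))    # [<Element li at 0x...>] |
| print(html.xpath("//div[@id='111']"))                          # [<Element div at 0x...>] |
| print(html.xpath("/html/body/div/div[1]/ul/li[2]/a/text()"))   # ['second item'] |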
Extracting values
Getting text
- /text() : the text directly under the node
- //text() : all text under the node, descendants included
| from lxml import etree |
| |
| wb_data = """ |
| <div> |
|     <div> |
|         <ul> |
|             <li class="item-0"> |
|                 <a href="link1.html">first item</a> |
|             </li> |
|             <li class="item-1"> |
|                 <a href="link2.html">second item</a> |
|             </li> |
|         </ul> |
|     </div> |
|     <div id="111"> |
|         <div class="item-1"> |
|             <a href="www.qq.com">qq.com</a> |
|             <p>this is p label</p> |
|             <ul> |
|                 <li class="item-2"> |
|                     <a href="link1.html">first item1</a> |
|                 </li> |
|                 <li class="item-3"> |
|                     <a href="link2.html">second item2</a> |
|                 </li> |
|             </ul> |
|         </div> |
|         <a href="www.baidu.com">baidu.com</a> |
|     </div> |
| </div> |
| """ |
| html = etree.HTML(wb_data) |
| |
| # //text(): every text node under div#111, descendants included |
| text = html.xpath("//div[@id='111']//text()") |
| print([i.strip() for i in text]) |
| |
| # /text(): only the text directly inside the matched <a> |
| print(html.xpath("//li[@class='item-0']/a/text()")[0]) |
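Run against the sample HTML, the first print should show the stripped text fragments found anywhere under div#111 (qq.com, this is p label, first item1, second item2, baidu.com, plus empty strings left over from whitespace-only nodes), while the second prints just first item.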
Getting attributes
- /@attribute_name : the value of that attribute on the directly matched node
- //@attribute_name : all values of that attribute under the node, descendants included
| from lxml import etree |
| |
| wb_data = """ |
| <div> |
|     <div> |
|         <ul> |
|             <li class="item-0"> |
|                 <a href="link1.html">first item</a> |
|             </li> |
|             <li class="item-1"> |
|                 <a href="link2.html">second item</a> |
|             </li> |
|         </ul> |
|     </div> |
|     <div id="111"> |
|         <div class="item-1"> |
|             <a href="www.qq.com">qq.com</a> |
|             <p>this is p label</p> |
|             <ul> |
|                 <li class="item-2"> |
|                     <a href="link1.html">first item1</a> |
|                 </li> |
|                 <li class="item-3"> |
|                     <a href="link2.html">second item2</a> |
|                 </li> |
|             </ul> |
|         </div> |
|         <a href="www.baidu.com">baidu.com</a> |
|     </div> |
| </div> |
| """ |
| html = etree.HTML(wb_data) |
| |
| # //@href: every href attribute under div.item-1, descendants included |
| print(html.xpath("//div[@class='item-1']//@href")) |
| |
| # /a/@href: the href on the direct <a> child of div#111 |
| print(html.xpath("//div[@id='111']/a/@href")[0]) |
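Against the sample HTML, the first print should give ['www.qq.com', 'link1.html', 'link2.html'] (every href under div.item-1) and the second www.baidu.com (the href on the direct <a> child of div#111).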
A complete example
First, a hand-rolled multi-threaded crawler module, named MyModule, is used to send the requests:
| """Multi-threaded crawler helpers.""" |
| import queue |
| import random |
| import socket |
| import threading |
| |
| import requests |
| |
| |
| class SpiderThread(threading.Thread): |
|     """Worker thread holding the request headers and a proxy IP (put your own proxies in ip.txt).""" |
| |
|     def __init__(self): |
|         super().__init__(daemon=True) |
|         self.queue = queue.Queue() |
|         self.start() |
| |
|         try: |
|             # Pick a random proxy IP from ip.txt if it exists |
|             with open("./ip.txt", "r") as file: |
|                 ipList = file.readlines() |
|             self.ip = random.choice(ipList).strip() |
|         except Exception as e: |
|             print(f"No proxy IP list available, falling back to the local IP address: {e}") |
|             s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) |
|             s.connect(('8.8.8.8', 80)) |
|             self.ip = s.getsockname()[0] + f":{random.randint(1, 8080)}" |
|             s.close() |
| |
|         self.headers = { |
|             'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36" |
|         } |
|         self.proxy = { |
|             "http": f"http://{self.ip}", |
|         } |
| |
|     def run(self) -> None: |
|         # Consume tasks from the queue forever (daemon thread) |
|         while True: |
|             func, args, kwargs = self.queue.get() |
|             func(*args, **kwargs) |
|             self.queue.task_done() |
| |
|     def submit_task(self, func, args=(), kwargs={}): |
|         self.queue.put((func, args, kwargs)) |
| |
|     def join(self): |
|         # Block until every submitted task has been processed |
|         self.queue.join() |
| |
| |
| # Note: the default headers/proxy are evaluated once, when the module is imported |
| def crawl(url, lis, cookies=None, headers=SpiderThread().headers, |
|           proxy=SpiderThread().proxy): |
|     """Send a GET request; the response is appended to the caller-supplied list lis.""" |
|     if not isinstance(cookies, dict): |
|         resp = requests.get(url=url, headers=headers, proxies=proxy) |
|     else: |
|         resp = requests.get(url=url, headers=headers, cookies=cookies) |
|     if resp.status_code == 200: |
|         print("Fetch finished, the response has been appended to the supplied list") |
|         lis.append(resp) |
|     else: |
|         # Retry the same URL on a fresh worker thread |
|         SpiderThread().submit_task(crawl, args=(url, lis)) |
| |
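The main script below imports MyModule and uses it to scrape the sales-job listings from nc.58.com, page by page: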
| from lxml import etree |
| from concurrent.futures import ThreadPoolExecutor |
| |
| import MyModule |
| |
| spider = MyModule.SpiderThread() |
| |
| """ |
| Analysing the URLs: the first page is url = 'https://nc.58.com/yewu/pu1/?key=%E9%94%80%E5%94%AE'; |
| the second page is url = 'https://nc.58.com/yewu/pn2/?key=%E9%94%80%E5%94%AE' |
| """ |
| |
| |
| def spider1(): |
|     # Fetch the first result page and work out how many pages there are |
|     resp = [] |
|     url = "https://nc.58.com/yewu/?key=%E9%94%80%E5%94%AE" |
|     spider.submit_task(MyModule.crawl, args=(url, resp)) |
|     spider.join() |
|     page_source = resp[0].text |
|     html = etree.HTML(page_source) |
|     num = int(html.xpath("/html/body/div[3]/div[3]/div/div/div/span[2]/text()")[0]) |
|     return [f"https://nc.58.com/yewu/pn{i}/?key=%E9%94%80%E5%94%AE" for i in range(1, num)] |
| |
| |
| def crawl(): |
|     # Fetch every page on the worker thread and return the page sources |
|     respAll = [] |
|     for i in spider1(): |
|         spider.submit_task(MyModule.crawl, args=(i, respAll)) |
|     spider.join() |
|     return [i.text for i in respAll] |
| |
| |
| def save(resp_text): |
|     # Parse one page and append the listing titles to a.txt |
|     html = etree.HTML(resp_text) |
|     torr = html.xpath("//*[@id='list_con']/li") |
|     file = open("./a.txt", "a+", encoding="utf-8") |
|     for i in torr: |
|         temp = i.xpath("./div[1]//a//text()") |
|         name = "".join(temp) |
|         file.write(f"Name: {name}\n") |
|     file.close() |
| |
| |
| def main(respAll): |
|     with ThreadPoolExecutor(50) as pool: |
|         pool.map(save, respAll) |
| |
| |
| if __name__ == '__main__': |
|     main(crawl()) |