安装lxml库
| pip --default-timeout=100 install lxml -i https://pypi.org/simple |
requests和xpath的使用
| from lxml import etree |
| import requests |
| |
| headers = {'User-Agent' : 'Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 4 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19'} |
| |
| response = requests.get("http://www.baidu.com",headers=headers) |
| text = response.text |
| |
| |
| selector = etree.HTML(text) |
| |
| |
| t = selector.xpath('//div//text()') |
| print(t) |
xpath选择符号
| |
| |
| // |
| selector.xpath('//div//li') |
| |
| / |
| selector.xpath('//div/ul/li') |
| |
| /text() |
| selector.xpath('//ul[@id="useful"]/text()') |
| |
| //text() |
| selector.xpath('//ul[@id="useful"]//text()') |
| |
| /@属性名 |
| link = selector.xpath('//a/@href') |
| |
| //div | //p |
| selector.xpath('//a/text() | //p/text()') |
| |
| . |
| selector.xpath('./li') |
| |
| .. |
| selector.xpath('//a[text()="极客学院"]/../@id') |
| |
| * |
| selector.xpath('//div[@id="content"]/*/*/text()') |
| |
| @* |
| selector.xpath('//li[@*]/text()') |
| |
| .// |
| selector.xpath('//ul')[0].xpath('.//text()') |
| |
| //li[1] |
| selector.xpath('//li[1]//text()') |
| |
| //div/@class |
| selector.xpath('//div/@class') |
| |
| //div[not(@class='c1')] |
| selector.xpath('//div[not(@class="c1")]//text()') |
| |
| //div[not(@class='c1' and @class='number') ] |
| selector.xpath('//div[not(@class="c1" and @class="number")]//text()') |
| |
| //div[@class='c1' or @class='c2'] |
| selector.xpath('//div[@class="c1" or @class="c2"]//text()') |
| |
| //div[last()] |
| selector.xpath('//div[last()]/text()') |
| |
| //div[last()-1] |
| selector.xpath('//div[last()-1]/text()') |
| |
| //div[@price>"5"] |
| selector.xpath('//div[@price>"5"]/@price') |
| |
| //div[text()="50"] |
| selector.xpath('//div[text()="50"]/@class') |
| |
| //ul[contains(@id,"num")] |
| selector.xpath('//div[contains(@class,"num")]/@class') |
| |
| //ul[starts-with(@id,"ur")] |
| selector.xpath('//div[starts-with(@id,"ur")]/@id') |
| |
| //a/parent::div[@id="url"] |
| selector.xpath('//a/parent::div[@id="url"]//text()') |
| |
| //b/ancestor::*/@id |
| selector.xpath('//b/ancestor::*/@id') |
| |
| //b/ancestor-or-self::*/@id |
| selector.xpath('//b/ancestor-or-self::*/@id') |
| |
| count(//div[@id="url"]) |
| selector.xpath('count(//div[@id="url"])') |
| |
| number(//div[@class="num"]/text()) |
| selector.xpath('number(//div[@class="num"]/text())') |
| |
| normalize-space(//a[@title="极客学院课程库"]//text()) |
| selector.xpath('normalize-space(//a[@title="极客学院课程库"]//text())') |
| |
| sum(//div[@class="num"]/text()) |
| selector.xpath('sum(//div[@class="num"]/text())') |
标签转换
| from lxml import etree |
| html = '' |
| with open('./webPage.txt','r',encoding='utf-8') as f: |
| html = f.read() |
| |
| selector = etree.HTML(html) |
| |
| divhtml = etree.tostring(selector.xpath('//li')[0],pretty_print='True').decode('utf-8') |
| print(divhtml) |
| |
| content = selector.xpath('//ul[@id="useful"]//text()') |
| |
| print([i for i in content if len(i.strip())>0 ]) |
【推荐】还在用 ECharts 开发大屏?试试这款永久免费的开源 BI 工具!
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· dotnet 源代码生成器分析器入门
· ASP.NET Core 模型验证消息的本地化新姿势
· 对象命名为何需要避免'-er'和'-or'后缀
· SQL Server如何跟踪自动统计信息更新?
· AI与.NET技术实操系列:使用Catalyst进行自然语言处理
· dotnet 源代码生成器分析器入门
· 官方的 MCP C# SDK:csharp-sdk
· 一款 .NET 开源、功能强大的远程连接管理工具,支持 RDP、VNC、SSH 等多种主流协议!
· 一步一步教你部署ktransformers,大内存单显卡用上Deepseek-R1
· 一次Java后端服务间歇性响应慢的问题排查记录