Python 包之 PyQuery 网页解析教程
一、安装
- PyQuery 是一个非常强大又灵活的网页解析库
- PyQuery 是 Python 仿照 jQuery 的严格实现
- 语法与 jQuery 几乎完全相同,更多操作可以参考 jQuery 文档
pip install pyquery
二、字符串初始化
html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''
from pyquery import PyQuery as pq

# Build a PyQuery document directly from the HTML string defined above.
document = pq(html)

# Show the parsed document, then the type of the wrapper object.
print(document)
print(type(document))

# Calling the document with a CSS selector yields the matching elements.
list_items = document('li')
print(list_items)
三、URL 初始化
from pyquery import PyQuery as pq

# Initialize from a URL: PyQuery downloads the page and parses the response.
# The encoding argument is passed along so the response is decoded as UTF-8.
doc = pq(url="http://www.baidu.com", encoding='utf-8')

# Print the <head> element of the downloaded page.
# (The original line was missing its closing parenthesis.)
print(doc('head'))
四、文件初始化
from pyquery import PyQuery as pq

# Initialize from a local file: the HTML is read from 'index.html'.
page = pq(filename='index.html')
print(page)
五、CSS 选择器
html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''
from pyquery import PyQuery as pq

page = pq(html)

# CSS descendant selector: elements with class "fadeIn" located inside the
# element whose id is "container".
matched = page('#container .fadeIn')
print(matched)
六、查找子元素
html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''
from pyquery import PyQuery as pq

page = pq(html)

# Select the container first, then look beneath it with find().
container = page('#container')
list_items = container.find('li')

# The result of find() is shown by type first, then by content.
print(type(list_items))
print(list_items)
七、兄弟元素
html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''
from pyquery import PyQuery as pq

page = pq(html)

# Pick the thumbnail wrapper, then print its sibling elements.
thumb = page('#container .post-thumb')
neighbours = thumb.siblings()
print(neighbours)
八、获取属性
html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''
from pyquery import PyQuery as pq

page = pq(html)

# Select the link inside the post content block and show it.
link = page('#container .post-content a')
print(link)

# Read the href attribute in two spellings: call style and attribute style.
print(link.attr('href'))
print(link.attr.href)
九、获取文本
html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''
from pyquery import PyQuery as pq

page = pq(html)

# Select the link, then extract its text content with text().
link = page('#container .post-content a')
link_text = link.text()
print(link_text)
十、类操作
html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''
from pyquery import PyQuery as pq

page = pq(html)

# Start from the <li> inside the container and show its initial state.
item = page('#container li')
print(item)

# Remove the "fadeIn" class and print again to show the change.
item.removeClass('fadeIn')
print(item)

# Add the class back and print once more.
item.addClass('fadeIn')
print(item)
标签:
python包教程
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 无需6万激活码!GitHub神秘组织3小时极速复刻Manus,手把手教你使用OpenManus搭建本
· Manus爆火,是硬核还是营销?
· 终于写完轮子一部分:tcp代理 了,记录一下
· 别再用vector<bool>了!Google高级工程师:这可能是STL最大的设计失误
· 单元测试从入门到精通