# https://www.cnblogs.com/yuanchenqi/articles/7617280.html
# http://beautifulsoup.readthedocs.io/zh_CN/latest/
# 简单来说,Beautiful Soup 是 Python 的一个库,最主要的功能是从网页中解析数据。
# 安装: pip3 install beautifulsoup4
# 只是针对 标签 字符串 做判断做匹配用的,只处理标签字符串
# 爬虫,会专门用来 解析数据
from bs4 import BeautifulSoup
# Sample markup that every lookup in this script runs against: a <title>,
# one <p class="title">, two <p class="story"> paragraphs, three <a> links
# (ids link1..link3, classes sister1..sister3) and two <script> tags.
# Note that <body> and <html> are never closed inside the literal.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister1" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister2" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister3" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
<script>alert(111)</script>
<script>alert('bbb')</script>
"""
# Parse the sample with the 'html.parser' backend; `soup` is the tree object
# that the rest of the script queries and mutates.
soup = BeautifulSoup(html_doc,'html.parser')
# Attribute-style access (soup.<tag name>) hands back the first tag with that
# name, so grab the first <a> and the first <p> once and inspect them.
first_a = soup.a
first_p = soup.p

for value in (
    first_a,                     # the first <a> tag object
    first_a.name,                # its tag name: a
    first_p,                     # the first <p> tag object
    first_p.name,                # its tag name: p
    first_p.string,              # text value of the tag
    first_p.text,                # text value of the tag
    first_p['class'],            # class attribute, a list: ['title']
    # first_p['id'] is deliberately not looked up: the first <p> in
    # html_doc carries no id attribute.
    first_p.attrs,               # all attributes of the tag as a mapping
    first_p.attrs.get('class'),  # same class value, read through .attrs
):
    print(value)
# find / find_all
# find() returns a single tag object (the first match), not a list, which is
# why attribute access such as .attrs works directly on its result.
print(soup.find('p'))  # the first <p> tag object
print(soup.find('p').attrs)  # {'class': ['title']}
# find_all() returns every match.
print(soup.find_all('p'))  # all <p> tags
# Walk each <p> and show its class and id. .get() is used instead of
# subscripting because none of the <p> tags in html_doc has an id attribute.
for tag in soup.find_all('p'):
    # print(tag.attrs)
    print(tag.get('class'))
    print(tag.get('id'))
print('------------------')
# Filtering by attribute through keyword arguments.
# An id can be passed straight through as a keyword.
matches_by_id = soup.find_all(id='link1')
print(matches_by_id)

# `class` is a reserved word in Python, so the keyword is spelled `class_`.
# NOTE(review): html_doc only uses the classes sister1 / sister2 / sister3,
# so a lookup for exactly 'sister' presumably matches nothing - confirm that
# this value is intended.
matches_by_class = soup.find_all(class_='sister')
print(matches_by_class)

# A list of tag names collects every <a> and every <p>.
wanted_names = ['a', 'p']
print(soup.find_all(wanted_names))
print('------------------')
# Changing an attribute: show the link1 tag, overwrite its class, then look
# it up again to show that the document itself now carries the new value.
tag = soup.find(id='link1')
print(tag)
tag['class'] = "brother"  # item assignment on a tag writes into tag.attrs
print(soup.find(id='link1'))
print('------------------')
# Removing an attribute: show the link1 tag, drop its class attribute, then
# look it up again to show the attribute is gone from the document.
tag = soup.find(id='link1')
print(tag)
del tag['class']  # item deletion on a tag removes the key from tag.attrs
print(soup.find(id='link1'))
print('------------------')
# find_all() also accepts a compiled regular expression in place of a tag
# name. Compare the exact-name lookup with the pattern-based one.
import re

exact_b_tags = soup.find_all('b')
print(exact_b_tags)

name_pattern = re.compile('b')
print(soup.find_all(name_pattern))
print('--------beautifulsoup 开的 css 的接口选择器 css 属性查找----------')
# select() takes a CSS selector and returns a list of matching tags.
# 'a.sister' asks for <a> tags that carry the class "sister".
# NOTE(review): html_doc defines only sister1 / sister2 / sister3, so this
# selector presumably matches nothing - confirm the intended class name.
sister_links = soup.select('a.sister')
print(sister_links)

# Descendant selector: elements with class "sister2" inside a ".story" element.
ret = soup.select('.story .sister2')
print(ret)
print('------------Tag.decompose()----------------')
# Modifying the document tree.
# Tag.decompose() removes the node from the tree and destroys it completely.
# Ask find_all() for the <a> tags directly instead of scanning every tag and
# checking its name; the loop then removes every <a> from the document.
for tag in soup.find_all('a'):
    tag.decompose()
print(soup)
print('------------soup.find_all()----------------')
# With no arguments find_all() returns every tag left in the document.
print(soup.find_all())
"""
print(soup.find_all()) 会拿到所有标签。什么叫“所有”?文档中的每一个标签(包括嵌套在其他标签里面的子标签)都会作为单独的一项出现在结果列表中。例如,对于下面的文档,得到的就是其后的列表:
<div>
<div>111</div>
<div>222</div>
<div>333</div>
</div>
<p>567</p>
[<div>
<div>111</div>
<div>222</div>
<div>333</div>
</div>, <div>111</div>, <div>222</div>, <div>333</div>, <p>567</p>]
"""