Beautiful Soup

https://www.cnblogs.com/yuanchenqi/articles/7617280.html
http://beautifulsoup.readthedocs.io/zh_CN/latest/

简单来说,Beautiful Soup是python的一个库,最主要的功能是从网页解析数据。

pip3 install beautifulsoup4

# Beautiful Soup only inspects and matches tags inside markup text; tag strings are all it handles.

# In a crawler it is the tool used specifically for parsing the downloaded data.
from bs4 import BeautifulSoup

# Sample document shared by every example below: a title, three links with
# classes sister1..sister3 and ids link1..link3, a real <script> element, and
# an entity-escaped <script> that is ordinary text rather than a tag.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister1" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister2" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister3" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
<script>alert(111)</script>
&lt;script&gt;alert('bbb')&lt;/script&gt;
"""

# Build the parse tree with Python's built-in html.parser backend.
soup = BeautifulSoup(html_doc, features='html.parser')

# Attribute-style access (soup.<name>) yields the first tag with that name.
first_link = soup.a
print(first_link)       # first <a> Tag object
print(first_link.name)  # a
first_para = soup.p
print(first_para)       # first <p> Tag object
print(first_para.name)  # p

# Text content of the first <p>.
print(first_para.string)
print(first_para.text)

# class is a multi-valued attribute, so it is returned as a list.
print(first_para['class'])  # ['title']
# first_para['id'] would raise KeyError: this <p> has no id attribute.
print(first_para.attrs)
print(first_para.attrs.get('class'))

# find() returns one Tag (the first match); find_all() returns a list of all matches.
first_para = soup.find('p')
print(first_para)        # first <p> Tag object -- a single tag, not a list
print(first_para.attrs)  # {'class': ['title']}  (.attrs needs a Tag, not a list)
all_paras = soup.find_all('p')
print(all_paras)         # every <p> tag

for para in all_paras:
    # Tag.get() gives None for a missing attribute; no <p> here has an id.
    print(para.get('class'))
    print(para.get('id'))


print('------------------')
# Keyword filters match on attributes. id=... works directly; class has to be
# spelled class_ because `class` is a reserved word in Python.
print(soup.find_all(id='link1'))


def _is_sister_class(css_class):
    """Return True for a class value that starts with 'sister'."""
    return css_class is not None and css_class.startswith('sister')


# The links carry the classes sister1/sister2/sister3, so the exact value
# class_='sister' matches nothing and prints []. A filter function matches
# all of them by prefix.
print(soup.find_all(class_=_is_sister_class))

# A list of names matches any of them: every <a> and every <p>.
print(soup.find_all(['a', 'p']))


print('------------------')
# Modify an attribute: look the tag up once and show it before and after.
link = soup.find(id='link1')
print(link)
link['class'] = "brother"  # item assignment on a Tag writes into its attrs dict
print(link)


print('------------------')
# Delete an attribute: removing the key from attrs drops it from the markup.
link = soup.find(id='link1')
print(link)
del link.attrs['class']
print(link)

print('------------------')
# Tag names can also be matched with a regular expression.
import re

exact_b = soup.find_all('b')  # only tags named exactly "b"
print(exact_b)
name_pattern = re.compile('b')  # any tag whose name contains "b", e.g. <body> and <b>
print(soup.find_all(name_pattern))


print('--------beautifulsoup 开的 css 的接口选择器 css 属性查找----------')
# select() takes a CSS selector and returns a list of matching tags.
# No element has the exact class "sister" (the classes are sister1..sister3),
# so 'a.sister' always selects nothing. The attribute-prefix selector matches
# every <a> whose class attribute starts with "sister".
print(soup.select('a[class^="sister"]'))

# Descendant selector: elements with class sister2 inside an element with class story.
ret = soup.select('.story .sister2')
print(ret)


print('------------Tag.decompose()----------------')
# Modifying the tree: Tag.decompose() removes a node from the document and
# destroys it. find_all('a') selects the <a> tags directly.
for anchor in soup.find_all('a'):
    anchor.decompose()  # removes every <a> tag

print(soup)


print('------------soup.find_all()----------------')
# With no arguments find_all() returns every tag in the tree, nested ones
# included. The note in the string below illustrates this: each inner <div>
# appears again after the outer <div> that contains it.
every_tag = soup.find_all()
print(every_tag)
"""
print(soup.find_all()) 拿到所有 什么叫所有? 

<div>
    <div>111</div>
    <div>222</div>
    <div>333</div>
</div>
<p>567</p>


[<div>
<div>111</div>
<div>222</div>
<div>333</div>
</div>, <div>111</div>, <div>222</div>, <div>333</div>, <p>567</p>]

"""
posted @ 2018-06-09 13:46  Alice的小屋  阅读(108)  评论(0)  编辑  收藏  举报