BeautifulSoup

 

# Requires the third-party package: pip3 install beautifulsoup4

from bs4 import BeautifulSoup

# Sample HTML fragment used by every example below. It must be defined
# before it is parsed; parsing it earlier raises NameError.
comment = """
    <p id="i1">
        我是中国人
    </p>
    <p >
       <script>alert(123)</script>
    </p>
    <p id="i2">
        <span>我是中国人</span>
    </p>
    <p>
        <br />
    </p>
    <p id="i3">
        <span>我是中国人</span><img src="/static/images/1.jpg" alt="" />
    </p>

"""

# "html.parser" is Python's built-in parser; BeautifulSoup uses it to turn
# the HTML text into a tree of objects that can be searched and modified.
soup = BeautifulSoup(comment, "html.parser")

# tag = soup.find(name="span")    # find the first <span> tag
# print(tag)

# obj = soup.find(attrs={"id":"i2"})  # search by attribute: the first tag whose id is "i2"
# print(obj)

# obj = soup.find(name="p",attrs={"id":"i2"})  # both conditions must hold: a <p> tag with id="i2"
# print(obj)


# obj = soup.find_all(name="p") # search by tag name: every <p> tag, not just the first
# print(obj)



# Walk every tag; for tags NOT in the whitelist, clear() empties their contents but keeps the tag itself
# valid_tag = ["p","img","div"]
#
# tags = soup.find_all()
# for tag in tags:
#     if tag.name not in valid_tag:
#         tag.clear()
# print(soup)


# Walk every tag; for tags NOT in the whitelist, decompose() removes the tag together with its contents
# valid_tag = ["p","img","div"]
#
# tags = soup.find_all()
# for tag in tags:
#     if tag.name not in valid_tag:
#         tag.decompose()
# print(soup)

# soup is an object; decode() converts it to a string
# print(soup.decode())


# Whitelist filter: keep only the tags listed in valid_tag, and on each kept
# tag keep only the attributes listed for it. Everything else is removed.
valid_tag = {
    "p": ["class", "id"],
    "img": ["src"],
    "div": ["class"],
}

tags = soup.find_all()
for tag in tags:
    allowed_attrs = valid_tag.get(tag.name)
    if allowed_attrs is None:
        # Tag is not whitelisted: remove it (and its contents) from the tree.
        tag.decompose()
        # Skip the attribute filter for a removed tag. Its name has no
        # whitelist entry, so indexing valid_tag with it would raise KeyError
        # whenever the removed tag still reports attributes.
        continue
    # Iterate over a snapshot of the keys because entries are deleted
    # from tag.attrs inside the loop.
    for k in list(tag.attrs):
        if k not in allowed_attrs:
            del tag.attrs[k]

# The soup is an object tree; decode() serialises it back to a string.
content_str = soup.decode()
print(content_str)
过滤演示.py

 

  

posted @ 2017-07-20 18:03  golangav  阅读(499)  评论(0)  编辑  收藏  举报