BS4的使用
'''
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" id="id_p"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
pip3 install lxml
soup = BeautifulSoup(html_doc,'lxml')
# 深度优先找第一个
1.用法(通过.查找,只能找到第一个)
soup.head
2.获取标签的属性
soup.head.attrs
3.获取标签的内容
soup.p.strings
soup.p.string
soup.p.text # 递归所有文本
4.子节点、子孙节点
soup.p.children # 迭代器
soup.p.contents # 列表
5.父节点、祖先节点
soup.p.parent # 直接父节点
soup.p.parents # 祖先节点
6.兄弟节点
soup.a.next_sibling
soup.a.previous_sibling
soup.a.next_siblings
soup.a.previous_siblings
'''
查找文档树
'''
五种过滤器
字符串、正则表达式、列表、布尔、自定义方法
1.字符串过滤
soup.find(name='p')
soup.find(name='body')
soup.find_all(class_='title')
soup.find_all(href='http://www.baidu.com')
soup.find_all(id='id_p')
2.正则过滤
import re
reg = re.compile('^b')
soup.find_all(name=reg)
3.列表
soup.find_all(name=['body','b'])
4.布尔
soup.find_all(name=True)
soup.find_all(id=True)
5.自定义方法
def has_class_but_no_id(tag):
    return tag.has_attr('class') and not tag.has_attr('id')
soup.find_all(has_class_but_no_id)
6.其他使用
soup.find_all(attrs={'class':'title'})
soup.find_all(attrs={"id":'id_p',"class":'title'})
7.limit
soup.find_all(name=True,limit=2)
8.recursive递归
soup.body.find_all(name='p',recursive=False)
'''