import re
from bs4 import BeautifulSoup
# Sample document used by the node-selector examples in this section.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><span>Elsie</span></a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
# print(soup.prettify())  # pretty-printed view of the parsed tree

# Attribute-style access (soup.title, soup.p) yields the first tag with that
# name, so the two tags used repeatedly below are looked up once.
title_tag = soup.title
first_p = soup.p
print(first_p.attrs)
# Section marker kept as a no-op string literal; it reads "node selectors".
"""
节点选择器
"""
# Selecting elements
for selected in (
    type(title_tag),   # <class 'bs4.element.Tag'>
    title_tag.string,  # The Dormouse's story
    soup.head,         # <head><title>The Dormouse's story</title></head>
    first_p,           # <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
):
    print(selected)

# Extracting information
# -- tag names
for tag in (title_tag, first_p):
    print(tag.name)  # title, then p

# -- attributes
p_attrs = first_p.attrs
print(p_attrs)  # {'class': ['title'], 'name': 'dromouse'}
print(p_attrs['name'])  # dromouse
# Indexing the tag directly is shorthand for indexing its .attrs dict.
print(first_p['name'])  # dromouse
print(first_p['class'])  # ['title']

# -- text content
print(first_p.string)  # The Dormouse's story

# Nested selection: each attribute access returns a Tag, so lookups chain.
print(soup.head.title.string)  # The Dormouse's story
# Second sample document: the same story markup as before but without the
# <p class="title"> paragraph, so soup.p is now the first "story" paragraph.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><span>Elsie</span></a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')

# Relational selection
# (1) Children and descendants
print('(1)子节点和子孙节点')
# .contents gives the direct children as a list.
print(soup.p.contents)
# .children also walks the direct children, but lazily: printing it shows an
# iterator object rather than the nodes themselves.
print(soup.p.children)
for i, child in enumerate(soup.p.children):
    print(i, child)
# Expected output of the loop above, kept as a no-op string literal.
# Only direct children are listed; the <span> stays nested inside its <a>.
"""
# 获取直接子节点,span是在a里面的
0 Once upon a time there were three little sisters; and their names were
1 <a class="sister" href="http://example.com/elsie" id="link1"><span>Elsie</span></a>
2
3 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
4 and
5 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
6
and they lived at the bottom of a well.
"""
# .descendants walks every nested node; printing it shows a generator object.
print(soup.p.descendants)
for i, child in enumerate(soup.p.descendants):
    print(i, child)
# Expected output of the loop above, kept as a no-op string literal.
# The <span>Elsie</span> inside the first <a> now appears as its own entry,
# followed by its text node.
"""
# a 下的<span>Elsie</span>标签被单独输出
0 Once upon a time there were three little sisters; and their names were
1 <a class="sister" href="http://example.com/elsie" id="link1"><span>Elsie</span></a>
2 <span>Elsie</span>
3 Elsie
4
5 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
6 Lacie
7 and
8 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
9 Tillie
10
and they lived at the bottom of a well.
"""

# (2) Parent and ancestors
print(soup.a.parent)  # direct parent of the first <a>
print(soup.a.parents)  # generator over all ancestors (prints the generator object)

# (3) Siblings
print(soup.a.next_sibling)  # the node immediately after the first <a>
print(soup.a.previous_sibling)  # the node immediately before the first <a>
print(soup.a.next_siblings)  # generator over all following siblings
print(soup.a.previous_siblings)  # generator over all preceding siblings

# Extracting information from a selected node
print(soup.a.string)  # text content
# The original reached this tag via list(soup.a.parent)[1], which only works
# while a text node precedes the link; soup.a is the same tag.
print(soup.a.attrs['class'])  # attribute value: ['sister']
# Disabled example: this document has no <p class="title">, so indexing the
# empty result with [0] would raise IndexError.
# print(soup.find_all('p', attrs={'class': 'title'})[0].get_text())
"""
方法选择器
"""
# Signature for reference:
# find_all(name=None, attrs={}, recursive=True, string=None, limit=None, **kwargs)
# (`string` is the documented name of the text filter; `text` is its legacy alias.)
html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')

# Search by tag name. The result is reused below instead of re-running the
# same query three times.
ul_tags = soup.find_all(name='ul')
print(ul_tags[0])
print(type(ul_tags[0]))  # <class 'bs4.element.Tag'>

# Each result is itself a Tag, so the search can continue inside it:
# list the <li> nodes of every <ul>.
for ul in ul_tags:
    print(ul.find_all(name='li'))
# [<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>]
# [<li class="element">Foo</li>, <li class="element">Bar</li>]

# Search by attribute via the attrs dict.
print(soup.find_all(attrs={'id': 'list-1'}))
print(soup.find_all(attrs={'name': 'elements'}))

# Common attributes can be passed as keywords; `class` is a Python keyword,
# hence the trailing underscore in class_.
print(soup.find_all(id='list-1'))
print(soup.find_all(class_='element'))

# Match on node text with the `string` argument (replaces the legacy `text`).
print(soup.find_all(string=re.compile('Foo')))

# Other lookup methods:
# soup.find()
# soup.find_parent()
# soup.find_parents()
# soup.find_next_sibling()
# soup.find_next_siblings()
# soup.find_previous_sibling()
# soup.find_all_next()
# soup.find_all_previous()
# ....
"""
css选择器
"""
# select() returns every node matching a CSS selector.
for css_selector in ('.panel .panel-heading', 'ul li', '#list-2 .element'):
    print(soup.select(css_selector))

# select_one() returns only the first match; attributes are read from it the
# same way as from any other tag.
first_ul = soup.select_one('ul')
print(first_ul['id'])  # list-1
print(first_ul.attrs['id'])  # list-1

# Text of the first <li>, read two equivalent ways.
first_li = soup.select_one('li')
print(first_li.get_text())  # Foo
print(first_li.string)  # Foo