bs4解析
- 安装:pip install bs4
示例代码-爱丽丝漫游仙境
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
- 使用bs4格式化输出代码
from bs4 import BeautifulSoup
# lxml为解析器
soup = BeautifulSoup(html_doc,"lxml")
# 格式化输出代码
print(soup.prettify())
解析器 | 使用方法 | 优势 | 劣势
---|---|---|---
Python标准库 | BeautifulSoup(markup, "html.parser") |
Python的内置标准库、执行速度适中、文档容错能力强 | Python 2.7.3和3.2.2之前的版本中文档容错能力差 |
lxml HTML 解析器 | BeautifulSoup(markup, "lxml") |
速度快文档容错能力强 | 需要安装C语言库 |
lxml XML 解析器 | BeautifulSoup(markup, ["lxml-xml"]) 或 BeautifulSoup(markup, "xml") |
速度快、唯一支持XML的解析器 | 需要安装C语言库 |
html5lib | BeautifulSoup(markup, "html5lib") |
最好的容错性、以浏览器的方式解析文档、生成HTML5格式的文档 | 速度慢、依赖外部Python库 |
推荐使用lxml作为解析器,因为效率更高. 在Python2.7.3之前的版本和Python3中3.2.2之前的版本,必须安装lxml或html5lib, 因为那些Python版本的标准库中内置的HTML解析方法不够稳定。
一、浏览结构化数据的方法
print(soup.title)
# <title>The Dormouse's story</title>
print(soup.title.name)
# u'title'
print(soup.title.string)
# u'The Dormouse's story'
print(soup.p)
# <p class="title"><b>The Dormouse's story</b></p>
print(soup.p['class'])
# ['title'](class是多值属性,返回列表)
print(soup.a)
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
print(soup.find(id="link3"))
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
print(soup.find_all('a'))
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
(一)匹配所有a标签的href属性
for link in soup.find_all("a"):
print(link.get("href"))
# http://example.com/elsie
# http://example.com/lacie
# http://example.com/tillie
(二)获取所有文本内容
print(soup.get_text())
The Dormouse's story
The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
二、遍历文档树
- 以爱丽丝文档为例
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
(一)子节点
1、一个Tag可能包含多个字符串或其它的Tag,这些都是这个Tag的子节点.Beautiful Soup提供了许多操作和遍历子节点的属性。
- <head>
soup.head
>>> <head><title>The Dormouse's story</title></head>
soup.title
>>> <title>The Dormouse's story</title>
- 这是个获取tag的小窍门,可以在文档树的tag中多次调用这个方法。
- <body>标签中的第一个<p>
soup.body.p
>>> <p class="title"><b>The Dormouse's story</b></p>
- 通过点取属性的方式只能获得当前名字的第一个tag。
- soup.a
soup.a
>>> <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
2、contents和children
- tag的 contents属性可以将tag的子节点以列表的方式输出。
head_tag = soup.head
head_tag
>>> <head><title>The Dormouse's story</title></head>
head_tag.contents
>>> [<title>The Dormouse's story</title>]
title_tag = head_tag.contents[0]
title_tag
>>> <title>The Dormouse's story</title>
title_tag.contents
>>> [u'The Dormouse's story']
- 通过tag的 children生成器,可以对tag的子节点进行循环。
for child in title_tag.children:
print(child)
# The Dormouse's story
(二)父节点
- 每个tag或字符串都有父节点:被包含在某个tag中。通过 .parent 属性可以获取某个元素的父节点。
- 例如:<title>标签的父节点是<head>标签
title_tag = soup.title
title_tag
# <title>The Dormouse's story</title>
title_tag.parent
# <head><title>The Dormouse's story</title></head>
(三)兄弟节点
1、以此代码为例
soup = BeautifulSoup("<a><b>text1</b><c>text2</c></a>", "lxml")
print(soup.prettify())
# <html>
# <body>
# <a>
# <b>
# text1
# </b>
# <c>
# text2
# </c>
# </a>
# </body>
# </html>
- <b>标签和<c>标签在同一层:它们是同一个元素的子节点,所以<b>和<c>互为兄弟节点。
# 下一个兄弟节点
soup.b.next_sibling
>>> <c>text2</c>
# 上一个兄弟节点
soup.c.previous_sibling
>>> <b>text1</b>
三、搜索文档树
find()
和 find_all()
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'lxml')
soup.find_all('b')
>>> [<b>The Dormouse's story</b>]
soup.find_all(["a", "b"])
# [<b>The Dormouse's story</b>,
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.find_all("a", class_="sister")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.find_all("a", limit=2)
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
soup.select("title")
# [<title>The Dormouse's story</title>]
soup.select("p:nth-of-type(3)")
# [<p class="story">...</p>]
soup.select("body a")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.select("html head title")
# [<title>The Dormouse's story</title>]
soup.select("head > title")
# [<title>The Dormouse's story</title>]
soup.select("p > a")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.select("p > a:nth-of-type(2)")
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
soup.select("p > #link1")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
soup.select("body > a")
# []
soup.select(".sister")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.select("[class~=sister]")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.select("#link1")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
soup.select("a#link2")
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
四、重点掌握的方法
(一)获取标签属性内容
print(soup.p["class"])
print(soup.p.get("class"))
(二)获取标签文本内容
print(soup.title.string)
print(soup.title.get_text())
(三)获取a标签全部内容
print(soup.a)
print(soup.find(id="link3"))
print(soup.find(class_="sister"))
print(soup.find_all("a",class_="sister")) # 查找全部“a”,标签属性为class=“sister”
(四)找前两个a标签
print(soup.find_all("a",limit=2))
print(soup.select("a"))
print(soup.select("p > a:nth-of-type(2)"))