爬虫 基础 BS详解

BeautifulSoup 库详解

# -*- coding:utf8 -*-

# 工程路径:3.3 beautifulsoup库.py

# 工程日期:9/6/2019

# 工程目标:beautifulsoup使用详解

"""

bs4 支持多种解析器: lxml、Python 内置的 html.parser, 以及 html5lib

 

"""

#%%

html = """

<html><head><title>The Dormouse's story</title></head>

<body>

<p class="title" name="dromouse"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were

<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,

<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

and they lived at the bottom of a well.</p>

<p class="story">...</p>

"""

 

from bs4 import BeautifulSoup

# Build the parse tree once, using the lxml parser.
soup = BeautifulSoup(html, 'lxml')

# prettify() gives back the document as formatted HTML text.
formatted_markup = soup.prettify()
print(formatted_markup)

# .string is the text held by the <title> tag.
title_text = soup.title.string
print(title_text)

 

#%% 标签选择器

html = """

<html><head><title>The Dormouse's story</title></head>

<body>

<p class="title" name="dromouse"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were

<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,

<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

and they lived at the bottom of a well.</p>

<p class="story">...</p>

"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Attribute-style access (soup.<tag>) yields a bs4 Tag element; show the
# element and its type for both <title> and <head>.
for element in (soup.title, soup.head):
    print(element)
    print(type(element))

# Only the first matching <p> tag is returned.
print(soup.p)

 

#%% 获取标签的名称

html = """

<html><head><title>The Dormouse's story</title></head>

<body>

<p class="title" name="dromouse"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were

<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,

<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

and they lived at the bottom of a well.</p>

<p class="story">...</p>

"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# .name reports the tag's own name, shown here for <title> and the first <p>.
title_tag, first_paragraph = soup.title, soup.p
print(title_tag.name)
print(first_paragraph.name)

 

#%% 获取标签的属性

html = """

<html><head><title>The Dormouse's story</title></head>

<body>

<p class="title" name="dromouse"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were

<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,

<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

and they lived at the bottom of a well.</p>

<p class="story">...</p>

"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# An attribute can be read either by indexing the tag directly or through
# its .attrs mapping; both lines print the same value.
first_paragraph = soup.p
print(first_paragraph['name'])
print(first_paragraph.attrs['name'])

 

#%% 获取标签内的文本内容 .string

html = """

<html><head><title>The Dormouse's story</title></head>

<body>

<p class="title" name="dromouse"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were

<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,

<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

and they lived at the bottom of a well.</p>

<p class="story">...</p>

"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# .string exposes the text content of the first <p> tag.
paragraph_text = soup.p.string
print(paragraph_text)

 

#%% 标签的嵌套选择

html = """

<html><head><title>The Dormouse's story</title></head>

<body>

<p class="title" name="dromouse"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were

<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,

<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

and they lived at the bottom of a well.</p>

<p class="story">...</p>

"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Tag access can be chained to walk down the tree.
print(soup.head.title.string)
document_body = soup.body
print(document_body.p)

# Read several attributes of the first <a> found under <body>.
first_link = document_body.a
for attribute in ('href', 'class', 'id'):
    print(first_link[attribute])

 

#%% 子节点以及子孙节点的选择

html = """

<html>

<head>

<title>The Dormouse's story</title>

</head>

<body>

<p class="story">

Once upon a time there were three little sisters; and their names were

<a href="http://example.com/elsie" class="sister" id="link1">

<span>Elsie</span>

</a>

<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>

and

<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>

and they lived at the bottom of a well.

</p>

<p class="story">...</p>

"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')
print(soup.body.p.a['href'])

# .contents holds the child nodes of the first <p>; print its type, then
# each child in turn.
children = soup.p.contents
print(type(children))
for child in children:
    print(child)

 

#%% .children 获取子节点 迭代器类型,

# 使用循环的方式才能取出内容

html = """

<html>

<head>

<title>The Dormouse's story</title>

</head>

<body>

<p class="story">

Once upon a time there were three little sisters; and their names were

<a href="http://example.com/elsie" class="sister" id="link1">

<span>Elsie</span>

</a>

<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>

and

<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>

and they lived at the bottom of a well.

</p>

<p class="story">...</p>

"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# .children is an iterator, so printing it shows the iterator object
# itself; the child nodes only become visible by looping over it.
print(soup.p.children)
for index, child in enumerate(soup.p.children):
    print(index, child)

 

#%% .descendants 获取所有的子孙节点

html = """

<html>

<head>

<title>The Dormouse's story</title>

</head>

<body>

<p class="story">

Once upon a time there were three little sisters; and their names were

<a href="http://example.com/elsie" class="sister" id="link1">

<span>Elsie</span>

</a>

<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>

and

<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>

and they lived at the bottom of a well.

</p>

<p class="story">...</p>

"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Printing .descendants directly shows the object it returns; enumerating
# it prints every descendant node of the first <p>.
print(soup.p.descendants)
for index, descendant in enumerate(soup.p.descendants):
    print(index, descendant)

 

#%% .parent父节点 .parents祖先节点

html = """

<html>

<head>

<title>The Dormouse's story</title>

</head>

<body>

<p class="story">

Once upon a time there were three little sisters; and their names were

<a href="http://example.com/elsie" class="sister" id="link1">

<span>Elsie</span>

</a>

<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>

and

<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>

and they lived at the bottom of a well.

</p>

<p class="story">...</p>

"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# .parent is the direct parent of the first <a>, itself a Tag.
print(soup.a.parent)
print(type(soup.a.parent))

# .parents is an iterator over the ancestors, so it has to be looped over
# (or wrapped in list()) to see the nodes.
print(type(soup.a.parents))
for index, ancestor in enumerate(soup.a.parents):
    print(index, ancestor)

# The same ancestors, collected into a single list.
print(list(enumerate(soup.a.parents)))

 

#%% 获取兄弟并列的节点

# .next_siblings 后面的所有兄弟节点

# .previous_siblings 前面的所有兄弟节点

html = """

<html>

<head>

<title>The Dormouse's story</title>

</head>

<body>

<p class="story">

Once upon a time there were three little sisters; and their names were

<a href="http://example.com/elsie" class="sister" id="link1">

<span>Elsie</span>

</a>

<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>

and

<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>

and they lived at the bottom of a well.

</p>

<p class="story">...</p>

"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')
first_link = soup.a

# The sibling accessors return iterators; printing one shows the iterator
# object rather than the nodes.
print(first_link.next_siblings)

# Wrap them in list(enumerate(...)) to see the actual sibling nodes.
following_siblings = list(enumerate(first_link.next_siblings))
print(following_siblings)
preceding_siblings = list(enumerate(first_link.previous_siblings))
print(preceding_siblings)

 

"""

以上为标签选择器的用法, 速度快, 但灵活性不足, 难以满足复杂的查找需求

"""

#%% 标准选择器 find_all 根据标签名, 属性,选择标签 列表返回

# find_all(name, attrs, recursive, string, limit, **kwargs)

 

#%% 标签名选择 name

html='''

<div class="panel">

<div class="panel-heading">

<h4>Hello</h4>

</div>

<div class="panel-body">

<ul class="list" id="list-1">

<li class="element">Foo</li>

<li class="element">Bar</li>

<li class="element">Jay</li>

</ul>

<ul class="list list-small" id="list-2">

<li class="element">Foo</li>

<li class="element">Bar</li>

</ul>

</div>

</div>

'''

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Look up every <ul> by tag name, then show the whole result, its type,
# and the first match.
ul_tags = soup.find_all('ul')
print(ul_tags)
print(type(ul_tags))
print(ul_tags[0])

 

#%% 循环嵌套 find_all

html='''

<div class="panel">

<div class="panel-heading">

<h4>Hello</h4>

</div>

<div class="panel-body">

<ul class="list" id="list-1">

<li class="element">Foo</li>

<li class="element">Bar</li>

<li class="element">Jay</li>

</ul>

<ul class="list list-small" id="list-2">

<li class="element">Foo</li>

<li class="element">Bar</li>

</ul>

</div>

</div>

'''

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Nested lookup: for each <ul>, list only the <li> tags inside it.
for ul in soup.find_all('ul'):
    # Search from the current <ul>; searching from `soup` would print every
    # <li> in the document on each pass.
    print(ul.find_all('li'))

 

#%% attrs 属性查找对应的内容

html='''

<div class="panel">

<div class="panel-heading">

<h4>Hello</h4>

</div>

<div class="panel-body">

<ul class="list" id="list-1" name="elements">

<li class="element">Foo</li>

<li class="element">Bar</li>

<li class="element">Jay</li>

</ul>

<ul class="list list-small" id="list-2">

<li class="element">Foo</li>

<li class="element">Bar</li>

</ul>

</div>

</div>

'''

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Filter with an explicit attrs dict. Both filters hit the same <ul>,
# because the first list carries id="list-1" and name="elements".
for attribute_filter in ({'id': 'list-1'}, {'name': 'elements'}):
    print(soup.find_all(attrs=attribute_filter))

# Shorter form: pass the attribute as a keyword argument. `class` is a
# Python keyword, so it is spelled class_ with a trailing underscore.
print(soup.find_all(id='list-1'))
print(soup.find_all(class_='list'))

 

 

#%% 使用文本的内容进行匹配 text

html='''

<div class="panel">

<div class="panel-heading">

<h4>Hello</h4>

</div>

<div class="panel-body">

<ul class="list" id="list-1">

<li class="element">Foo</li>

<li class="element">Bar</li>

<li class="element">Jay</li>

</ul>

<ul class="list list-small" id="list-2">

<li class="element">Foo</li>

<li class="element">Bar</li>

</ul>

</div>

</div>

'''

 

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# `string=` is the current name of the text filter (it was called `text=`
# in older Beautiful Soup releases). Matching on text yields the matching
# strings themselves, not the enclosing tags; 'Foo' appears in two <li> tags.
print(soup.find_all(string='Foo'))

 

#%% find 方法 只返回单个匹配的元素, 不返回所有的结果

html='''

<div class="panel">

<div class="panel-heading">

<h4>Hello</h4>

</div>

<div class="panel-body">

<ul class="list" id="list-1">

<li class="element">Foo</li>

<li class="element">Bar</li>

<li class="element">Jay</li>

</ul>

<ul class="list list-small" id="list-2">

<li class="element">Foo</li>

<li class="element">Bar</li>

</ul>

</div>

</div>

'''

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# find() returns only the first match rather than every match.
first_ul = soup.find('ul')
print(first_ul)

 

#%% 其他选择, 类似于标签的选择

"""

### find_parents() find_parent()

 

find_parents()返回所有祖先节点,find_parent()返回直接父节点。

 

### find_next_siblings() find_next_sibling()

 

find_next_siblings()返回后面所有兄弟节点,find_next_sibling()返回后面第一个兄弟节点。

### find_previous_siblings() find_previous_sibling()

 

find_previous_siblings()返回前面所有兄弟节点,find_previous_sibling()返回前面第一个兄弟节点。

 

### find_all_next() find_next()

 

find_all_next()返回节点后所有符合条件的节点, find_next()返回第一个符合条件的节点

 

### find_all_previous() 和 find_previous()

 

find_all_previous()返回节点前所有符合条件的节点, find_previous()返回节点前第一个符合条件的节点

 

"""

 

 

#%% CSS 选择器 通过select 直接传入CSS选择器 就可以完成标签或者元素的选择

# . 代表 class

# # 代表 id

# 空格 代表嵌套

html='''

<div class="panel">

<div class="panel-heading">

<h4>Hello</h4>

</div>

<div class="panel-body">

<ul class="list" id="list-1">

<li class="element">Foo</li>

<li class="element">Bar</li>

<li class="element">Jay</li>

</ul>

<ul class="list list-small" id="list-2">

<li class="element">Foo</li>

<li class="element">Bar</li>

</ul>

</div>

</div>

'''

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# A leading "." selects by class; a space between parts means nesting.
for class_selector in ('.panel .panel-heading', '.panel .panel-body'):
    print(soup.select(class_selector))

# Tag names nest the same way. Show the result, its type, an enumerated
# view, and a plain-list copy.
list_items = soup.select('ul li')
print(list_items)
print(type(list_items))
print(list(enumerate(list_items)))
print(list(list_items))

# A leading "#" selects by id, and can be combined with a nested class.
print(soup.select('#list-2'))
print(soup.select('#list-1 .element'))

 

#%% for嵌套选择

html='''

<div class="panel">

<div class="panel-heading">

<h4>Hello</h4>

</div>

<div class="panel-body">

<ul class="list" id="list-1">

<li class="element">Foo</li>

<li class="element">Bar</li>

<li class="element">Jay</li>

</ul>

<ul class="list list-small" id="list-2">

<li class="element">Foo</li>

<li class="element">Bar</li>

</ul>

</div>

</div>

'''

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Nested selection with a loop: pick the <li> tags inside each <ul>.
for ul in soup.select('ul'):
    # Select relative to the current <ul>; selecting from `soup` would print
    # every <li> in the document on each pass.
    print(ul.select('li'))

# The descendant (space) selector reaches the same <li> tags in one call.
print(soup.select('ul li'))

 

#%% select 获取标签的属性 [ ]

html='''

<div class="panel">

<div class="panel-heading">

<h4>Hello</h4>

</div>

<div class="panel-body">

<ul class="list" id="list-1">

<li class="element">Foo</li>

<li class="element">Bar</li>

<li class="element">Jay</li>

</ul>

<ul class="list list-small" id="list-2">

<li class="element">Foo</li>

<li class="element">Bar</li>

</ul>

</div>

</div>

'''

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Two equivalent ways to read an attribute of a selected tag:
for ul in soup.select('ul'):
    print(ul['class'])     # index the tag directly
    print(ul.attrs['id'])  # or go through its .attrs mapping

 

#%% get_text 获取标签中的文本内容

html='''

<div class="panel">

<div class="panel-heading">

<h4>Hello</h4>

</div>

<div class="panel-body">

<ul class="list" ok hah id="list-1">

<li class="element">Foo</li>

<li class="element">Bar</li>

<li class="element">Jay</li>

</ul>

<ul class="list list-small" id="list-2">

<li class="element">Foo</li>

<li class="element">Bar</li>

</ul>

</div>

</div>

'''

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Print the text of every <li>.
for li in soup.select('li'):
    # get_text()'s first positional argument is a separator placed between
    # text fragments, not a selector, so nothing is passed here.
    print(li.get_text())

 

 

#%% 示例

import requests

from bs4 import BeautifulSoup

# Fetch the page. The timeout stops the example from hanging on a stalled
# connection, and raise_for_status() surfaces an HTTP error instead of
# silently parsing an error page.
response = requests.get('https://book.douban.com', timeout=10)
response.raise_for_status()
html = response.text

soup = BeautifulSoup(html, 'lxml')

# For every element with class "title", print the link target and text of
# the anchors inside that element.
for title in soup.select('.title '):
    # Select relative to `title`; selecting from `soup` would re-print every
    # anchor on the page once per title.
    for a in title.select('a'):
        # .get() yields None for an anchor without href instead of raising
        # KeyError.
        print(a.get('href'))
        print(a.get_text())

posted @ 2019-06-10 01:54  binyang  阅读(791)  评论(0编辑  收藏  举报