爬虫基础 BS详解

Beautifulsoup 库详解

# -*- coding:utf8 -*-

# 工程路径：3.3 beautifulsoup库.py

# 工程日期：9/6/2019

# 工程目标：beautifulsoup使用详解

"""

bs 支持多种解析器：lxml、Python 内置的 html.parser，以及 html5lib

"""

#%% Parse a sample document, pretty-print it and read the <title> text
html = """

<html><head><title>The Dormouse's story</title></head>

<body>

<p class="title" name="dromouse"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were

<a href="http://example.com/elsie" class="sister" id="link1"></a>,

<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

and they lived at the bottom of a well.</p>

"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(markup=html, features='lxml')

# First the re-indented document, then the text inside <title>.
print(soup.prettify(), soup.title.string, sep='\n')

#%% Tag selectors: access an element as an attribute of the soup
html = """

<html><head><title>The Dormouse's story</title></head>

<body>

<p class="title" name="dromouse"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were

<a href="http://example.com/elsie" class="sister" id="link1"></a>,

<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

and they lived at the bottom of a well.</p>

"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(markup=html, features='lxml')

# Each element is printed together with its type (a bs4 Tag).
print(soup.title, type(soup.title), sep='\n')
print(soup.head, type(soup.head), sep='\n')

# Attribute access only yields the first matching <p>.
print(soup.p)

#%% Reading a tag's name via .name
html = """

<html><head><title>The Dormouse's story</title></head>

<body>

<p class="title" name="dromouse"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were

<a href="http://example.com/elsie" class="sister" id="link1"></a>,

<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

and they lived at the bottom of a well.</p>

"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(markup=html, features='lxml')

# Name of the <title> tag, then name of the first <p> tag.
print(soup.title.name, soup.p.name, sep='\n')

#%% Reading a tag's attributes
html = """

<html><head><title>The Dormouse's story</title></head>

<body>

<p class="title" name="dromouse"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were

<a href="http://example.com/elsie" class="sister" id="link1"></a>,

<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

and they lived at the bottom of a well.</p>

"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(markup=html, features='lxml')

# Subscripting the tag and going through .attrs read the same attribute.
print(soup.p['name'], soup.p.attrs['name'], sep='\n')

#%% Reading the text inside a tag via .string
html = """

<html><head><title>The Dormouse's story</title></head>

<body>

<p class="title" name="dromouse"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were

<a href="http://example.com/elsie" class="sister" id="link1"></a>,

<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

and they lived at the bottom of a well.</p>

"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(markup=html, features='lxml')

# Text content of the first <p>; find('p') is the explicit form of soup.p.
print(soup.find('p').string)

#%% Nested tag selection: chain attribute access to walk down the tree
html = """

<html><head><title>The Dormouse's story</title></head>

<body>

<p class="title" name="dromouse"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were

<a href="http://example.com/elsie" class="sister" id="link1"></a>,

<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

and they lived at the bottom of a well.</p>

"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(markup=html, features='lxml')

# <title> text reached through <head>, then the first <p> inside <body>.
print(soup.head.title.string, soup.body.p, sep='\n')

# href, class and id of the first <a> inside <body>, one per line.
print(*(soup.body.a[key] for key in ('href', 'class', 'id')), sep='\n')

#%% Selecting child nodes with .contents
# NOTE(review): the <p class="story"> and first <a ...> opening tags were
# missing from this fixture (only their closing tags remained). They are
# restored here from the markup used in the earlier cells -- confirm against
# the original tutorial.
html = """
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
        <p class="story">
            Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1">
                <span>Elsie</span>
            </a>
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
            and
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
            and they lived at the bottom of a well.
        </p>
"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# href of the first <a> nested under body > p.
print(soup.body.p.a['href'])

# .contents holds the direct children of <p>; print its type, then each child.
print(type(soup.p.contents))

for i in soup.p.contents:
    print(i)

#%% .children yields the direct child nodes lazily,
# so the nodes have to be pulled out with a loop.
# NOTE(review): the <p class="story"> and first <a ...> opening tags were
# missing from this fixture (only their closing tags remained). They are
# restored here from the markup used in the earlier cells -- confirm against
# the original tutorial.
html = """
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
        <p class="story">
            Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1">
                <span>Elsie</span>
            </a>
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
            and
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
            and they lived at the bottom of a well.
        </p>
"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Printing the attribute itself shows the iterator object, not the nodes.
print(soup.p.children)

for i, child in enumerate(soup.p.children):
    print(i, child)

#%% .descendants walks every node nested under a tag (children, grandchildren, ...)
# NOTE(review): the <p class="story"> and first <a ...> opening tags were
# missing from this fixture (only their closing tags remained). They are
# restored here from the markup used in the earlier cells -- confirm against
# the original tutorial.
html = """
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
        <p class="story">
            Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1">
                <span>Elsie</span>
            </a>
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
            and
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
            and they lived at the bottom of a well.
        </p>
"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Printing the attribute itself shows the generator object, not the nodes.
print(soup.p.descendants)

# Print every descendant of the first <p> with its position.
for i, descendant in enumerate(soup.p.descendants):
    print(i, descendant)

#%% .parent is the direct parent; .parents iterates over all ancestors
# NOTE(review): the <p class="story"> and first <a ...> opening tags were
# missing from this fixture (only their closing tags remained). They are
# restored here from the markup used in the earlier cells -- confirm against
# the original tutorial.
html = """
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
        <p class="story">
            Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1">
                <span>Elsie</span>
            </a>
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
            and
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
            and they lived at the bottom of a well.
        </p>
"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

print(soup.a.parent)
print(type(soup.a.parent))   # the parent is itself a Tag
print(type(soup.a.parents))  # the ancestors come back as a generator

for i, pars in enumerate(soup.a.parents):
    print(i, pars)

# Same data, materialised as a list of (index, ancestor) pairs.
print(list(enumerate(soup.a.parents)))

#%% Sibling nodes (nodes that share the same parent)
# .next_siblings      iterates over the siblings that follow the node
# .previous_siblings  iterates over the siblings that precede the node
# NOTE(review): the <p class="story"> and first <a ...> opening tags were
# missing from this fixture (only their closing tags remained). They are
# restored here from the markup used in the earlier cells -- confirm against
# the original tutorial.
html = """
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
        <p class="story">
            Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1">
                <span>Elsie</span>
            </a>
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
            and
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
            and they lived at the bottom of a well.
        </p>
"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Printing the attribute itself shows the generator object, not the nodes.
print(soup.a.next_siblings)

print(list(enumerate(soup.a.next_siblings)))
print(list(enumerate(soup.a.previous_siblings)))

"""

以上的选择方式为标签选择器，速度快，但往往无法满足较复杂的筛选需求

"""

#%% 标准选择器 find_all 根据标签名，属性，选择标签列表返回

# find_all(name, attrs, recursive, string, limit, **kwargs)

#%% find_all by tag name
# NOTE(review): this fixture had lost every opening tag and all <li> items
# (only <h4> and the closing tags remained). The markup below is reconstructed
# from the selectors used in this script; the <li> texts other than "Foo" are
# placeholders -- confirm against the original tutorial.
html = '''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# All <ul> elements, the type of the returned collection, then the first hit.
print(soup.find_all('ul'))
print(type(soup.find_all('ul')))
print(soup.find_all('ul')[0])

#%% Nested find_all: search inside each result of an outer find_all
# NOTE(review): this fixture had lost every opening tag and all <li> items
# (only <h4> and the closing tags remained). The markup below is reconstructed
# from the selectors used in this script; the <li> texts other than "Foo" are
# placeholders -- confirm against the original tutorial.
html = '''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

for ul in soup.find_all('ul'):
    # Search from `ul`, not from `soup`: searching the whole document would
    # print every <li> on each iteration instead of this list's own items.
    print(ul.find_all('li'))

#%% find_all by attribute values
# NOTE(review): this fixture had lost every opening tag and all <li> items
# (only <h4> and the closing tags remained). The markup below is reconstructed
# from the selectors used in this script; the <li> texts other than "Foo" are
# placeholders -- confirm against the original tutorial.
html = '''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1" name="elements">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Every tag whose attributes match the given dict. With this fixture both
# queries hit the same <ul>, so the two outputs are identical.
print(soup.find_all(attrs={'id': 'list-1'}))
print(soup.find_all(attrs={'name': 'elements'}))

# Shorthand: pass the attribute as a keyword argument.
print(soup.find_all(id='list-1'))

# `class` is a Python keyword, hence the trailing underscore.
print(soup.find_all(class_='list'))

#%% find_all by text content
# NOTE(review): this fixture had lost every opening tag and all <li> items
# (only <h4> and the closing tags remained). The markup below is reconstructed
# from the selectors used in this script; the <li> texts other than "Foo" are
# placeholders -- confirm against the original tutorial.
html = '''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Returns the matching strings themselves (two here), not their enclosing tags.
# `string=` is the current name of this argument; `text=` is the legacy
# spelling from Beautiful Soup releases before 4.4.0.
print(soup.find_all(string='Foo'))

#%% find returns a single matching element instead of all of them
# NOTE(review): this fixture had lost every opening tag and all <li> items
# (only <h4> and the closing tags remained). The markup below is reconstructed
# from the selectors used in this script; the <li> texts other than "Foo" are
# placeholders -- confirm against the original tutorial.
html = '''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Only the first <ul> is returned.
print(soup.find('ul'))

#%% 其他选择，类似于标签的选择

"""

### find_parents() find_parent()

find_parents()返回所有祖先节点，find_parent()返回直接父节点。

### find_next_siblings() find_next_sibling()

### find_previous_siblings() find_previous_sibling()

find_previous_siblings()返回前面所有兄弟节点，find_previous_sibling()返回前面第一个兄弟节点。

### find_all_next() find_next()

find_all_next()返回节点后所有符合条件的节点, find_next()返回第一个符合条件的节点

### find_all_previous() 和 find_previous()

find_all_previous()返回节点前所有符合条件的节点, find_previous()返回节点前第一个符合条件的节点

"""

#%% CSS selectors: pass a CSS selector string to select() to pick elements
#   .       selects by class
#   #       selects by id
#   space   selects nested (descendant) elements
# NOTE(review): this fixture had lost every opening tag and all <li> items
# (only <h4> and the closing tags remained). The markup below is reconstructed
# from the selectors used in this script; the <li> texts other than "Foo" are
# placeholders -- confirm against the original tutorial.
html = '''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Class selectors, nested with a space.
print(soup.select('.panel .panel-heading'))
print(soup.select('.panel .panel-body'))

# Tag selectors, nested with a space; the result is a list-like collection.
print(soup.select('ul li'))
print(type(soup.select('ul li')))
print(list(enumerate(soup.select('ul li'))))
print(list(soup.select('ul li')))

# Id selector: every tag whose id matches.
print(soup.select('#list-2'))

# Id selector followed by a nested class selector.
print(soup.select('#list-1 .element'))

#%% Nested selection with a for loop over select() results
# NOTE(review): this fixture had lost every opening tag and all <li> items
# (only <h4> and the closing tags remained). The markup below is reconstructed
# from the selectors used in this script; the <li> texts other than "Foo" are
# placeholders -- confirm against the original tutorial.
html = '''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

for ul in soup.select('ul'):
    # Select from `ul`, not from `soup`: selecting on the whole document
    # would print every <li> on each iteration.
    print(ul.select('li'))

# NOTE(review): indentation was lost in the source; this comparison print is
# assumed to sit outside the loop -- confirm.
# The descendant selector reaches the same <li> elements in one flat result.
print(soup.select('ul li'))

#%% Reading attributes of tags returned by select()
# NOTE(review): this fixture had lost every opening tag and all <li> items
# (only <h4> and the closing tags remained). The markup below is reconstructed
# from the selectors used in this script; the <li> texts other than "Foo" are
# placeholders -- confirm against the original tutorial.
html = '''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

for ul in soup.select('ul'):
    # Two equivalent ways of reading an attribute: subscript the tag directly
    # or go through .attrs. Here the first reads `class`, the second `id`.
    print(ul['class'])
    print(ul.attrs['id'])

#%% get_text returns the text content of a tag
# NOTE(review): this fixture had lost every opening tag and all <li> items
# (only <h4> and the closing tags remained). The markup below is reconstructed
# from the selectors used in this script; the <li> texts other than "Foo" are
# placeholders -- confirm against the original tutorial.
html = '''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

for li in soup.select('li'):
    # get_text's first positional argument is a separator placed between text
    # fragments, not a selector; the former '.title' argument was dropped
    # because it would have been inserted into multi-fragment text.
    print(li.get_text())

#%% Example: list the links found inside every `.title` element of a live page
import requests

from bs4 import BeautifulSoup

# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops early instead of parsing an error page.
# NOTE(review): some sites reject the default requests User-Agent; if this
# request fails, a `headers=` argument may be needed -- verify.
response = requests.get('https://book.douban.com', timeout=10)
response.raise_for_status()
html = response.text

soup = BeautifulSoup(html, 'lxml')

for title in soup.select('.title'):
    # Select from `title`, not from `soup`: selecting on the whole document
    # would print every link on the page once per title element.
    for a in title.select('a'):
        print(a['href'])
        print(a.get_text())

posted @ 2019-06-10 01:54 binyang 阅读(806) 评论(0) 收藏举报

刷新页面返回顶部

binyang

世界诺大 四海为家

爬虫 基础 BS详解

世界诺大四海为家

爬虫基础 BS详解