豆瓣爬虫解析——BeautifulSoup+CSS选择器
在 Tag
或 BeautifulSoup
对象的 .select()
方法中传入字符串参数, 即可使用CSS选择器的语法找到tag:
CSS选择器
soup.select("title")
# [<title>The Dormouse's story</title>]
soup.select("p:nth-of-type(3)")
# [<p class="story">...</p>]
通过tag标签逐层查找
soup.select("body a")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.select("html head title")
# [<title>The Dormouse's story</title>]
找到某个tag标签下的直接子标签
soup.select("head > title")
# [<title>The Dormouse's story</title>]
soup.select("p > a")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.select("p > a:nth-of-type(2)")
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
soup.select("p > #link1")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
soup.select("body > a")
# []
找到兄弟节点标签:
soup.select("#link1 ~ .sister")
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.select("#link1 + .sister")
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
通过CSS的类名查找:
soup.select(".sister")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.select("[class~=sister]")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
通过tag的id查找:
soup.select("#link1")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
soup.select("a#link2")
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
同时用多种CSS选择器查询元素:
soup.select("#link1,#link2")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
通过是否存在某个属性来查找:
soup.select('a[href]')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
通过属性的值来查找:
soup.select('a[href="http://example.com/elsie"]')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
soup.select('a[href^="http://example.com/"]')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.select('a[href$="tillie"]')
# [<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.select('a[href*=".com/el"]')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
通过语言设置来查找:
multilingual_markup = """
<p lang="en">Hello</p>
<p lang="en-us">Howdy, y'all</p>
<p lang="en-gb">Pip-pip, old fruit</p>
<p lang="fr">Bonjour mes amis</p>
"""
multilingual_soup = BeautifulSoup(multilingual_markup, 'html.parser')
multilingual_soup.select('p[lang|=en]')
# [<p lang="en">Hello</p>,
# <p lang="en-us">Howdy, y'all</p>,
# <p lang="en-gb">Pip-pip, old fruit</p>]
返回查找到的元素的第一个
soup.select_one(".sister")
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
上一篇的代码,改用css选择器来编写
import re  # regular expressions (stdlib)

import pandas as pd  # tabular assembly / CSV export (used at the end of the script)
import requests  # fetch the page
from bs4 import BeautifulSoup  # parse the page
# Fetch the Bilibili popular-ranking page and parse it into a soup.
# timeout guards against requests hanging forever; raise_for_status fails
# fast on HTTP errors instead of silently parsing an error page.
resp = requests.get('https://www.bilibili.com/v/popular/rank/all', timeout=10)  # target page URL
resp.raise_for_status()
html_doc = resp.content
page_content = BeautifulSoup(html_doc, 'html.parser')  # parse the HTML
print(page_content.prettify())  # BeautifulSoup object, pretty-printed with standard indentation
type(page_content)  # REPL residue: a no-op expression when run as a script
bs4.BeautifulSoup
注:CSS选择器可以在 bs4.BeautifulSoup 对象上使用,也可以在 Tag 对象上使用(即 .select() 方法两者都支持,见本文开头说明)。
# Look up elements by CSS class name.
# Titles: each ranked video's title anchor carries class "title".
title = page_content.select(".title")
title

title_list = []
url_list = []
for tag in title:
    title_list.append(tag.get_text())       # visible title text
    url_list.append('http:' + tag['href'])  # protocol-relative href -> full URL
print("OK!")
# Uploader names: the raw text carries leading whitespace, so the regex
# matches from the first non-whitespace character to the end of the line.
up_name = page_content.select(".up-name")
up_name

pattern = re.compile(r'\S(.*)')  # first non-whitespace char onward (reused below)
upname_list = []
for node in up_name:
    match = pattern.search(node.get_text())
    upname_list.append(match.group(0))  # group(0): whole match, i.e. text with leading space stripped
print("OK!")
# View count lives in the 1st <span> of each .detail-state block,
# danmaku count in the 2nd.
bofangliang = page_content.select(".detail-state > span:nth-of-type(1)")
bofangliang
danmuliang = page_content.select(".detail-state > span:nth-of-type(2)")
danmuliang

# View counts
play_list = []
danmu_list = []
for node in bofangliang:
    play_list.append(pattern.search(node.get_text()).group(0))  # strip leading whitespace
print("OK!")

# Danmaku counts
for node in danmuliang:
    danmu_list.append(pattern.search(node.get_text()).group(0))
print("OK!")
# Assemble the scraped columns into a DataFrame and export as CSV.
# utf_8_sig writes a BOM so Excel opens the Chinese headers correctly.
frame = pd.DataFrame({
    'title': title_list,
    'url': url_list,
    'up': upname_list,
    '播放量': play_list,
    '弹幕量': danmu_list,
})
frame.to_csv('00.csv', header=True, index=False, encoding="utf_8_sig")
代码中涉及到正则表达式的使用,该篇不做说明。