B站爬虫解析——BeautifulSoup+CSS选择器

在 Tag 或 BeautifulSoup 对象的 .select() 方法中传入字符串参数, 即可使用CSS选择器的语法找到tag:

CSS选择器

soup.select("title")
# [<title>The Dormouse's story</title>]
soup.select("p:nth-of-type(3)")
# [<p class="story">...</p>]

通过tag标签逐层查找

soup.select("body a")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie"  id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

soup.select("html head title")
# [<title>The Dormouse's story</title>]

找到某个tag标签下的直接子标签

soup.select("head > title")
# [<title>The Dormouse's story</title>]

soup.select("p > a")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie"  id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

soup.select("p > a:nth-of-type(2)")
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

soup.select("p > #link1")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

soup.select("body > a")
# []

找到兄弟节点标签:

soup.select("#link1 ~ .sister")
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie"  id="link3">Tillie</a>]

soup.select("#link1 + .sister")
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

通过CSS的类名查找:

soup.select(".sister")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

soup.select("[class~=sister]")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

通过tag的id查找:

soup.select("#link1")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

soup.select("a#link2")
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

同时用多种CSS选择器查询元素:

soup.select("#link1,#link2")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

通过是否存在某个属性来查找:

soup.select('a[href]')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

通过属性的值来查找:

soup.select('a[href="http://example.com/elsie"]')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

soup.select('a[href^="http://example.com/"]')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

soup.select('a[href$="tillie"]')
# [<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

soup.select('a[href*=".com/el"]')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

通过语言设置来查找:

multilingual_markup = """
 <p lang="en">Hello</p>
 <p lang="en-us">Howdy, y'all</p>
 <p lang="en-gb">Pip-pip, old fruit</p>
 <p lang="fr">Bonjour mes amis</p>
"""
multilingual_soup = BeautifulSoup(multilingual_markup)
multilingual_soup.select('p[lang|=en]')
# [<p lang="en">Hello</p>,
#  <p lang="en-us">Howdy, y'all</p>,
#  <p lang="en-gb">Pip-pip, old fruit</p>]

返回查找到的元素的第一个

soup.select_one(".sister")
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

上一篇的代码,改用css选择器来编写

import re  # regular expressions

import pandas as pd  # DataFrame / CSV export (used at the end of the script)
import requests  # fetch page data
from bs4 import BeautifulSoup  # parse page
# Fetch the bilibili "popular rank" page.
# A timeout is required: requests.get without one can hang indefinitely.
resp = requests.get('https://www.bilibili.com/v/popular/rank/all', timeout=10)
resp.raise_for_status()  # fail fast on HTTP errors instead of parsing an error page
html_doc = resp.content
page_content = BeautifulSoup(html_doc, 'html.parser')  # parse the HTML
print(page_content.prettify())  # BeautifulSoup object printed with standard indentation
type(page_content)

bs4.BeautifulSoup

注:CSS选择器(.select() 方法)可以对 bs4.BeautifulSoup 和 bs4.Tag 类型使用!

# Look up elements by CSS class name
# Video titles
title = page_content.select(".title")
title
# Text content of every matched tag
title_list = [t.get_text() for t in title]
# hrefs on the rank page are protocol-relative ("//www...") -> make them absolute
url_list = ['http:' + t['href'] for t in title]
print("OK!")
# Uploader names
up_name = page_content.select(".up-name")
up_name

upname_list = []
# Matches from the first non-whitespace character to the end of the line,
# effectively stripping the leading whitespace around the scraped text.
pattern = re.compile(r'\S(.*)')

for up in up_name:
    m = pattern.search(up.get_text())
    # Fix: if the text is empty/all-whitespace, search() returns None and the
    # original m.group(0) raised AttributeError. Append '' instead so the list
    # stays aligned with title_list for the final DataFrame.
    upname_list.append(m.group(0) if m else '')

print("OK!")
# Play count: first <span> inside each .detail-state element
bofangliang = page_content.select(".detail-state > span:nth-of-type(1)")
bofangliang

# Danmaku count: second <span> inside each .detail-state element
danmuliang = page_content.select(".detail-state > span:nth-of-type(2)")
danmuliang
# play count
# danmaku count

play_list = []
danmu_list = []

for play in bofangliang:
    m = pattern.search(play.get_text())
    # Guard against all-whitespace text (search() -> None); '' keeps the
    # column aligned with the other lists for the DataFrame below.
    play_list.append(m.group(0) if m else '')
print("OK!")

for danmu in danmuliang:
    m = pattern.search(danmu.get_text())
    danmu_list.append(m.group(0) if m else '')
print("OK!")

# Fix: `pd` was used without importing pandas (added to the imports at the
# top of the file). utf_8_sig (UTF-8 with BOM) keeps Chinese headers readable in Excel.
pd.DataFrame({'title': title_list,
              'url': url_list,
              'up': upname_list,
              '播放量': play_list,
              '弹幕量': danmu_list}).to_csv('00.csv', header=True, index=False, encoding="utf_8_sig")

代码中涉及到正则表达式的使用,该篇不做说明。

posted on 2022-03-19 11:00  cookie的笔记簿  阅读(188)  评论(0编辑  收藏  举报