爬取哔哩哔哩网站弹幕
爬取哔哩哔哩网站弹幕
打开哔哩哔哩官网 https://www.bilibili.com/
首先获取视频的 cid:打开想爬取弹幕的视频,查看网页源代码,从中找到 cid。
然后获取浏览器的用户代理(User-Agent)信息。
最后发送请求去爬取弹幕。
"""
pip install pandas
pip install bs4
pip install requests
pip install lxml
"""
import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import re
# Scrape the danmaku (bullet comments) from the XML document at `url`, keep
# only the Chinese characters of each comment and write them to sign.txt.

# Browser-like User-Agent header sent with the request.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'
}

# NOTE(review): `url` is not assigned anywhere in the visible source; it must
# be defined before this point. Presumably it is the danmaku address built
# from the video's cid -- TODO confirm.
response = requests.get(url=url, headers=header)  # send the request to the server
response.encoding = response.apparent_encoding  # decode with the detected charset
data = response.text  # response body as text
soup = BeautifulSoup(data, 'lxml')  # parse with the lxml parser (pip install lxml)
d_list = soup.find_all('d')  # every <d> tag in the document

# One record per <d> tag: its text, the time it was scraped and the source URL.
dlst = []
for d in d_list:
    danmu = {
        '弹幕': d.text,
        '时间': datetime.datetime.now(),
        '路径': url,
    }
    dlst.append(danmu)

# Explicit columns keep the '弹幕' column present even when no <d> tag was
# found; without them an empty result raises KeyError on the lookup below.
df = pd.DataFrame(dlst, columns=['弹幕', '时间', '路径'])

# Compiled once, outside the loop: matches runs of Chinese characters.
pat = re.compile(r'[一-龥]+')

# `with` guarantees the file is closed even if a write fails part-way.
with open('sign.txt', 'w', encoding='utf-8') as f:
    for i in df['弹幕'].values:
        filter_data = pat.findall(i)  # Chinese-only fragments of this comment
        f.write("".join(filter_data))  # appended with no separator
上面的操作会把爬取到的弹幕保存到本地一个名为 sign.txt 的文件中。
然后将上面保存的 txt 文件以词云的形式展示。
"""
pip install pandas
pip install bs4
pip install requests
pip install lxml
pip install jieba
pip install wordcloud
pip install matplotlib
pip install numpy
"""
import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import re
import jieba
from wordcloud import WordCloud
from matplotlib.image import imread
import matplotlib.pyplot as plt
import numpy as np
# header = {
# 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'
# }
# response = requests.get(url=url, headers=header) # 向对方服务器发送请求
# response.encoding = response.apparent_encoding # 设置字符编码
# data = response.text # 获取文本
# soup = BeautifulSoup(data, 'lxml') # pip install lxml 解析
# d_list = soup.find_all('d') # 获取所有的d标签
#
# dlst = []
# for d in d_list: # 循环拿出所有的d标签
# danmu = {}
# danmu['弹幕'] = d.text # 获取文本信息
# danmu['时间'] = datetime.datetime.now()
# danmu['路径'] = url
# dlst.append(danmu)
# df = pd.DataFrame(dlst) # 转换成二维数组,类似于execl表格
# f = open('sign.txt', 'w', encoding='utf-8') # 打开文件
# for i in (df['弹幕']).values: # 循环所有的文本信息
# pat = re.compile(r'[一-龥]+') # 定义过滤数据的规则,所有的汉字
# filter_data = re.findall(pattern=pat, string=i) # 执行过滤操作
# f.write("".join(filter_data)) # 写入文本
# f.close()
# Build a word cloud from the text stored in sign.txt, save it as bili.jpg
# and display it in a matplotlib window.

# `with` closes the file even if reading fails.
with open('sign.txt', 'r', encoding='utf8') as f:
    data = f.read()

# Segment the text with jieba and join the tokens with spaces.
result = " ".join(jieba.lcut(data))

color_mask = imread('demo=jpg.jpg')  # image array passed to WordCloud as the mask
wc = WordCloud(
    # Font file used to draw the words (presumably chosen because it
    # contains Chinese glyphs -- TODO confirm).
    font_path=r'C:\Windows\Fonts\STXINGKA.TTF',
    width=1000,
    height=800,
    mask=color_mask
)
wc.generate(result)
wc.to_file('bili.jpg')

# The original called plt.imread(wc), which loads an image from a file and
# cannot take a WordCloud object; imshow is the call that draws the image.
plt.imshow(wc)
plt.axis('off')  # hide the axes around the picture
plt.show()
得到的效果为