视频地址:https://www.bilibili.com/bangumi/play/ss39462?spm_id_from=333.851.b_62696c695f7265706f72745f616e696d65.52
弹幕地址:固定的 URL 前缀 + 视频的 cid + ".xml"(cid 可在视频页面源码中搜索 "cid" 得到),例如:https://comment.bilibili.com/428471132.xml
数据获取部分
# Complete script: download the danmaku (bullet comments) of one video
# and store them, together with their attributes, in comments.csv.
import requests
from bs4 import BeautifulSoup
import pandas as pd

# cid of the video; the danmaku XML lives at a fixed URL built from it.
cid = 428471132
url = "https://comment.bilibili.com/{}.xml".format(cid)

# Fetch the data. The timeout keeps the script from hanging forever, and
# raise_for_status() prevents an HTTP error page from being parsed as danmaku.
response = requests.get(url, timeout=10)
response.raise_for_status()
response.encoding = "utf-8"

# Parse the data: every <d> element is one danmaku. Elements without a "p"
# attribute are skipped so that texts and attributes stay aligned and
# .split() is never called on None.
soup = BeautifulSoup(response.text, "lxml")
datas = [data for data in soup.select("d") if data.get("p")]

# Text content of each danmaku.
comments = [data.text for data in datas]

# Attribute information carried by the comma-separated "p" attribute:
# appearance time, mode, font, colour, send time, danmaku pool, user ID,
# rowID, and one more field whose meaning is unknown.
info_comments = [data.get("p").split(",") for data in datas]

# Build the table: one row per danmaku, one column per attribute.
columns = ["出现时间点", "模式", "字体", "颜色", "发送时间", "弹幕池", "用户ID", "rowID", "未知参数"]
comment_datas = pd.DataFrame(info_comments, columns=columns)

# Combine attributes and text.
comment_datas["comments"] = comments

# Store the data. utf-8-sig writes a BOM so that spreadsheet tools detect
# the encoding of the Chinese text.
comment_datas.to_csv("comments.csv", encoding="utf-8-sig")
print("finish...")
数据分析部分
一 绘制词云图
# Load the data
import pandas as pd

# comments.csv was written with the DataFrame index as its first, unnamed
# column; read it back as the index so it does not show up as a spurious
# "Unnamed: 0" data column.
comment_datas = pd.read_csv("comments.csv", encoding="utf-8-sig", index_col=0)
print(comment_datas)
## Draw the word cloud
# Relies on `pd` and `comment_datas` from the data-loading step.
from itertools import chain

import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Get the data. Empty danmaku come back from the CSV as NaN (a float), which
# jieba cannot tokenise, so drop them and make sure everything is a string.
comments = comment_datas["comments"].dropna().astype(str)

# Word segmentation.
jieba.load_userdict("hong.txt")  # load the user-defined dictionary
comments_cut = comments.apply(jieba.lcut)  # tokenise every danmaku

# Remove stop words. The stop list is read into a set with one entry per
# line, so the filter below is an exact-word test. Testing against the raw
# file contents as a single string would be a substring test and would also
# drop every token that merely occurs inside some stop word.
with open("stoplist.txt", "r", encoding="utf-8") as f:
    stop_words = {line.strip() for line in f}
stop_words.add("●")

# Whitespace-only tokens (spaces, newlines) are dropped as well.
comments_after = comments_cut.apply(
    lambda words: [w for w in words if w.strip() and w not in stop_words]
)

# Word frequency statistics. chain.from_iterable flattens the per-danmaku
# token lists without depending on tkinter's private _flatten helper.
results = list(chain.from_iterable(comments_after))
word_count = pd.Series(results).value_counts()

# Draw the word cloud (outline image can be cut out at https://tool.lu/cutout/).
pic = plt.imread("aixin.jpg")  # image used as the word cloud mask
# Raw string: the backslashes of the Windows path must not be read as escapes.
word_cloud = WordCloud(
    mask=pic,
    background_color='white',
    font_path=r"C:\Windows\Fonts\simhei.ttf",
)
word_cloud.fit_words(word_count.to_dict())
plt.imshow(word_cloud)
plt.axis('off')
plt.show()
二 分析弹幕数量与日期,时间的关系
# Analyse how the number of danmaku relates to users, dates and weekdays.
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt

# Load the data.
comment_datas = pd.read_csv("comments.csv", encoding="utf-8-sig")
# "发送时间" is passed to datetime.fromtimestamp, i.e. it is treated as a POSIX
# timestamp and rendered in local time.
comment_datas["发送时间"] = comment_datas["发送时间"].apply(
    lambda x: datetime.fromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S')
)
# Parsed once and reused by both the per-day and the per-weekday analysis.
dates = pd.to_datetime(comment_datas["发送时间"])

## Danmaku per user
userID = comment_datas["用户ID"]
# How many danmaku each user sent.
userID_count = userID.value_counts()
# How many users sent a given number of danmaku.
userID_count_count = userID_count.value_counts()
# Sorted by the number of danmaku sent.
userID_count_count_sort = userID_count_count.sort_index()
print(userID_count_count_sort)

# Total number of users in every group after the first six (positional slice).
num = userID_count_count_sort.iloc[6:]
print(num.sum())

## Bar chart of the first six groups
num = userID_count_count_sort.iloc[:6]
plt.style.use('ggplot')
plt.rcParams['font.sans-serif'] = 'SimHei'
# Size the x axis from the data (there may be fewer than six groups) and
# label each bar with the danmaku count it stands for instead of 0..5.
positions = range(len(num))
plt.bar(positions, num)
plt.xticks(positions, num.index)
plt.xlabel("弹幕数量")
plt.ylabel("用户数量")
plt.title("弹幕发布数量分布图")
plt.show()

## Number of danmaku per calendar day
# Drop the time-of-day part so that counts are grouped per date.
num = dates.dt.date.value_counts().sort_index()

# Line chart.
positions = range(len(num))
plt.figure(figsize=(16, 9))
plt.plot(positions, num)
plt.xticks(positions, num.index, rotation=45)
plt.ylabel("弹幕数量")
plt.xlabel("日期变化")
plt.title("弹幕发布数量随日期变化图")
plt.show()

### Number of danmaku per day of the week
# dt.weekday numbers the days Monday=0 ... Sunday=6, so the tick labels must
# start with Monday. reindex() guarantees all seven days are present (with a
# count of 0 where there is no data) so the labels always line up.
weekday_labels = ["周一", "周二", "周三", "周四", "周五", "周六", "周日"]
date_count = dates.dt.weekday.value_counts().reindex(range(7), fill_value=0)

plt.figure(figsize=(16, 9))
plt.plot(range(7), date_count)
plt.xticks(range(7), weekday_labels, rotation=45)
plt.ylabel("弹幕数量")
plt.xlabel("日期变化")
plt.title("弹幕发布数量随日期变化图")
plt.show()