利用python对《乘风破浪的姐姐》可视化
要说今年最火的综艺节目,兴风作浪……,哦,不对;小明历险记……,也不对,哎!算了,接着看吧……
数据
利用爬虫抓取了百度百科和维基百科上姐姐们的公开数据信息。两个网站均为静态的页面,只需要对相应HTML标签进行解析即可获取相应的数据。(更简单的方法也可以直接复制相应的表格信息到本地)
百度百科
PS:仔细一瞧,百度百科上有些信息竟然是错的,比如陈松伶的初舞台评分。
维基百科
维基百科数据可以跟百度百科互补一下,对于评分细节更细致许多。
视频弹幕
一边播放视频,一边按F12打开开发者工具,在Network面板下就能看到弹幕的json数据包不断地向你扔过来,在Headers栏可以查看每个json的请求路径。
https://bullet-ali.hitv.com/bullet/2020/06/30/224134/8766480/0.json
可以看出有日期,即爬取当天的弹幕数据,其余两个应该是节目id和视频id,x.json应该是按照节目的时间来定的,每60s一个数据包。
scrapy startproject sister
cd sister
scrapy genspider danmu bullet-ali.hitv.com
构建数据Item
import scrapy
class DanmuItem(scrapy.Item):
    """Scrapy item describing a single bullet comment (danmu)."""

    # Video id
    vid_id = scrapy.Field()
    # Comment id
    danmu_id = scrapy.Field()
    # User name
    uname = scrapy.Field()
    # Comment text
    content = scrapy.Field()
    # Time
    danmu_time = scrapy.Field()
    # Like count
    up_count = scrapy.Field()
    # Minute (the spelling of the field name is kept: it is the data key)
    danmu_minites = scrapy.Field()
编写爬虫解析代码
# -*- coding: utf-8 -*-
import scrapy
import json
from datetime import datetime
from sister.items import DanmuItem
class DanmuSpider(scrapy.Spider):
    """Spider that fetches the numbered bullet-comment JSON packets of one video.

    Every packet URL has the form
    ``/bullet/<date>/<num1>/<num2>/<page>.json``; each comment found in a
    packet is emitted as a ``DanmuItem``.
    """

    name = 'danmu'
    # allowed_domains = ['bullet-ws.hitv.com']
    start_urls = ['https://bullet-ali.hitv.com']
    # Today's date as 'YYYY/MM/D': zero-padded month, day without padding.
    # Built from a single now() call so year/month and day cannot disagree
    # when the class is loaded around midnight.
    date_str = '{0:%Y/%m/}{0.day}'.format(datetime.now())
    num1 = 112744  # programme parameter (latest episode as of 7/3)
    num2 = 8980904  # video parameter
    # Number of JSON packets to request (pages 0 .. page_count - 1).
    page_count = 38

    def start_requests(self):
        """Yield one request per packet, tagging it with its page number."""
        base_url = 'https://bullet-ali.hitv.com/bullet/%s/%d/%d/%d.json'
        for page in range(self.page_count):
            # Print progress
            print('正在获取第{}页的信息'.format(page))
            url = base_url % (self.date_str, self.num1, self.num2, page)
            yield scrapy.Request(url=url, meta={'step': page}, callback=self.parse)

    def parse(self, response):
        """Turn one JSON packet into ``DanmuItem`` objects.

        A packet without ``data`` or ``items`` (or with either set to null)
        yields nothing instead of raising.
        """
        step = response.meta['step']
        json_data = json.loads(response.body)
        # Extract the comment list, tolerating missing / null containers.
        all_data = (json_data.get('data') or {}).get('items') or []
        print(len(all_data))
        for data in all_data:
            danmu_item = DanmuItem()
            danmu_item['vid_id'] = self.num2
            danmu_item['danmu_id'] = data.get('id')
            danmu_item['uname'] = data.get('uname')
            danmu_item['content'] = data.get('content')
            danmu_item['danmu_time'] = data.get('time')
            danmu_item['up_count'] = data.get('v2_up_count')
            # Pages are 0-based; the stored minute is 1-based.
            danmu_item['danmu_minites'] = step + 1
            yield danmu_item
保存数据pipeline
import pandas as pd
class SisterPipeline(object):
    """Item pipeline that buffers every item and writes them to one CSV file.

    Args:
        output_path: Destination of the tab-separated file written when the
            spider closes. Defaults to ``'danmu_info.csv'``.
    """

    def __init__(self, output_path='danmu_info.csv'):
        self.output_path = output_path
        self.info_list = []

    def process_item(self, item, spider):
        """Store a plain-dict snapshot of ``item`` and pass the item on."""
        self.info_list.append(dict(item))
        return item

    def close_spider(self, spider):
        """Write all buffered items as a UTF-8, tab-separated CSV without index."""
        df = pd.DataFrame(self.info_list)
        df.to_csv(self.output_path, sep='\t', encoding='utf-8', index=False)
保存的数据就是这样婶儿的
可视化
整体
首先,看看姐姐们都来自哪里,可以看出节目组在姐姐们的选择上兼顾到了两岸三地,
而且在民族构成上,也包含了7位少数民族选手,比如“土家族之花”--沈梦辰。
年龄分布情况
职业分布情况
年龄与初评得分的关系很明显了🙂
弹幕热度排行
弹幕词云,看来大家都 喜欢 姐姐
基于这些可视化结果,制作了一个简易的统计大屏进行展示:
每位浪姐
代码
import os
import jieba
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from pyecharts.charts import Page, Sankey, WordCloud, Radar
from pyecharts.components import Image
from pyecharts.options import ComponentTitleOpts
from collections import Counter
from pyecharts.globals import SymbolType
from pyecharts import options as opts
from pyecharts.options.global_options import ThemeType
from pyecharts import options as opts
from collections import Counter
import random
# Show all columns when printing a DataFrame
pd.set_option('display.max_columns', None)
# Show all rows when printing a DataFrame
pd.set_option('display.max_rows', None)
# Show up to 100 characters per cell (pandas default is 50); the full option
# name is used so the key cannot become ambiguous
pd.set_option('display.max_colwidth', 100)
def get_cut_words(content_series, extra_words=None, stopwords_path="data/stopwords.txt"):
    """Segment a Series of comment texts into words with jieba.

    Args:
        content_series: pandas Series of strings; the values are joined with
            '。' and segmented as one text.
        extra_words: Optional iterable of additional words to register with
            jieba. When omitted, the module-level ``my_words_list`` is used
            if it exists (this keeps existing callers working).
        stopwords_path: UTF-8 text file with one stop word per line.

    Returns:
        list of str: segmented words that are not stop words and are at least
        two characters long, in their original order.
    """
    import jieba

    # Load the stop-word table; a set gives O(1) membership tests below.
    with open(stopwords_path, 'r', encoding='utf-8') as f:
        stop_words = {line.strip() for line in f}
    # Custom stop words
    stop_words.update(['第一期', '一堆', '三个', '真的', '哈哈哈', '哈哈哈哈', '啊啊啊'])

    # Register custom keywords so jieba keeps them as single tokens.
    my_words = ['杜华', '辣鸡', '导演组', '节目组', '不公平', '黄圣依', '无杜华版']
    if extra_words is None:
        extra_words = globals().get('my_words_list', [])
    for word in my_words + list(extra_words):
        jieba.add_word(word)

    # Segment
    word_num = jieba.lcut(content_series.str.cat(sep='。'), cut_all=False)
    # Filter out stop words and single-character tokens
    return [w for w in word_num if w not in stop_words and len(w) >= 2]
def show_all():
    """Render the current image, word cloud and radar chart into one HTML page.

    Reads the module-level names ``image1``, ``wc``, ``radar`` and ``name``.
    The page is written to ``data/sister/<name>.html`` and then post-processed
    with BeautifulSoup to position the charts, set a dark background and
    prepend a title block.

    Raises:
        IndexError: if the rendered page contains fewer than four ``div``s.
    """
    page = Page()
    page.add(image1, wc, radar)
    out_html = 'data/sister/%s.html' % name
    page.render(out_html)

    html_path = os.path.join(os.path.abspath("."), out_html)
    with open(html_path, 'r', encoding="utf8") as html:
        html_bf = BeautifulSoup(html, "lxml")

    # Restyle the first four divs in document order.
    divs = html_bf.find_all("div")
    print(len(divs))
    div_styles = [
        "align=\"center\";margin:0 auto;text-align:center;",
        # chart size, position and border
        "width:550px;height:350px;position:absolute;top:120px;left:700px;border-style:solid;border-color:#444444;border-width:0px;",
        "width:700px;height:700px;position:absolute;top:120px;left:20px;border-style:solid;border-color:#444444;border-width:0px;",
        "width:600px;height:400px;position:absolute;top:300px;left:1250px;border-style:solid;border-color:#444444;border-width:0px;",
    ]
    for index, style in enumerate(div_styles):
        divs[index]["style"] = style

    # Page background colour
    body = html_bf.find("body")
    body["style"] = "background-color:#333333;"
    # Prepend the title block
    div_title = "<div align=\"center\" style=\"width:1840px;\">\n<span style=\"font-size:32px;font face=\'黑体\';color:#FFFFFF\"><b> </b></div>"
    body.insert(0, BeautifulSoup(div_title, "lxml").div)

    # Overwrite the rendered file; the context manager closes it.
    with open(html_path, 'w', encoding="utf8") as html:
        html.write(str(html_bf))
# Per-contestant data (tab-separated)
df = pd.read_csv('data/sister_data.csv', encoding='utf-8', sep='\t')
# Bullet-comment data: files 1..8 stacked into one frame.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
df_all = pd.concat(
    [
        pd.read_csv('sister/sister/danmu_info_%d.csv' % i, encoding='utf-8', sep='\t')
        for i in range(1, 9)
    ]
)
# print(df['names'].tolist())
df.sort_values('总分', ascending=False, inplace=True)
# Nicknames, listed in the sorted row order; '|' separates alternatives and
# the whole string is later used as a regex pattern.
df['昵称'] = ['蓝盈莹|盈莹', '黄龄', '丹妮', '孟佳', '梦辰',
            '可唯', '宁静|静静子|静姐', '霏霏', '希怡', '袁咏琳',
            '圣依|依依子', '金晨', '阿朵', '含韵', '白冰',
            '钟丽缇', '茜|茜茜子', '张萌|萌萌子', '婧汐', '丁当',
            '许飞', '刘芸|芸芸子', '吴昕|昕昕子|昕姐|昕昕', '伊能静', '松伶',
            '丽坤', '张雨绮|雨绮|绮绮子', '海陆', '金莎', '王智']
print(df.head(5))
print(df.columns)

# Every nickname as a flat list. It does not depend on the loop variable, so
# it is built once; get_cut_words reads this module-level name.
my_words_list = df.昵称.str.cat(sep='。').replace('|', '。').split('。')

# Axes of the radar chart, each scored from 0 to 25.
c_schema = [
    {"name": "个人特质", "max": 25, "min": 0},
    {"name": "声乐表现力", "max": 25, "min": 0},
    {"name": "成团潜力", "max": 25, "min": 0},
    {"name": "舞台表现力", "max": 25, "min": 0},
]
choices_number = 200

# One HTML page per name; show_all() reads name, image1, wc and radar as
# module-level globals, so they must stay at module level.
for name in df.names.tolist():
    image1 = Image()
    img_src = (
        "../img/%s.jpg" % name  # path relative to the generated html
    )
    image1.add(
        src=img_src,
        style_opts={"width": "345px", "height": "584px", "style": "margin-top: 15px"},
    )
    image1.set_global_opts(
        title_opts=ComponentTitleOpts(
            title_style={"style": "color: white; font-size: 18px; font-weight:bold;"},
            subtitle_style={"style": "color: white;font-size: 12px;"})
    )

    # Radar chart
    value = df.loc[df.names == name, ["个人特质", "声乐表现力", "成团潜力", "舞台表现力"]].values[0]
    data = [{"value": [float(i) for i in value], "name": "分数"}]
    radar = (
        Radar()
        .set_colors(["#4587E7"])
        .add_schema(
            schema=c_schema,
            shape="circle",
            center=["50%", "50%"],
            radius="80%",
            angleaxis_opts=opts.AngleAxisOpts(
                min_=0,
                max_=360,
                is_clockwise=False,
                interval=5,
                axistick_opts=opts.AxisTickOpts(is_show=False),
                axislabel_opts=opts.LabelOpts(is_show=False),
                axisline_opts=opts.AxisLineOpts(is_show=False),
                splitline_opts=opts.SplitLineOpts(is_show=False),
            ),
            radiusaxis_opts=opts.RadiusAxisOpts(
                min_=0,
                max_=25,
                interval=5,
                splitarea_opts=opts.SplitAreaOpts(
                    is_show=True, areastyle_opts=opts.AreaStyleOpts(opacity=1)
                ),
            ),
            polar_opts=opts.PolarOpts(),
            splitarea_opt=opts.SplitAreaOpts(is_show=False),
            splitline_opt=opts.SplitLineOpts(is_show=False),
        )
        .add(
            series_name="分数",
            data=data,
            color="#f9713c",
            areastyle_opts=opts.AreaStyleOpts(opacity=0.1),
            linestyle_opts=opts.LineStyleOpts(width=1),
        )
    )

    # Word cloud of the comments that mention this name's nickname(s)
    tmp = df[df.names == name]
    nickname_pattern = tmp.昵称.values[0]
    print(nickname_pattern)
    # na=False: rows with missing comment text count as "no match" instead of
    # putting NaN into the boolean mask.
    mentions = df_all.content[df_all.content.str.contains(nickname_pattern, na=False)]
    text1 = get_cut_words(content_series=mentions)
    wordCount_dict = Counter(text1)
    # The most frequent words as (word, count) pairs, highest count first
    count_list = wordCount_dict.most_common(choices_number)
    wc = (
        WordCloud()
        .add(series_name="弹幕词云", data_pair=count_list, word_size_range=[20, 100],
             textstyle_opts=opts.TextStyleOpts(font_family="cursive"), shape=SymbolType.DIAMOND)
        .set_global_opts(
            tooltip_opts=opts.TooltipOpts(is_show=True),
        )
    )
    show_all()
- 参考链接: