三体数据爬取
《三体》动漫点评区数据爬取
预备
## 准备的库
import pandas as pd # 数据分析库
import requests # 用于发送 HTTP 请求
import json # json 格式
import numpy as np
短评数据爬取
## Request header
# User-Agent value. Kept as a plain string and wrapped in a dict at each call
# site, because the second positional argument of requests.get() is `params`,
# not `headers`.
headers = 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/'
## Endpoint of the short-review list (first page)
url0 = 'https://api.bilibili.com/pgc/review/short/list?media_id=4315402&ps=20&sort=0'
score_dp = []  # collected short-review scores

# First page.
r = requests.get(url0, headers={'User-Agent': headers})
data_json = json.loads(r.text)
# Iterate over the entries actually returned instead of assuming exactly 20.
for review in data_json['data']['list']:
    score_dp.append(review['score'])

# Follow the `next` value of each response until it is falsy.
# Named `cursor` locally so the builtin next() is not shadowed.
cursor = data_json['data']['next']
while cursor:
    url = url0 + '&cursor=' + str(cursor)
    rr = requests.get(url, headers={'User-Agent': headers})
    data_json_n = json.loads(rr.text)
    for review in data_json_n['data']['list']:
        score_dp.append(review['score'])
    cursor = data_json_n['data']['next']

# Mean of all collected short-review scores.
np.mean(score_dp)
长评数据爬取
# User-Agent value for the long-review requests. Kept as a plain string and
# wrapped in a dict at each call site, because the second positional argument
# of requests.get() is `params`, not `headers`.
headers_cp = 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/'
# Endpoint of the long-review list (first page).
url_cp = 'https://api.bilibili.com/pgc/review/long/list?media_id=4315402&ps=20&sort=0'
score_cp = []  # collected long-review scores

# First page.
r = requests.get(url_cp, headers={'User-Agent': headers_cp})
data_json = json.loads(r.text)
for review in data_json['data']['list']:
    score_cp.append(review['score'])

# Follow the `next` value of each response until it is falsy.
# Named `cursor` locally so the builtin next() is not shadowed.
cursor = data_json['data']['next']
while cursor:
    url = url_cp + '&cursor=' + str(cursor)
    rr = requests.get(url, headers={'User-Agent': headers_cp})
    data_json_n = json.loads(rr.text)
    for review in data_json_n['data']['list']:
        score_cp.append(review['score'])
    cursor = data_json_n['data']['next']

# Mean of all collected long-review scores.
np.mean(score_cp)
综合评分
# Pool the short- and long-review scores into one list and average them.
combined_scores = score_dp + score_cp
np.mean(combined_scores)
结论:
- 短评平均评分 75
- 长评平均评分 41
- 综合平均评分 63