BiliBili Video Info Scraper
The script converts AV numbers to BV IDs, requests each video's page together with its stats from the archive_stat API, and collects everything into final.csv. The code is as follows:
import requests
import random
from bs4 import BeautifulSoup
import json
import pandas as pd
import os
import pickle
import time
import threading
from fake_useragent import UserAgent
# BiliBili video ID conversion algorithm (converts an AV number to its BV ID)
def Av2Bv(avno):
    table = 'fZodR9XQDSUm21yCkr6zBqiveYah8bt4xsWpHnJE7jL5VG3guMTKNPAwcF'
    tr = {}
    for i in range(58):
        tr[table[i]] = i
    s = [11, 10, 3, 8, 4, 6]
    xor = 177451812
    add = 8728348608

    def dec(x):
        # BV string -> AV number (kept for reference, not used below)
        r = 0
        for i in range(6):
            r += tr[x[s[i]]] * 58 ** i
        return (r - add) ^ xor

    def enc(x):
        # AV number -> BV string
        x = (x ^ xor) + add
        r = list('BV1  4 1 7  ')  # 12-character template; the slots at positions s get filled in
        for i in range(6):
            r[s[i]] = table[x // 58 ** i % 58]
        return ''.join(r)

    return enc(avno)
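# How the conversion works, in brief: the AV number is obfuscated with the xor/add
# constants, written as six base-58 digits using `table`, and those digits are dropped
# into positions `s` of the 12-character template 'BV1  4 1 7  '; `dec` inverts the
# process. A quick check in an interactive session (sample AV number, output omitted):
#   >>> Av2Bv(170001)   # returns the corresponding 'BV...' string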
class BLBL(object):
    def __init__(self, url, bvn, avn, cookie=None, referer=None):
        self.base_url = url
        self.api_url = "http://api.bilibili.com/archive_stat/stat?aid="
        self.cookie = cookie
        self.referer = referer
        self.avn = str(avn)
        self.bvn = bvn
        self.accept = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3'
        self.accept_Encoding = 'gzip, deflate, br'
        self.accept_Language = 'zh-CN,zh;q=0.9,en;q=0.8'
        # random User-Agent; a fixed string such as
        # "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko)" also works
        self.user_agent = str(UserAgent(use_cache_server=False).random)

    def html(self, url):
        base_headers = {
            'Accept': self.accept,
            'Accept-Encoding': self.accept_Encoding,
            'Accept-Language': self.accept_Language,
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': self.cookie,
            'Host': 'www.bilibili.com',
            'Referer': self.referer,
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': self.user_agent
        }
        # request the page (note: requests timeouts are given in seconds)
        base_response = requests.get(url, headers=base_headers, timeout=3000)
        # parse the returned HTML
        html = base_response.text
        soup = BeautifulSoup(html, "html.parser")
        return soup
    def run(self):
        html = self.html(self.base_url + self.bvn)
        otherinf = requests.get(self.api_url + self.avn, timeout=2000)
        j = json.loads(otherinf.text)
        # print(otherinf.text[otherinf.text.index("data")-1:-1])
        if j.get('message', 0) == 'archive state is incorrect':
            # the video does not exist or is not publicly visible
            return None
        tag = []
        try:
            # publish time, title and tags are scraped from the video page
            times = html.find(name="span", attrs={'class': 'a-crumbs'}).next_sibling.text
            title = html.find("span").text
            # print(title)
            for i in html.find(name="div", attrs={"class": "video-data"}).find_all(name="a"):
                tag.append(i.text)
            # flatten the API response: move the 'data' fields up one level
            data = j['data']
            j.pop('data')
            j.update(data)
            j['times'] = times
            j['tags'] = tag
            allinf = {title: j}
            print(allinf)
            # random pause between requests to avoid hammering the site
            time.sleep(random.randint(3, 6))
            return allinf
        except Exception as e:
            print(e)
if __name__ == '__main__':
    url = "https://www.bilibili.com/video/"
    vinf = {}
    for i in range(10000001, 10010000):
        blbl = BLBL(url, Av2Bv(i), i)
        tmp = blbl.run()
        if tmp is not None:
            try:
                vinf.update(tmp)
            except Exception as e:
                print(e)
    # with open('./final.pkl', 'wb+') as fout:
    #     pickle.dump(vinf, fout)
    if "final.csv" not in os.listdir(os.getcwd()):
        pd.DataFrame(vinf).T.to_csv('./final.csv', encoding="utf_8_sig")
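After the crawl finishes, the collected records sit in final.csv (one row per video, indexed by title). A minimal sketch for loading the file back for inspection, assuming it was produced by the script above:

import pandas as pd

df = pd.read_csv('./final.csv', index_col=0, encoding='utf_8_sig')
print(df.shape)    # number of collected videos x number of fields
print(df.columns)  # stat fields returned by the API plus 'times' and 'tags'
print(df.head())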