BiliBili视频信息爬虫

代码如下:

import requests
import random
from bs4 import BeautifulSoup
import json
import pandas as pd
import os
import pickle
import time
import threading
from fake_useragent import UserAgent


# bilibili视频号转换算法
# bilibili video-id conversion (av number -> "BV..." string)
def Bv2Av(avno):
    """Encode a numeric av id into bilibili's 12-character BV id.

    Note: despite the (historical) name, this performs the av -> BV
    direction; the name is kept so existing callers keep working.

    Args:
        avno: the numeric av id (int).

    Returns:
        The corresponding 'BV...' string id.
    """
    # Base-58 alphabet used by bilibili's id-obfuscation scheme.
    table = 'fZodR9XQDSUm21yCkr6zBqiveYah8bt4xsWpHnJE7jL5VG3guMTKNPAwcF'
    # Output-template positions that receive the six base-58 digits,
    # ordered least-significant digit first.
    s = [11, 10, 3, 8, 4, 6]
    xor = 177451812   # fixed XOR mask of the scheme
    add = 8728348608  # fixed additive offset of the scheme

    x = (avno ^ xor) + add
    # Template with the fixed characters 'BV1', '4', '1', '7' already in
    # place; the six blanks at positions listed in `s` are filled below.
    r = list('BV1  4 1 7  ')
    for i in range(6):
        r[s[i]] = table[x // 58 ** i % 58]
    return ''.join(r)


class BLBL(object):
    """Scraper for a single bilibili video: page HTML plus the stat API.

    Combines the upload time, title and tags scraped from the video page
    with the counters (views, likes, ...) returned by the stat API.
    """

    def __init__(self, url, bvn, avn, cookie=None, referer=None):
        """
        Args:
            url: base video-page URL (e.g. 'https://www.bilibili.com/video/').
            bvn: the video's BV id string, appended to `url`.
            avn: the video's numeric av id, appended to the stat API URL.
            cookie: optional Cookie header value.
            referer: optional Referer header value.
        """
        self.base_url = url
        # NOTE(review): this is the legacy stat endpoint; the current API
        # lives under /x/web-interface/archive/stat — confirm it still works.
        self.api_url = "http://api.bilibili.com/archive_stat/stat?aid="
        self.cookie = cookie
        self.referer = referer
        self.avn = str(avn)
        self.bvn = bvn
        self.accept = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3'
        self.accept_Encoding = 'gzip, deflate, br'
        self.accept_Language = 'zh-CN,zh;q=0.9,en;q=0.8'
        # Random User-Agent per instance to reduce the chance of blocking.
        self.user_agent = str(UserAgent(use_cache_server=False).random)  # "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) "

    def html(self, url):
        """Fetch `url` with browser-like headers and return a parsed soup."""
        base_headers = {
            'Accept': self.accept,
            'Accept-Encoding': self.accept_Encoding,
            'Accept-Language': self.accept_Language,
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': self.cookie,
            'Host': 'www.bilibili.com',
            'Referer': self.referer,
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': self.user_agent
        }
        # Request the page. requests' `timeout` is in SECONDS, not ms:
        # the original value of 3000 (~50 minutes) was clearly meant as
        # milliseconds, so use 30 seconds instead.
        base_response = requests.get(url, headers=base_headers, timeout=30)
        # Parse the HTML body.
        html = base_response.text
        soup = BeautifulSoup(html, "html.parser")
        return soup

    def run(self):
        """Scrape one video.

        Returns:
            {title: info_dict} on success, where info_dict merges the stat
            API payload with 'times' (upload time) and 'tags'; None when the
            video does not exist ('archive state is incorrect') or when
            parsing the page fails.
        """
        html = self.html(self.base_url + self.bvn)
        # Same seconds-vs-milliseconds fix as in html(): 10 s, not 2000 s.
        otherinf = requests.get(self.api_url + self.avn, timeout=10)
        j = json.loads(otherinf.text)
        if j.get('message', 0) == 'archive state is incorrect':
            return None  # deleted / non-existent video
        tag = []
        try:
            # Upload time sits right after the breadcrumb element.
            times = html.find(name="span", attrs={'class': 'a-crumbs'}).next_sibling.text
            # NOTE(review): first <span> on the page — fragile; confirm it is
            # still the title element in the current page layout.
            title = html.find("span").text
            for i in html.find(name="div", attrs={"class": "video-data"}).find_all(name="a"):
                tag.append(i.text)
            # Flatten the API response: lift 'data' to the top level.
            data = j['data']
            j.pop('data')
            j.update(data)
            j['times'] = times
            j['tags'] = tag
            allinf = {title: j}
            print(allinf)
            # Randomized delay to be polite / avoid rate-limiting.
            time.sleep(random.randint(3, 6))
            return allinf
        except Exception as e:
            # Best-effort: log and fall through (implicitly returns None).
            print(e)

if __name__ == '__main__':
    url = "https://www.bilibili.com/video/"
    vinf = {}
    # Crawl a contiguous range of av ids, converting each to its BV id.
    for av in range(10000001, 10010000):
        blbl = BLBL(url, Bv2Av(av), av)
        tmp = blbl.run()
        if tmp is not None:  # identity check, not `!= None`
            try:
                vinf.update(tmp)
            except Exception as e:
                print(e)
    # Write the results once, without clobbering an existing file.
    if not os.path.exists('./final.csv'):
        pd.DataFrame(vinf).T.to_csv('./final.csv', encoding="utf_8_sig")

后续可视化

https://www.cnblogs.com/Do-n/p/13385991.html

posted @ 2020-07-27 15:53  -拂石-  阅读(468)  评论(0)    收藏  举报