# Python爬取b站视频 (Download Bilibili videos with Python)

import json
import os
import subprocess
import time

import requests
import re



class BLBL(object):
    """Download one Bilibili video page's audio/video streams and merge them.

    The page HTML embeds a ``window.__playinfo__`` JSON object; the first
    audio and first video entries under ``data.dash`` are downloaded to
    ``<title>.mp3`` / ``<title>.mp4`` and then combined by ffmpeg into
    ``<title>(合).mp4``. ffmpeg must be installed and reachable on PATH.

    Args:
        url: Address of the video playback page.
        cookie: Value sent as the ``Cookie`` header when fetching the page.
        referer: Value sent as the ``Referer`` header on every request.
    """

    # (connect, read) timeouts in seconds so a stalled server cannot hang the
    # script forever; the read timeout applies between received chunks.
    REQUEST_TIMEOUT = (10, 60)
    # Size of each chunk written to disk while streaming a download.
    CHUNK_SIZE = 1024 * 1024
    # Characters removed from the page title before it is used as a file name:
    # the ones that are illegal in Windows file names, plus spaces.
    _TITLE_STRIP = str.maketrans('', '', '\\/:*?"<>| ')

    def __init__(self, url, cookie, referer):
        self.base_url = url
        # Cookie header value
        self.cookie = cookie
        # Referer header value
        self.referer = referer
        # Request header values shared by the page request and the downloads
        self.accept = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3'
        # NOTE(review): requests can only decode 'br' responses when a brotli
        # package is installed -- confirm, or drop 'br' if pages come back garbled.
        self.accept_Encoding = 'gzip, deflate, br'
        self.accept_Language = 'zh-CN,zh;q=0.9,en;q=0.8'
        # Header *value* only; the header name is supplied by the dict key.
        self.user_agent = "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko)"

    def html(self):
        """Fetch the video page and return its HTML source as text."""
        # Headers sent with the initial page request; per the original author,
        # without them the full page source is not returned (anti-scraping).
        base_headers = {
            'Accept': self.accept,
            'Accept-Encoding': self.accept_Encoding,
            'Accept-Language': self.accept_Language,
            'Cache-Control': 'no-cache',
            'Cookie': self.cookie,
            'Referer': self.referer,
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': self.user_agent
        }
        base_response = requests.get(
            self.base_url, headers=base_headers, timeout=self.REQUEST_TIMEOUT)
        return base_response.text

    def xin_xi(self, html):
        """Extract the title and stream URLs from the page HTML.

        Args:
            html: HTML source of the video page.

        Returns:
            A ``(title, audio_url, video_url)`` tuple. ``title`` has
            file-name-unsafe characters and spaces removed; the URLs are the
            first ``backupUrl`` of the first audio and first video entry.

        Raises:
            IndexError: If the play-info script or the title span is missing
                (same exception type the bare ``[0]`` lookups used to raise).
            KeyError: If the play-info JSON lacks the expected keys.
        """
        playinfo_match = re.search(
            r'<script>window\.__playinfo__=(.*?)</script>', html, re.S)
        if playinfo_match is None:
            raise IndexError('window.__playinfo__ script not found in page HTML')
        title_match = re.search(r'<span class="tit">(.*?)</span>', html)
        if title_match is None:
            raise IndexError('<span class="tit"> title not found in page HTML')

        title = title_match.group(1).translate(self._TITLE_STRIP).strip()
        dash = json.loads(playinfo_match.group(1))['data']['dash']
        # Audio stream URL
        audio_url = dash['audio'][0]['backupUrl'][0]
        # Video stream URL
        video_url = dash['video'][0]['backupUrl'][0]
        return title, audio_url, video_url

    def _download(self, url, path, headers):
        """Stream ``url`` to the file ``path`` in chunks instead of buffering it."""
        with requests.get(url, headers=headers, stream=True,
                          timeout=self.REQUEST_TIMEOUT) as response:
            # Fail before creating the file so an HTTP error page is never
            # saved as if it were media.
            response.raise_for_status()
            with open(path, mode='wb') as f:
                for chunk in response.iter_content(chunk_size=self.CHUNK_SIZE):
                    f.write(chunk)

    def video(self, html):
        """Download the audio and video streams of the page, then merge them.

        Args:
            html: HTML source of the video page, as returned by ``html()``.
        """
        # Title, audio URL and video URL parsed out of the page
        title, audio_url, video_url = self.xin_xi(html)
        # Headers sent with the stream download requests
        download_headers = {
            'User-Agent': self.user_agent,
            'Referer': self.referer,
            'Origin': 'https://www.bilibili.com',
            'Accept': self.accept,
            'Accept-Encoding': self.accept_Encoding,
            'Accept-Language': self.accept_Language
        }
        self._download(audio_url, title + '.mp3', download_headers)
        self._download(video_url, title + '.mp4', download_headers)
        print('正在保存:', title)
        self.video_audio_merge_single(title)

    def run(self):
        """Fetch the page, download both streams and merge them.

        The download yields two files, one audio and one video, which only
        form a complete video once merged; ffmpeg does the merging, so it has
        to be installed and on PATH beforehand.
        """
        html = self.html()
        self.video(html)
        print('爬取成功')

    def video_audio_merge_single(self, video_name):
        """Merge ``<video_name>.mp4`` and ``<video_name>.mp3`` with ffmpeg.

        The result is written to ``<video_name>(合).mp4``. The two input files
        are deleted only after ffmpeg has exited successfully.

        Args:
            video_name: Base name (without extension) of the downloaded files.

        Raises:
            subprocess.CalledProcessError: If ffmpeg exits with a non-zero
                status; the input files are left in place in that case.
            FileNotFoundError: If the ffmpeg executable cannot be found.
        """
        print("视频合成开始:", video_name)
        video_path = f'{video_name}.mp4'
        audio_path = f'{video_name}.mp3'
        # Argument list, no shell: the parentheses in the output name and any
        # special characters in the title are passed to ffmpeg verbatim.
        command = [
            'ffmpeg',
            '-i', video_path,
            '-i', audio_path,
            '-vcodec', 'copy',
            '-acodec', 'copy',
            f'{video_name}(合).mp4',
        ]
        # Block until ffmpeg is done rather than guessing with a fixed sleep,
        # so the inputs are never removed while they are still being read.
        subprocess.run(command, check=True)
        print("视频合成结束:", video_name)
        os.remove(audio_path)
        os.remove(video_path)

if __name__ == '__main__':
    # Address of the video playback page to download.
    url = 'https://www.bilibili.com/video/BV1yy4y1i766'
    referer = 'https://space.bilibili.com/'
    # Paste the Cookie header value copied from a logged-in browser session.
    cookie = ''
    blbl = BLBL(url, cookie, referer)
    blbl.run()

 

# posted @ 2020-12-02 11:15  时光哥哥  阅读(839)  评论(0)  编辑  收藏  举报