Searching Bilibili with Python and downloading the resulting videos

The loop over search results is not finished yet, so only the first result gets downloaded; later this will be reworked into an interface that can be called on its own. The script also merges the audio and video streams, which requires an ffmpeg installation to be set up first.
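
Before running anything it is worth confirming that ffmpeg can actually be invoked. Below is a minimal sketch of such a check, assuming ffmpeg is on the PATH (the script further down hard-codes a Windows path to ffmpeg.exe instead):

import shutil
import subprocess

# Quick environment check: locate an ffmpeg binary and print its version.
# Assumes ffmpeg is on PATH; the script below uses a hard-coded full path instead.
ffmpeg_path = shutil.which('ffmpeg')
if ffmpeg_path is None:
    raise SystemExit('ffmpeg not found - install it and add it to PATH first')
subprocess.run([ffmpeg_path, '-version'], check=True)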

 

2021/1/23

Update log:

1. Finished downloading every video on the search results page

2. Added error handling for results that have no downloadable resource

3. After merging, the un-merged source files are now deleted (a standalone sketch of the merge step follows below)
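
For reference, the merge step boils down to an ffmpeg stream copy (no re-encoding). A minimal standalone sketch, with placeholder file names and ffmpeg assumed to be on PATH:

import subprocess

# Mux a video-only stream and an audio-only stream into one file without re-encoding.
# The file names are placeholders; the script derives them from the video title.
subprocess.run([
    'ffmpeg', '-y',                 # -y: overwrite the output file if it already exists
    '-i', 'video_only.mp4',         # video stream downloaded from the page
    '-i', 'audio_only.mp3',         # audio stream downloaded from the page
    '-vcodec', 'copy',
    '-acodec', 'copy',
    'merged.mp4',
], check=True)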

 

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author  : 黑羽青衣
# @File    : ${NAME}.py

import io
import json
import os
import re
import subprocess
import sys
import time
import urllib.parse

import requests
from bs4 import BeautifulSoup

# Force UTF-8 output so Chinese titles print correctly on Windows consoles.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')


class BiliBili(object):
    # Searches Bilibili for a keyword and downloads the video/audio streams of each result.
    def __init__(self, url, keyword):
        self.url = url
        self.keyword = keyword

    def html(self, url):
        # Fetch the search results page and return its HTML.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.74 Safari/537.36 Edg/79.0.309.43',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
        }
        response = requests.get(url, headers=headers)
        return response.text

    def get_video_html(self, url):
        # Fetch the HTML of a single video page.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.74 Safari/537.36 Edg/79.0.309.43',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
            # 'Cookie': cookie
        }
        response = requests.get(url, headers=headers)
        return response.text

    def get_video_info(self, html):
        # The stream URLs are embedded in the page as a window.__playinfo__ JSON blob.
        result = re.findall('<script>window.__playinfo__=(.*?)</script>', html, re.S)[0]
        html_data = json.loads(result)
        download_video_url = html_data['data']['dash']['video'][0]['backup_url'][0]
        return download_video_url

    def get_audio_info(self, html):
        # Same JSON blob, but take the first audio stream instead.
        result = re.findall('<script>window.__playinfo__=(.*?)</script>', html, re.S)[0]
        html_data = json.loads(result)
        download_audio_url = html_data['data']['dash']['audio'][0]['backup_url'][0]
        return download_audio_url

    def search_video_info(self, html):
        # Parse the search results page and map each video title to its page URL.
        soup = BeautifulSoup(html, "html.parser")
        video_info = {}
        for tag in soup.find_all('div', class_='info'):
            title = tag.find('a', class_='title').get_text()
            people_num = tag.find('span', class_='so-icon watch-num').get_text()
            up_name = tag.find('a', class_='up-name').get_text()
            video_url = tag.find('a').get('href')
            video_url = video_url.replace('//', '')
            video_info[title] = video_url
        return video_info

    def search_video(self, html):
        video_info = self.search_video_info(html)
        self.run_video(video_info)

    def run_search(self):
        # Fetch the search results, then download every video found on the page.
        html = self.html(self.url)
        self.search_video(html)

    def run_video(self, video_info):
        # For each result, download the video and audio streams, then merge them.
        video_size = 0
        audio_size = 0
        for title, video_url in video_info.items():
            video_html = self.get_video_html('https://' + video_url)
            try:
                download_video_url = self.get_video_info(video_html)
                download_audio_url = self.get_audio_info(video_html)
            except (IndexError, KeyError):
                # Some results have no playable stream; skip them instead of crashing.
                print('No downloadable resource, skipping:', title)
                continue
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Referer': 'https://' + video_url,
                'Accept-Encoding': "gzip, deflate, br",
                'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
                'Connection': 'keep-alive',
                # 'Cookie': cookie
            }
            try:
                video_content = requests.get(download_video_url, stream=True, headers=headers)
                mp4_file_size = int(video_content.headers['content-length'])
                if video_content.status_code == 200:
                    print('[File size]: %0.2f MB' % (mp4_file_size / 1024 / 1024))
                    with open(title + '.mp4', mode='wb') as mp4:
                        for chunk in video_content.iter_content(chunk_size=1024):
                            if chunk:
                                mp4.write(chunk)
                                video_size += len(chunk)  # bytes downloaded so far
            except Exception:
                print('Video download failed:', title)
            try:
                audio_content = requests.get(download_audio_url, stream=True, headers=headers)
                mp3_file_size = int(audio_content.headers['content-length'])
                if audio_content.status_code == 200:
                    print('[File size]: %0.2f MB' % (mp3_file_size / 1024 / 1024))
                    with open(title + '.mp3', mode='wb') as mp3:
                        for chunk in audio_content.iter_content(chunk_size=1024):
                            if chunk:
                                mp3.write(chunk)
                                audio_size += len(chunk)
            except Exception:
                print('Audio download failed:', title)

            print('Saving:', title)
            if os.path.exists(title + '.mp4'):
                if self.video_audio_merge_single(title):
                    continue
            time.sleep(60)  # pause between videos to avoid hammering the server

    def video_audio_merge_single(self, video_name):
        # Merge the downloaded video and audio streams with ffmpeg (stream copy, no re-encode).
        print('Merging started:', video_name)
        ffm = r"D:\sofware\ffmpeg-4.3.1-2021-01-01-full_build\bin\ffmpeg.exe"
        command = ffm + ' -i "{}.mp4" -i "{}.mp3" -vcodec copy -acodec copy "{}.mp4"'.format(
            video_name, video_name, video_name + '(合)')
        print(command)
        # run() waits for ffmpeg to finish before the source files are deleted.
        subprocess.run(command, shell=True)
        print('Merging finished:', video_name)
        os.remove(f'{video_name}.mp3')
        os.remove(f'{video_name}.mp4')
        return True


if __name__ == '__main__':
    url = 'https://search.bilibili.com/all?'
    keyword = 'Python'
    keyword = urllib.parse.quote(keyword)
    param = 'keyword=' + keyword + '&from_source=nav_searchs'
    url = url + param
    BB = BiliBili(url, keyword)
    BB.run_search()
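
As a rough sketch of the "callable interface" mentioned at the top, the search-and-download flow could be wrapped in a single function. download_by_keyword is a hypothetical name, and the sketch assumes it lives in the same file as the class above:

def download_by_keyword(keyword):
    # Build the search URL for the given keyword and run the downloader.
    base_url = 'https://search.bilibili.com/all?'
    quoted = urllib.parse.quote(keyword)
    search_url = base_url + 'keyword=' + quoted + '&from_source=nav_searchs'
    BiliBili(search_url, keyword).run_search()


# Example call (downloads every result on the first page):
# download_by_keyword('Python 爬虫')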

 

posted @ 2021-01-19 21:57  黑羽青衣