爬取虎牙小姐姐热舞 - Python
最近疫情在家无聊得一批。爬点虎牙小姐姐的视频养养眼(前面其实有爬过,只是那时候比较懒,用的是顺序爬取。)还有就是,自己也在做视频剪辑,将视频爬下来做素材练练手。爬虫思路一般是确定数据源,然后对数据源进行分析,撸码保存。声明:本文仅做学习交流使用,如有侵权,请联系我删除。
第一种方式,还是用函数写:
import concurrent.futures
import os.path
import re

import requests

# Characters replaced in titles before they are used as file names:
# \ / : * ? " < > |
ILLEGAL_TITLE_CHARS = re.compile(r'[\\/:*?"<>|]')

# Matches links of the form //v.huya.com/play/<digits>.html and captures the id.
VIDEO_ID_PATTERN = re.compile(r'//v\.huya\.com/play/(\d+)\.html')

# Seconds to wait on a silent connection before giving up; without a timeout
# a stalled server would block a worker thread forever.
REQUEST_TIMEOUT = 30


def change_title(orginal_title):
    """Return *orginal_title* with every file-name-unsafe character replaced by '-'."""
    return ILLEGAL_TITLE_CHARS.sub('-', orginal_title)


def get_response(page_url):
    """Fetch *page_url* with a browser user-agent and return the response.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-2xx status (via ``raise_for_status``).
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36',
    }
    response = requests.get(url=page_url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # The encoding is forced to UTF-8. The previous apparent_encoding
    # assignment was overwritten immediately and only cost a charset
    # detection pass over the whole body (expensive for video payloads).
    response.encoding = 'utf-8'
    return response


def save_video(videoRealUrl, videoName, videoQuality):
    """Download *videoRealUrl* and store it as '<videoName> - <videoQuality>.mp4'.

    The file is written into ``./虎牙video/``, which is created on demand.
    Write failures are reported on stdout instead of being raised; download
    failures propagate as ``requests.RequestException``.
    """
    filePath = './虎牙video/'
    # exist_ok avoids the check-then-create race between worker threads.
    os.makedirs(filePath, exist_ok=True)

    videoContent = get_response(page_url=videoRealUrl).content

    target = os.path.join(filePath, videoName + ' - ' + videoQuality + '.mp4')
    try:
        with open(target, mode='wb') as f:
            f.write(videoContent)
    except OSError as error:
        print(f'有异常,请检查!({videoName}: {error})')
    else:
        print(f'正在保存---------------------->{videoName},请稍等!')


def main(page_url):
    """Download every video linked from the listing page at *page_url*.

    A video whose metadata or download fails is reported and skipped so the
    remaining videos of the page are still processed.
    """
    video_category_data = get_response(page_url=page_url).text
    # dict.fromkeys drops repeated ids while keeping page order, so a video
    # that is linked more than once is only downloaded once.
    videoIds = dict.fromkeys(VIDEO_ID_PATTERN.findall(video_category_data))
    for videoId in videoIds:
        videoRequestUrl = f'https://liveapi.huya.com/moment/getMomentContent?&videoId={videoId}&_=1649038571444'
        try:
            moment = get_response(page_url=videoRequestUrl).json()['data']['moment']
            videoName = change_title(moment['title'])
            # Only the first listed definition is downloaded.
            definition = moment['videoInfo']['definitions'][0]
            save_video(definition['url'], videoName, definition['defName'])
        except (requests.RequestException, KeyError, IndexError, TypeError, ValueError) as error:
            print(f'视频 {videoId} 采集失败,已跳过:{error}')


if __name__ == "__main__":
    # The context manager waits for all submitted pages before exiting.
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as app:
        pending = {}
        for page in range(1, 11):
            print(f'--------------------------------------开始采集第{page}页数据!--------------------------------------')
            url = f'https://v.huya.com/g/all?set_id=51&order=hot&page={page}'
            pending[app.submit(main, url)] = page
        # Surface exceptions raised inside worker threads; submit() alone
        # would swallow them silently.
        for future in concurrent.futures.as_completed(pending):
            error = future.exception()
            if error is not None:
                print(f'第{pending[future]}页采集失败:{error}')
第二种方式是面向对象的方式:
import json
import os
import re

import requests

# Characters replaced in titles before they are used as file names:
# \ / : * ? " < > |
ILLEGAL_TITLE_CHARS = re.compile(r'[\\/:*?"<>|]')

# Matches links of the form //v.huya.com/play/<digits>.html and captures the id.
VIDEO_ID_PATTERN = re.compile(r'//v\.huya\.com/play/(\d+)\.html')


def change_title(orginal_title):
    """Return *orginal_title* with every file-name-unsafe character replaced by '-'."""
    return ILLEGAL_TITLE_CHARS.sub('-', orginal_title)


def video_stored_path():
    """Create the ``./虎牙video/`` directory if needed and return its path."""
    filePath = './虎牙video/'
    # exist_ok makes the call safe when the directory already exists.
    os.makedirs(filePath, exist_ok=True)
    return filePath


class huyaVideoSpider():
    """Download the videos linked from one Huya listing page."""

    # Default request headers: a desktop Chrome user-agent.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36',
    }

    def __init__(self, url, headers=headers):
        """Store the listing-page *url* and the request *headers* to send."""
        self.url = url
        # Copy so that per-instance changes never leak into the shared default.
        self.headers = dict(headers)

    def get_response(self, page_url):
        """Fetch *page_url* using this spider's headers and return the response.

        Raises:
            requests.RequestException: on connection errors or a non-2xx
                status (via ``raise_for_status``).
        """
        # The headers were previously stored but never sent.
        response = requests.get(url=page_url, headers=self.headers)
        response.raise_for_status()
        # The encoding is forced to UTF-8. The previous apparent_encoding
        # assignment was overwritten immediately and only cost a charset
        # detection pass over the whole body.
        response.encoding = 'utf-8'
        return response

    def save_video(self, videoContentUrl, videoTitle, videoQuality):
        """Download *videoContentUrl* into '<videoTitle> - <videoQuality>.mp4'.

        Write failures are reported on stdout instead of being raised;
        download failures propagate as ``requests.RequestException``.
        """
        filePath = video_stored_path()
        videoContent = self.get_response(videoContentUrl).content

        target = os.path.join(filePath, videoTitle + ' - ' + videoQuality + '.mp4')
        try:
            with open(target, mode='wb') as f:
                f.write(videoContent)
        except OSError as error:
            print(f'有异常,请检查!({videoTitle}: {error})')
        else:
            print(f'正在保存----------------------->{videoTitle},请稍等!')

    def parse_category_page(self, category_page_url):
        """Download every video linked from the page at *category_page_url*.

        A video whose metadata or download fails is reported and skipped so
        the remaining videos of the page are still processed.
        """
        # Use the argument; it was previously ignored in favour of self.url.
        video_category_data = self.get_response(page_url=category_page_url).text
        # dict.fromkeys drops repeated ids while keeping page order.
        video_ids = dict.fromkeys(VIDEO_ID_PATTERN.findall(video_category_data))
        for video_id in video_ids:
            video_request_url = f'https://liveapi.huya.com/moment/getMomentContent?&videoId={video_id}&_=1649038571444'
            try:
                json_data = json.loads(self.get_response(page_url=video_request_url).text)
                moment = json_data['data']['moment']
                videoTitle = change_title(moment['title'])
                # Only the first listed definition is downloaded.
                definition = moment['videoInfo']['definitions'][0]
                self.save_video(definition['url'], videoTitle, definition['defName'])
            except (requests.RequestException, KeyError, IndexError, TypeError, ValueError) as error:
                print(f'视频 {video_id} 采集失败,已跳过:{error}')

    def run(self):
        """Process the listing page this spider was created with."""
        self.parse_category_page(self.url)


if __name__ == "__main__":
    # The unconditional `break` that used to end this loop after page 1 has
    # been removed so that all pages in the range are actually collected.
    for page in range(1, 11):
        print(f'--------------------------------正在采集第{page}页视频,请稍等!--------------------------------')
        url = f'https://v.huya.com/g/all?set_id=51&order=hot&page={page}'
        app = huyaVideoSpider(url=url)
        try:
            app.run()
        except requests.RequestException as error:
            # A listing page that cannot be fetched should not stop the rest.
            print(f'第{page}页采集失败:{error}')
还有一种方式是根据输入的搜索关键词和指定的页码来下载视频。参考:爬取酷我音乐平台的付费音乐的第二段面向对象代码。