Asynchronous multi-threaded download of videos scraped from a web page
The improved program is as follows:
import requests
import re    # regular-expression module
import uuid  # uuid.uuid4() generates a practically unique random string (used here for file names)

# Import the thread-pool class
from concurrent.futures import ThreadPoolExecutor

# A thread is an OS resource (one process can start many threads), but with raw threads
# it is hard to cap how many run at once; a thread pool keeps the thread count under control.
pool = ThreadPoolExecutor(50)

# The three steps of a crawler

# 1. Send the request
def get_page(url):
    print(f"Starting async task: {url}")
    response = requests.get(url)
    return response

# 2. Parse the data
# Parse the home page and extract the video detail-page IDs
def parse_index(res):
    response = res.result()
    res1 = re.findall('<a href="video_(.*?)"', response.text, re.S)
    # print(res1)
    # detail_url_list = []
    for m_id in res1:
        # Build the detail-page URL
        detail_url = 'https://www.pearvideo.com/video_' + m_id
        # Submit the detail-page URL to get_page
        pool.submit(get_page, detail_url).add_done_callback(parse_detail)
        # detail_url_list.append(detail_url)
    # return detail_url_list

# Parse the detail page and extract the video URL
def parse_detail(res):
    response = res.result()
    movie_url = re.findall('srcUrl="(.*?)"', response.text, re.S)[0]
    pool.submit(get_page, movie_url).add_done_callback(save_movie)
    # return movie_url

# 3. Save the data
def save_movie(res):
    movie_res = res.result()
    # Write the video to a local file
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        f.write(movie_res.content)
        f.flush()

# Test: call the functions and run the crawl
if __name__ == '__main__':  # PyCharm shortcut: type "main" and press Enter
    # Submit the home-page request to get_page asynchronously and hand the result to parse_index
    url = 'https://www.pearvideo.com/'
    pool.submit(get_page, url).add_done_callback(parse_index)  # asynchronous submit
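In practice a request inside get_page can hang or come back as an error page. Below is a minimal, hedged variant; the User-Agent string and the 10-second timeout are assumptions, not part of the original program, and the post does not say whether the site actually requires a browser-like header:

import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # assumed browser-like header, adjust as needed

def get_page(url):
    print(f"Starting async task: {url}")
    # A timeout keeps a stuck request from tying up a pool thread forever (value is arbitrary)
    response = requests.get(url, headers=HEADERS, timeout=10)
    response.raise_for_status()  # raise on 4xx/5xx so the failure surfaces in res.result()
    return response

Any exception raised here is stored in the task's Future and re-raised when the next stage calls res.result(), so failures do not silently disappear.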
The code as actually executed (with debug output) is as follows:
import requests
import re    # regular-expression module
import uuid  # uuid.uuid4() generates a practically unique random string (used here for file names)

# Import the thread-pool class
from concurrent.futures import ThreadPoolExecutor

# A thread is an OS resource (one process can start many threads), but with raw threads
# it is hard to cap how many run at once; a thread pool keeps the thread count under control.
pool = ThreadPoolExecutor(50)

# The three steps of a crawler

# 1. Send the request
def get_page(url):
    print(f"Starting async task: {url}")
    response = requests.get(url)
    return response

# 2. Parse the data
# Parse the home page and extract the video detail-page IDs
def parse_index(res):
    response = res.result()
    res1 = re.findall('<a href="video_(.*?)"', response.text, re.S)
    print(res1)
    # detail_url_list = []
    for m_id in res1:
        # Build the detail-page URL
        detail_url = 'https://www.pearvideo.com/video_' + m_id
        # print(detail_url)
        # Submit the detail-page URL to get_page
        pool.submit(get_page, detail_url).add_done_callback(parse_detail)
        # detail_url_list.append(detail_url)
    # return detail_url_list

# Parse the detail page and extract the video URL
def parse_detail(res):
    response = res.result()
    print(response.text)
    movie_url = re.findall('srcUrl="(.*?)"', response.text, re.S)[0]
    print(movie_url)
    pool.submit(get_page, movie_url).add_done_callback(save_movie)
    # return movie_url

# 3. Save the data
def save_movie(res):
    movie_res = res.result()
    # Write the video to a local file
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        f.write(movie_res.content)
        f.flush()

# Test: call the functions and run the crawl
if __name__ == '__main__':  # PyCharm shortcut: type "main" and press Enter
    # Submit the home-page request to get_page asynchronously and hand the result to parse_index
    url = 'https://www.pearvideo.com/'
    pool.submit(get_page, url).add_done_callback(parse_index)  # asynchronous submit
    # print(pool.submit(get_page, url))
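Since the .mp4 responses can be fairly large, a streaming download writes the file in chunks instead of holding the whole body in memory. The helper below is only a sketch, not part of the original post; it assumes the video URL is fetched directly rather than routed through get_page, and the chunk size is an arbitrary choice:

import uuid
import requests

def download_movie(movie_url):
    # stream=True defers the body download so it can be written chunk by chunk
    with requests.get(movie_url, stream=True, timeout=30) as response:
        response.raise_for_status()
        with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
            for chunk in response.iter_content(chunk_size=64 * 1024):
                f.write(chunk)

With this in place, parse_detail would end with pool.submit(download_movie, movie_url) instead of the get_page/save_movie pair.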
The data is passed between the stages as follows.
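Each pool.submit(...) returns a concurrent.futures.Future, and add_done_callback hands that Future to the next stage, which pulls the finished requests.Response out of it with res.result(). So the home-page response flows from get_page into parse_index, each detail-page response into parse_detail, and each video response into save_movie. A minimal sketch of the same pattern outside the crawler, just to make the hand-off visible:

from concurrent.futures import ThreadPoolExecutor

pool = ThreadPoolExecutor(2)

def produce(x):
    return x * 10              # runs in a worker thread

def consume(fut):
    value = fut.result()       # return value of produce, or re-raises its exception
    print(f"got {value}")      # -> got 70

# submit() returns a Future; consume fires once produce has finished
pool.submit(produce, 7).add_done_callback(consume)
pool.shutdown(wait=True)       # wait for the work to finish before exiting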