使用 urllib 结合 concurrent.futures 实现多线程分块下载文件。
示例:
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# @Time: 2020/12/16 10:42
# @Author:zhangmingda
# @File: urllib_multi_download.py
# @Software: PyCharm
# Description: test of downloading a single file with multiple threads
#              using the urllib module
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import quote
from urllib.request import urlopen
from urllib.request import Request
import json
import math
import os
import shutil


class DownLoader(object):
    """Download one URL as byte-range parts in a thread pool, then merge them.

    Part files are written to the current working directory as
    ``<filename>.<part number>`` and concatenated into ``<filename>``.
    """

    def __init__(self):
        self.part_size = 1024 * 1024 * 10  # bytes requested per part
        self.part_thread_num = 10  # max_workers of the thread pool
        self.BUFFER_SIZE = 64 * 1024  # bytes copied per read/write call

    @staticmethod
    def _filename_from_url(url):
        """Return the last path segment of *url*, ignoring any query string."""
        # Only '?' is cut off: download() keeps '?' as a URL delimiter when
        # quoting, whereas '#' is percent-encoded there and therefore treated
        # as part of the path.
        return os.path.basename(url.split('?', 1)[0])

    @staticmethod
    def _plan_ranges(length, chunk_size):
        """Return the inclusive ``(offset, end_bytes)`` pair of every part."""
        return [
            (start, min(start + chunk_size, length) - 1)
            for start in range(0, length, chunk_size)
        ]

    def download_part(self, encode_url, part_filename, offset, end_bytes):
        """Fetch one byte range of the URL into its own part file.

        :param encode_url: percent-encoded network address
        :param part_filename: name of the part file to write
        :param offset: first byte to download (inclusive)
        :param end_bytes: last byte to download (inclusive)
        :return: ``{part_filename: ok}`` where ``ok`` is True only when the
            part file has exactly the expected number of bytes
        """
        # Build the Range request header.
        range_header = {'Range': 'bytes=%s-%s' % (offset, end_bytes)}
        print(range_header)
        expected_file_size = end_bytes - offset + 1
        part_req = Request(encode_url, headers=range_header)

        # Open the response first so that a failed request does not leave an
        # empty part file behind.
        with urlopen(part_req) as req_fd, \
                open(part_filename, 'wb') as local_part_fd:
            shutil.copyfileobj(req_fd, local_part_fd, self.BUFFER_SIZE)

        actual_file_size = os.stat(part_filename).st_size
        cur_task_ret = expected_file_size == actual_file_size
        if cur_task_ret:
            print('%s 与预期块儿文件大小相符' % part_filename)
        else:
            print('%s 与预期块儿文件大小 不符,预期%s字节,实际得到%s 字节' % (
                part_filename, expected_file_size, actual_file_size))
        return {part_filename: cur_task_ret}

    def download(self, url):
        """Download *url* in parallel parts and merge them into one file.

        The target file is named after the last path segment of *url* and is
        created in the current working directory. Part files are removed only
        when the merged file size equals the reported Content-Length.

        :param url: un-encoded source URL
        :raises RuntimeError: if the response has no Content-Length header,
            if the number of part results differs from the number of parts,
            or if any part has an unexpected size
        """
        finally_filename = self._filename_from_url(url)
        # Percent-encode the URL while keeping its delimiters intact.
        encode_url = quote(url, safe=";/?:@&=+$,")
        print(encode_url)

        # Issue a request only to learn the total content length.
        req = Request(encode_url)
        with urlopen(req) as fp:
            # The ``headers`` mapping is available on every urlopen response.
            print(list(fp.headers.items()))
            length = fp.headers.get('Content-Length')
        if length is None:
            raise RuntimeError(
                "%s has no Content-Length header, cannot split into parts"
                % encode_url)
        length = int(length)
        print(type(length))
        print('length:', length)

        # Work out the byte range and part file name of every part.
        part_ranges = self._plan_ranges(length, self.part_size)
        chunk_count = len(part_ranges)
        pool_args_list = [
            (encode_url, finally_filename + '.' + str(part_num),
             offset, end_bytes)
            for part_num, (offset, end_bytes) in enumerate(part_ranges, 1)
        ]

        # Download all parts in the pool and collect the per-part results.
        # Leaving the ``with`` block shuts the pool down, waiting for any
        # part that is still running.
        multi_chunk_download_result = {}
        with ThreadPoolExecutor(max_workers=self.part_thread_num) as tp:
            thread_list = [
                tp.submit(self.download_part, *args)
                for args in pool_args_list
            ]
            for part_thread in as_completed(thread_list):
                multi_chunk_download_result.update(part_thread.result())

        # Download summary.
        print('下载总结')
        # Report a mismatch between collected results and planned parts.
        if len(multi_chunk_download_result) != chunk_count:
            raise RuntimeError(
                "%s part miss,expect=%d,actual=%d" % (
                    finally_filename, chunk_count,
                    len(multi_chunk_download_result)))
        # Every part reported back; fail if any of them has a wrong size.
        for part_filename, succeeded in multi_chunk_download_result.items():
            if not succeeded:
                raise RuntimeError(
                    "%s part download has fail" % part_filename)

        # All parts are fine: concatenate them in part order.
        with open(finally_filename, 'wb') as local_fd:
            for _, part_filename, _, _ in pool_args_list:
                with open(part_filename, 'rb') as part_fd:
                    shutil.copyfileobj(part_fd, local_fd, self.BUFFER_SIZE)

        final_size = os.stat(finally_filename).st_size
        if length == final_size:
            print('%s 下载完成,文件大小相符' % finally_filename)
            for part_filename in multi_chunk_download_result:
                os.remove(part_filename)
        else:
            print('%s 下载完成,但大小不符,content_length:%s 下载后大小 %s' % (
                finally_filename, length, final_size))


if __name__ == '__main__':
    downloader = DownLoader()
    url = 'https://ks3-cn-beijing.ksyun.com/zhangmingda/111-3333333.Python安装与命令行操作.mp4'
    print(url)
    downloader.download(url)
posted on 2020-12-16 15:09 zhangmingda 阅读(226) 评论(0) 编辑 收藏 举报