大文件分片下载并发

import requests
from concurrent.futures import ThreadPoolExecutor
import os
from pathlib import Path
from loguru import logger
import traceback
from tqdm import tqdm
from threading import Lock
from functools import partial

# Alternative download target (a large .tar archive), kept for reference:
# url = "https://scontent.xx.fbcdn.net/m1/v/t6/An_YmP5OIPXun-vu3hkckAZZ2s4lPYoVkiyvCcWiVY21mu1Ng5_1HeCa2CWiSTsskj8HQ8bN013HxNpYDdSC_7jWQq_svcg.tar?ccb=10-5&oh=00_AfBn8XrMhiHu6w1KuS1X8rkuLzzZJnRs8B9jFMvVRfQnfg&oe=64659C28&_nc_sid=fb0754"
#
# tar_path = Path("/Users/chennan/Desktop/sa_000020.tar")
# fetching_path = Path(f"{tar_path.as_posix()}.fetch")

# Remote resource to download and the local file it is written into.
url = "https://images.pexels.com/photos/15983035/pexels-photo-15983035.jpeg"
tar_path = Path("/Users/chennan/Desktop/pexels-photo-15983035.jpeg")
# "<target>.fetch" marker file: the script creates it before downloading and
# removes it once the download has finished.
fetching_path = Path(tar_path.as_posix() + ".fetch")

# State shared between the worker threads.
# NOTE(review): `lock` and `downloaded` are not referenced by the code visible
# in this file; confirm whether anything else still relies on them.
lock = Lock()
downloaded = 0
# One tqdm progress bar per chunk, appended by insert_data().
pbar_threads = []

def insert_data(info):
    """Download one byte range of ``url`` and write it into ``tar_path``.

    The target file must already exist: it is opened in ``'rb+'`` mode so the
    bytes written by other chunks are left untouched.

    Args:
        info: A ``(headers, start, end)`` tuple. ``headers`` is sent with the
            GET request and is expected to carry the ``Range`` header of this
            chunk; ``start`` is the file offset the response body is written
            at; ``end`` is used, together with ``start``, only to size this
            chunk's progress bar.

    Raises:
        requests.HTTPError: If the server answers with an error status, or if
            a ``Range`` header was sent but the answer is not
            ``206 Partial Content``.
    """
    headers, start, end = info
    pbar_thread = tqdm(total=end - start)
    # Registered globally so the caller can sum the per-chunk progress later.
    pbar_threads.append(pbar_thread)
    try:
        with requests.get(url, stream=True, headers=headers) as response:
            response.raise_for_status()
            # A server that ignores the Range header replies 200 with the
            # whole file; writing that body at `start` would corrupt the
            # target, so refuse anything but a partial response.
            if 'Range' in headers and response.status_code != 206:
                raise requests.HTTPError(
                    f"expected 206 Partial Content for {headers!r}, "
                    f"got {response.status_code}",
                    response=response,
                )
            with tar_path.open('rb+') as f:
                # Write this chunk at its own offset inside the file.
                f.seek(start)
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    if chunk:
                        f.write(chunk)
                        pbar_thread.update(len(chunk))
    finally:
        # Close the bar even when the request or the write fails.
        pbar_thread.close()


def get_file_length():
    """Return the size in bytes of the resource at ``url``.

    Issues a streamed GET, reads the ``Content-Length`` response header and
    closes the response without reading the body.

    Returns:
        int: The value of the ``Content-Length`` header.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the response carries no ``Content-Length`` header.
    """
    # TODO: switching to requests.head may be more efficient; still untested.
    # The context manager releases the connection, which a streamed response
    # otherwise keeps open until its body is consumed.
    with requests.get(url, stream=True) as req:
        req.raise_for_status()
        return int(req.headers['content-Length'])


def fetch_one(content_length, all_thread=64):
    """Split ``content_length`` bytes into contiguous chunks for ranged GETs.

    The chunks cover the file exactly once, in order and without overlap.
    No more chunks than bytes are produced, so small files never yield empty
    or inverted ranges; the last chunk absorbs the division remainder.

    Args:
        content_length: Total size of the remote file in bytes.
        all_thread: Maximum number of chunks to produce (default 64). This is
            the number of work items, not the size of the thread pool.

    Yields:
        tuple: ``(headers, start, end)`` where ``[start, end)`` is the
        half-open slice of the file covered by the chunk (so ``end - start``
        is its size in bytes) and ``headers`` holds the matching ``Range``
        request header.
    """
    if content_length <= 0:
        return
    chunk_count = max(1, min(all_thread, content_length))
    part = content_length // chunk_count  # size of every chunk but the last
    for i in range(chunk_count):
        start = part * i
        end = content_length if i == chunk_count - 1 else start + part
        # HTTP byte ranges are inclusive on both ends, hence `end - 1`.
        headers = {
            'Range': f'bytes={start}-{end - 1}',
        }
        yield headers, start, end


if __name__ == '__main__':
    content_length = get_file_length()
    # Pre-allocate the target at its final size: the workers open it with
    # 'rb+' and each writes at its own offset.
    if not tar_path.exists():
        with tar_path.open('wb') as f:
            f.seek(content_length - 1)
            f.write(b'\0')
    # Marker file that stays on disk for as long as the download is unfinished.
    if not fetching_path.exists():
        with fetching_path.open("wb") as fs:
            fs.write(b"\0")

    try:
        with ThreadPoolExecutor(max_workers=16) as pool:
            # Executor.map only re-raises a worker's exception when its result
            # is retrieved; drain the iterator so a failed chunk aborts the
            # run instead of being reported as a successful download.
            list(pool.map(insert_data, fetch_one(content_length)))
        # Summary bar: total of the bytes reported by the per-chunk bars.
        with tqdm(total=content_length) as pbar:
            progress = sum(pbar_thread.n for pbar_thread in pbar_threads)
            pbar.update(progress)
        fetching_path.unlink()
        logger.info("下载完成")
    except Exception:
        logger.error(f"{traceback.format_exc()}")

posted @ 2023-05-05 23:59  公众号python学习开发  阅读(25)  评论(0)  编辑  收藏  举报