Python requests.Session 协程 下载文件

Python requests.Session 协程 下载文件

 

复制代码
# coding: utf-8

from gevent import monkey

# Apply gevent's monkey patches before the remaining modules (requests,
# time, ...) are imported; gevent recommends patching as early as possible
# so that later imports pick up the cooperative versions.
monkey.patch_all()
from gevent.pool import Pool
import gevent
import requests
import os, sys
import time
import urllib3

# Silence urllib3's InsecureRequestWarning: Downloader issues its requests
# with verify=False, which would otherwise emit this warning.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


class Downloader:
    """Download a list of URLs concurrently into a directory.

    Work is fanned out over a gevent ``Pool`` and every request goes through
    one shared ``requests.Session`` whose connection pool has the same size.
    Each file is stored under the last path segment of its URL (query string
    stripped); files that already exist are skipped.

    After ``run`` returns, ``failed`` lists the URLs that could not be
    downloaded and ``completed_count`` equals ``url_total``.
    """

    def __init__(self, pool_size, retry=3):
        """Create the greenlet pool and the shared HTTP session.

        :param pool_size: number of concurrent greenlets; also used as the
            size of the session's connection pool.
        :param retry: attempts per URL within one pass (at least one attempt
            is always made); also passed to the HTTP adapter as
            ``max_retries``.
        """
        self.pool = Pool(pool_size)
        self.session = self._get_http_session(pool_size, pool_size, retry)
        self.retry = retry  # attempts per URL within one pass
        self.dir = ''  # target directory, set by run()
        self.failed = []  # URLs that failed in the current/last pass
        self.url_total = 0  # number of URLs handed to run()
        self.completed_count = 0  # URLs finished: saved, skipped or given up

    def _get_http_session(self, pool_connections, pool_maxsize, max_retries):
        """Return a ``requests.Session`` with one pooled adapter mounted for
        both ``http://`` and ``https://``."""
        session = requests.Session()
        adapter = requests.adapters.HTTPAdapter(pool_connections=pool_connections, pool_maxsize=pool_maxsize,
                                                max_retries=max_retries)
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        return session

    def run(self, url_list, dir=''):
        """Download every URL in ``url_list`` into ``dir`` and block until done.

        :param url_list: sized collection of URL strings.
        :param dir: target directory, created if missing; an empty string
            means the current working directory.
        """
        self.dir = dir
        if self.dir:
            os.makedirs(self.dir, exist_ok=True)

        self.url_total = len(url_list)
        self.completed_count = 0
        self.failed = []
        print ('total ts count:', self.url_total)
        # Pool.map blocks until every worker has returned, so no separate
        # completion-watcher greenlet is needed (the previous one could hang
        # forever on an empty list).
        self._download(url_list)

    def _download(self, url_list):
        """Run the workers over ``url_list``, re-running failed URLs.

        Failed URLs are retried in further passes for as long as each pass
        makes progress. A pass in which nothing succeeds ends the loop, so
        permanently failing URLs cannot cause unbounded retries; they are
        left in ``self.failed`` and counted as completed.
        """
        pending = list(url_list)
        self.failed = []
        while pending:
            self.pool.map(self._worker, pending)
            still_failed, self.failed = self.failed, []
            if len(still_failed) >= len(pending):
                # No URL of this pass succeeded: give up on the remainder.
                self.failed = still_failed
                self.completed_count += len(still_failed)
                return
            pending = still_failed

    def _worker(self, url):
        """Download one URL, making up to ``self.retry`` attempts.

        On success (or when the target file already exists)
        ``completed_count`` is incremented; otherwise the URL is appended to
        ``self.failed`` for the caller to retry or report.
        """
        # Always try at least once, and never loop endlessly on a zero or
        # negative retry setting.
        attempts = max(1, self.retry)
        for _ in range(attempts):
            try:
                file_name = url.split('/')[-1].split('?')[0]
                file_path = os.path.join(self.dir, file_name)
                if os.path.exists(file_path):
                    print('exist:', file_name)
                else:
                    # NOTE: verify=False disables TLS certificate checks.
                    r = self.session.get(url, timeout=20, verify=False)
                    if not r.ok:
                        raise RuntimeError('download fail')
                    print ('download:', file_name)
                    # Write to a temporary name and rename afterwards so an
                    # interrupted write never leaves a truncated file that a
                    # later pass would mistake for a finished download.
                    tmp_path = file_path + '.part'
                    with open(tmp_path, 'wb') as f:
                        f.write(r.content)
                    os.replace(tmp_path, file_path)
                self.completed_count += 1
                return
            except Exception:
                # Any ordinary error counts as a failed attempt; unlike a bare
                # except this lets KeyboardInterrupt/GreenletExit propagate.
                continue
        print ('[FAIL]%s' % url)
        self.failed.append(url)

    def _check_finish(self):
        """Block (cooperatively) until every URL has been accounted for."""
        while self.completed_count < self.url_total:
            time.sleep(0.01)


if __name__ == '__main__':
    # Sample images fetched when this module is executed as a script.
    sample_urls = [
        'https://pics1.baidu.com/feed/b999a9014c086e0610f3d6bf8bf4d6ff08d1cbf7.jpeg',
        'https://pics7.baidu.com/feed/d53f8794a4c27d1ef06a7b6195290065dfc438ca.jpeg',
    ]
    # The constructor argument is the number of concurrent greenlets.
    Downloader(50).run(sample_urls, './dst_dir')
复制代码

 

posted on   星河赵  阅读(198)  评论(0)  编辑  收藏  举报

相关博文:
阅读排行:
· Manus爆火,是硬核还是营销?
· 终于写完轮子一部分:tcp代理 了,记录一下
· 别再用vector<bool>了!Google高级工程师:这可能是STL最大的设计失误
· 单元测试从入门到精通
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
历史上的今天:
2018-02-06 Docker 常用命令

导航

< 2025年3月 >
23 24 25 26 27 28 1
2 3 4 5 6 7 8
9 10 11 12 13 14 15
16 17 18 19 20 21 22
23 24 25 26 27 28 29
30 31 1 2 3 4 5
点击右上角即可分享
微信分享提示