How do you download a very large file in Python?
How can I download a big file (around 10 GB) from a URL?
It needs to be fast.
When writing crawlers you often run into large files to download, videos for example. The traditional approach of downloading and saving in a single pass is slow, so I wanted to write a module that downloads large files with several synchronized threads.
Libraries used
The module relies on a few simple libraries: requests (every crawler author knows it), threading (multi-threading, obviously essential), and os (needed for file operations).
The main difficulties
One is synchronizing the multiple download threads; the other is resuming an interrupted download from where it left off.
For both problems, downloaders such as Xunlei (迅雷) worked out a solution long ago that we can copy: keep a file that tracks download progress alongside the file being downloaded.
Implementation
The module contains two classes: one handles creating and updating the progress file; the other is the thread task that issues the download requests and writes the data to the file.
The progress-file manager class
The file uses a very simple "="-separated configuration format with four entries: the written byte ranges, the unwritten byte ranges, the currently-being-written byte ranges, and the download URL. The two hard problems described above are both solved through this management file.
The configuration file
writing_range=[(42991616, 46137344)]
unwritten_range=[(46137344, 10633872234)]
written_range=[(0, 42991616)]
url_in_file=https:xxxxxxxxxxxxxxxxxxxxxxxxxxxxx
The main idea: when the download starts, create this file, query the size of the file to be downloaded, and record the whole range under the unwritten entry. The worker threads then, in a synchronized fashion, repeatedly take a small slice out of the unwritten ranges, move it into the writing ranges, request and download that slice, write it into the file, and finally record it under the written ranges. Because the downloaded bytes may end up as several disjoint segments, each entry is stored as a list of ranges, as in the example above.
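To make the bookkeeping concrete, here is a minimal sketch of the idea, under the assumption that each slice is simply taken from the front of the first unwritten interval (the actual class also merges adjacent intervals and persists the lists to disk; take_slice and mark_written are hypothetical names):

def take_slice(unwritten, writing, size=1024 * 1024):
    if not unwritten:
        return None                     # nothing left to claim
    start, end = unwritten[0]
    piece = (start, min(start + size, end))
    if piece[1] == end:
        unwritten.pop(0)                # interval fully consumed
    else:
        unwritten[0] = (piece[1], end)  # shrink the interval from the front
    writing.append(piece)               # slice is now in flight
    return piece

def mark_written(writing, written, piece):
    writing.remove(piece)               # no longer in flight
    written.append(piece)               # recorded as downloaded

unwritten = [(0, 10633872234)]
writing, written = [], []
piece = take_slice(unwritten, writing)  # claims roughly 1 MB: (0, 1048576)
# ... download `piece` with a Range request and write it to disk ...
mark_written(writing, written, piece)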
The splitting and merging of ranges (the main job of this class) does take a few small tricks; if you want to practice first, head to https://leetcode.com/problems/insert-interval/
While writing it, the problem felt familiar, and indeed I had solved something similar on LeetCode.
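For reference, a compact sketch of that insert-and-merge operation, assuming the interval list is kept sorted and disjoint (this mirrors what the module's _concat method does, not its exact code):

def insert_interval(intervals, new):
    left = [iv for iv in intervals if iv[1] < new[0]]       # entirely before
    right = [iv for iv in intervals if iv[0] > new[1]]      # entirely after
    overlap = [iv for iv in intervals if iv[1] >= new[0] and iv[0] <= new[1]]
    if overlap:
        # widen `new` to swallow everything it touches
        new = (min(new[0], overlap[0][0]), max(new[1], overlap[-1][1]))
    return left + [new] + right

print(insert_interval([(0, 3), (5, 9)], (2, 6)))  # -> [(0, 9)]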
The multi-threaded download class
This class needs no complicated processing; its jobs are synchronized reads of the next pending byte range, obtaining the file size, and downloading individual ranges of the file. (Over HTTP(S), the download is split across many requests; each request's headers carry a Range field naming the wanted part of the file, in the form Range: bytes=1024-2048.) To get the file size, first send a request with Range: bytes=0-0; the response headers will include Content-Range (servers that support range requests generally send it), and the number after the slash is the total size of the file.
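A minimal probe along those lines, assuming the server honours Range requests and answers with a Content-Range header such as "bytes 0-0/10633872234" (probe_length is a hypothetical helper name):

import requests

def probe_length(url, headers=None):
    h = dict(headers or {})
    h['Range'] = 'bytes=0-0'  # ask for a single byte
    r = requests.get(url, headers=h)
    # Content-Range looks like "bytes 0-0/10633872234";
    # the part after the slash is the total size
    return int(r.headers['Content-Range'].split('/')[1])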
For thread synchronization, an ordinary lock (the threading.Lock class) is enough; both writing file content and reading or updating the progress data need one. Writing to a specific position in the file is done with the file object's seek(), which works just like moving the file position pointer in C.
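A sketch of such a lock-protected positioned write (write_at is a hypothetical name; it assumes the target file has already been created at its full size):

import threading

file_lock = threading.Lock()

def write_at(filename, offset, content):
    with file_lock:                       # one writer at a time
        with open(filename, 'rb+') as f:  # file must already exist
            f.seek(offset)                # jump to the slice's start offset
            f.write(content)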
One more thing: there is also an outer thread whose job is to create these download worker threads (which run as daemons) and wait for them, as sketched below.
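A rough sketch of that supervisor, assuming the workers share a worker_task function (a hypothetical stand-in for the module's RangeWorker):

import threading

def supervise(worker_task, thread_count=3):
    workers = [threading.Thread(target=worker_task, daemon=True)
               for _ in range(thread_count)]
    for t in workers:
        t.start()
    for t in workers:
        t.join()  # block until every worker has finished its ranges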
Summary
With this, multi-threaded downloading of large files and resuming after interruption are both implemented. If you're interested, try writing it yourself; it's quite fun. If you need something to refer to: https://github.com/HBertram/python-grab. The full source follows.
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 1 16:47:38 2019

@author: Administrator
"""
# Multi-threaded download of a single URL, with resume support.

import requests
import threading
import os
import time


def download(url, filename, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3559.6 Safari/537.36'}):
    t = DownloadWorkerThread(url, filename, headers=headers)
    t.start()
    return t


# Manages one download: spawns the range-worker threads and owns the locks.
class DownloadWorkerThread(threading.Thread):
    thread_count = 5
    file_lock = threading.Lock()      # guards writes to the data file
    fileinfo_lock = threading.Lock()  # guards the progress (.tmp) file

    def __init__(self, url, filename, headers={}, thread_count=3):
        threading.Thread.__init__(self)
        self.filename = filename
        self.url = url
        self.fileinfo_name = filename + ".tmp"
        self.headers = headers
        self.thread_count = thread_count

    def run(self):
        self.range_manager = self.read_range_file()
        print(u"Begin Downloading \nurl= " + self.url + "\nfilename = " + self.filename)
        if self.url.strip() == "":
            return
        tlst = []
        for i in range(self.thread_count):
            t = threading.Thread(target=self.RangeWorker, args=(self,))
            print(u"Start Thread :" + t.name)
            t.daemon = True
            t.start()
            tlst.append(t)
        for t in tlst:
            t.join()

    def write_content(self, content, content_range):
        # Write the downloaded slice at its offset, then record it as done.
        self.file_lock.acquire()
        with open(self.filename, 'rb+') as f:
            f.seek(content_range[0])
            f.write(content)
        self.file_lock.release()
        self.fileinfo_lock.acquire()
        self.range_manager.set_written_range(content_range)
        self.fileinfo_lock.release()

    def read_next_range(self):
        self.fileinfo_lock.acquire()
        time.sleep(0.1)
        r = self.range_manager.get_unwritten_range()
        self.fileinfo_lock.release()
        return r

    def read_range_file(self):
        # Resume from an existing .tmp file, or create the data file plus
        # a fresh progress file.
        self.fileinfo_lock.acquire()
        manager = None
        if os.path.exists(self.fileinfo_name):
            print("read filename " + self.fileinfo_name)
            manager = DownloadWorkerThread.FileInfoManager(self.fileinfo_name, url=self.url)
            self.content_length = manager.get_total_length()
            if self.url.strip() == "":
                self.url = manager.url_in_file
        else:
            self.content_length = self.get_content_length()
            print("create filename_info length:" + str(self.content_length))
            with open(self.filename, "wb+") as f:
                # Preallocate the file to its full size (fix: a bare seek,
                # as in the original, does not extend the file).
                if self.content_length > 0:
                    f.seek(self.content_length - 1)
                    f.write(b'\0')
            manager = DownloadWorkerThread.FileInfoManager(self.fileinfo_name, url=self.url, filesize=self.content_length)
        self.fileinfo_lock.release()
        return manager

    def get_content_length(self):
        # Probe the total size with a tiny Range request; Content-Range
        # looks like "bytes 0-1/10633872234". Retries until the reported
        # size reaches 3 MB (the module targets large files).
        headers = dict(self.headers)  # copy: don't mutate the shared dict
        headers['Range'] = "bytes=0-1"
        length = 0
        while length < 1024 * 1024 * 3:
            time.sleep(3)
            length = int(requests.get(self.url, headers=headers).headers['Content-Range'].split('/')[1])
            print("Get length " + str(length))
        return length

    def RangeWorker(self, downloadWorker):
        # Worker loop: claim a range, download it, write it, repeat.
        while True:
            content_range = downloadWorker.read_next_range()
            if content_range == 0:  # nothing left: the download is finished
                if os.path.exists(self.fileinfo_name):
                    os.remove(self.fileinfo_name)
                print(self.filename + " finished")
                break
            headers = dict(downloadWorker.headers)  # per-thread copy
            headers['Range'] = "bytes=" + str(content_range[0]) + "-" + str(content_range[1] - 1)
            iTryTimes = 0  # fix: initialise outside the retry loop so retries are counted
            while True:
                r = requests.get(downloadWorker.url, headers=headers)
                if r.ok:
                    downloadWorker.write_content(r.content, content_range)
                    print("We are working on " + self.filename + " and now processing : " +
                          str(round(1.0 * content_range[1] / self.content_length * 100, 2)) +
                          "% in size " + str(round(self.content_length / 1024.0 / 1024.0, 2)) + "MB.")
                    break
                else:
                    iTryTimes += 1
                    if iTryTimes > 1:
                        print("Downloading " + downloadWorker.url + " error. Now Exit Thread.")
                        return

    # Keeps the written / writing / unwritten interval lists and persists
    # them to the .tmp progress file.
    class FileInfoManager():

        def __init__(self, filename, url="", filesize=0):
            self.filename = filename
            # Instance attributes (fix: class-level lists would be shared
            # between instances).
            self.url_in_file = ""
            self.writing_range = []
            self.written_range = []
            self.unwritten_range = []
            if not os.path.exists(filename):
                with open(filename, "w") as f:
                    f.write("unwritten_range=[(0," + str(filesize) + ")]\r\n")
                    f.write("writing_range=[]\r\n")
                    f.write("written_range=[]\r\n")
                    f.write("url_in_file=" + url)
                self.unwritten_range.append((0, filesize))
                self.url_in_file = url
            else:
                with open(filename, "r") as f:
                    for l in f.readlines():
                        typ = l.split("=")[0]
                        if typ == "writing_range":
                            # Slices that were mid-download when we stopped
                            # must be downloaded again.
                            typ = "unwritten_range"
                        elif typ == "url_in_file":
                            if url.strip() == "":
                                # Keep the whole URL even if it contains '='.
                                self.url_in_file = l.split("=", 1)[1].strip()
                            else:
                                self.url_in_file = url
                            continue
                        for tup in l.split("=")[1][1:-3].split('),'):
                            if tup == "":
                                continue
                            if tup.find("(") != 0:
                                tup = tup[tup.find("("):]
                            if tup.find(")") != -1:  # fix: strip ')' only if present
                                tup = tup[:tup.find(")")]
                            getattr(self, typ).append(
                                (int(tup.split(",")[0][1:]), int(tup.split(",")[1])))

        def get_total_length(self):
            if len(self.unwritten_range) > 0:
                return self.unwritten_range[-1][1]
            elif len(self.writing_range) > 0:
                return self.writing_range[-1][1]
            elif len(self.written_range) > 0:
                return self.written_range[-1][1]
            return 0

        def _save_to_file(self):
            with open(self.filename, "w") as f:
                f.write("writing_range=" + str(self.writing_range) + "\r\n")
                f.write("unwritten_range=" + str(self.unwritten_range) + "\r\n")
                f.write("written_range=" + str(self.written_range) + "\r\n")
                f.write("url_in_file=" + self.url_in_file)

        def _splice(self, intervals, newInterval):
            # Remove newInterval from the interval list, splitting any
            # interval it only partially covers.
            if len(intervals) == 0:
                return []
            intervals = self._concat(intervals, (0, 0))  # normalise/merge first
            response = []
            for interval in intervals:
                if interval[0] == interval[1]:
                    continue
                if interval[0] > newInterval[1]:
                    response.append(interval)
                elif interval[1] < newInterval[0]:
                    response.append(interval)
                else:
                    max_range = (min(interval[0], newInterval[0]),
                                 max(interval[1], newInterval[1]))
                    if max_range != newInterval:
                        left = (min(max_range[0], newInterval[0]),
                                max(max_range[0], newInterval[0]))
                        right = (min(max_range[1], newInterval[1]),
                                 max(max_range[1], newInterval[1]))
                        if left[0] != left[1]:
                            response.append(left)
                        if right[0] != right[1]:
                            response.append(right)
            return response

        def _concat(self, intervals, newInterval):
            # Insert newInterval into the sorted interval list, merging
            # overlaps (the "Insert Interval" problem).
            if len(intervals) == 0:
                return [newInterval]
            response = [newInterval]
            for interval in intervals:
                if interval[0] == interval[1]:  # fix: skip degenerates before popping
                    continue
                i = response.pop()
                if i[0] > interval[1]:
                    response.append(interval)
                    response.append(i)
                elif i[1] < interval[0]:
                    response.append(i)
                    response.append(interval)
                else:
                    response.append((min(i[0], interval[0]), max(i[1], interval[1])))
            return response

        def get_unwritten_range(self, size=1024 * 1024):
            # Claim a slice of up to `size` bytes: move it from the
            # unwritten list into the writing list, then persist.
            if len(self.unwritten_range) == 0:
                return 0
            r = self.unwritten_range[0]
            r = (r[0], min(r[0] + size, r[1]))
            self.unwritten_range = self._splice(self.unwritten_range, r)
            self.writing_range = self._concat(self.writing_range, r)
            self._save_to_file()
            return r

        def set_written_range(self, content_range):
            # Mark a slice as done: writing -> written, then persist.
            self.writing_range = self._splice(self.writing_range, content_range)
            self.written_range = self._concat(self.written_range, content_range)
            self._save_to_file()


#t = DownloadWorkerThread(r'http://a3.kuaihou.com/ruanjian/ucdnb.zip',
#                         'd:\\ucdnb.zip',
#                         headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3559.6 Safari/537.36'})
#t.start()

if __name__ == '__main__':
    url = input(u"The URL Waiting for downloading:")
    filename = input(u"The Filepath to save:")
    t = download(url, filename)
    while t.is_alive():
        time.sleep(60)
    print("bye")
Link: https://www.jianshu.com/p/e0f42bd3a3ea
Source: Jianshu (简书)
The copyright belongs to the author. For any form of reproduction, please contact the author for authorization and cite the source.