Python爬取视频(其实是一篇福利)
窗外下着小雨,作为单身程序员的我逛着逛着发现了一个好东西,来自知乎问题「你都用 Python 来做什么?」的第一个高亮答案。
到上面去看了看,地址都是明文的,得,赶紧开始吧。
下载流式文件,requests库中请求的stream设为True就可以啦,文档在此。
先找一个视频地址试验一下:
# -*- coding: utf-8 -*-
import requests


def download_file(url, path):
    """Stream the resource at ``url`` into the file at ``path``.

    The response object itself is used as the context manager here, which
    only works when the installed requests version implements
    ``__enter__``/``__exit__`` on the response; otherwise the ``with``
    statement raises ``AttributeError: __exit__``.

    :param url: address of the file to download.
    :param path: local file that receives the bytes (overwritten).
    :raises KeyError: if the server sends no ``content-length`` header.
    """
    with requests.get(url, stream=True) as r:
        chunk_size = 1024
        # Total size announced by the server; read here but not used yet.
        content_size = int(r.headers['content-length'])
        print('下载开始')
        with open(path, "wb") as f:
            # stream=True defers the body, so it is pulled chunk by chunk.
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)


if __name__ == '__main__':
    url = '就在原帖...'
    path = '想存哪都行'
    download_file(url, path)
遭遇当头一棒:
AttributeError: __exit__
这文档也会骗人的么!
看样子是我装的这个版本的requests里,响应对象没有实现上下文管理器需要的__exit__方法(文档描述的应该是较新版本的行为)。既然只是为了保证r最后被close、把连接释放回连接池,那就使用contextlib的closing好了:
# -*- coding: utf-8 -*-
import requests
from contextlib import closing


def download_file(url, path):
    """Stream the resource at ``url`` into the file at ``path``.

    ``contextlib.closing`` calls ``r.close()`` when the block exits, so the
    response is closed even if it is not a context manager itself.

    :param url: address of the file to download.
    :param path: local file that receives the bytes (overwritten).
    :raises KeyError: if the server sends no ``content-length`` header.
    """
    with closing(requests.get(url, stream=True)) as r:
        chunk_size = 1024
        # Total size announced by the server; read here but not used yet.
        content_size = int(r.headers['content-length'])
        print('下载开始')
        with open(path, "wb") as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
程序正常运行了,不过我盯着这文件,怎么大小不见变啊,到底是完成了多少了呢?还是要让下好的内容及时存进硬盘,还能省点内存是不是:
# -*- coding: utf-8 -*-
import requests
from contextlib import closing
import os


def download_file(url, path):
    """Stream ``url`` into ``path``, forcing every chunk onto the disk.

    :param url: address of the file to download.
    :param path: local file that receives the bytes (overwritten).
    :raises KeyError: if the server sends no ``content-length`` header.
    """
    with closing(requests.get(url, stream=True)) as r:
        chunk_size = 1024
        # Total size announced by the server; read here but not used yet.
        content_size = int(r.headers['content-length'])
        print('下载开始')
        with open(path, "wb") as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
                # Push Python's buffer to the OS, then ask the OS to write
                # it out, so the file size on disk grows with every chunk.
                f.flush()
                os.fsync(f.fileno())
文件以肉眼可见的速度在增大,真心疼我的硬盘,还是最后一次性写入硬盘吧,在程序里记个数显示进度就好了:
def download_file(url, path):
    """Stream ``url`` into ``path`` and print the percentage downloaded.

    Progress is derived from the number of bytes actually received, so a
    short final chunk does not push the figure past 100%.

    :param url: address of the file to download.
    :param path: local file that receives the bytes (overwritten).
    :raises KeyError: if the server sends no ``content-length`` header.
    """
    with closing(requests.get(url, stream=True)) as r:
        chunk_size = 1024
        content_size = int(r.headers['content-length'])
        print('下载开始')
        with open(path, "wb") as f:
            received = 0
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
                # Count after the write so the figure never runs ahead of
                # what has been handed to the file.
                received += len(chunk)
                # float() keeps this a true division on Python 2 as well.
                print('已下载{0:%}'.format(float(received) / content_size))
结果就很直观了:
已下载2.579129%
已下载2.581255%
已下载2.583382%
已下载2.585508%
心怀远大理想的我怎么会只满足于这一个呢,写个类一起使用吧:
# -*- coding: utf-8 -*-
import requests
from contextlib import closing
import time


def download_file(url, path):
    """Stream ``url`` into ``path`` and report progress after every chunk.

    :param url: address of the file to download.
    :param path: local file that receives the bytes (overwritten).
    :raises KeyError: if the server sends no ``content-length`` header.
    """
    with closing(requests.get(url, stream=True)) as r:
        chunk_size = 1024 * 10
        content_size = int(r.headers['content-length'])
        print('下载开始')
        with open(path, "wb") as f:
            p = ProgressData(size=content_size, unit='Kb', block=chunk_size)
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
                p.output()


class ProgressData(object):
    """Print download progress, one report per finished block.

    :param block: size of one block in bytes.
    :param size: total size of the download in bytes.
    :param unit: label printed after every amount (e.g. ``'Kb'``).
    :param file_name: optional name printed in front of every report.
    """

    def __init__(self, block, size, unit, file_name='', ):
        self.file_name = file_name
        # Both byte counts are divided by 1000 before being displayed.
        self.block = block / 1000.0
        self.size = size / 1000.0
        self.unit = unit
        self.count = 0
        self.start = time.time()

    def output(self):
        """Record one more finished block and print the current state."""
        self.end = time.time()
        self.count += 1
        elapsed = self.end - self.start
        # Speed of the block that just finished; 0 if the clock did not move.
        speed = self.block / elapsed if elapsed > 0 else 0
        self.start = time.time()
        loaded = self.count * self.block

        if loaded >= self.size:
            print(u'%s下载完成\r\n' % self.file_name)
            return

        # Only reached while loaded < size, so size is never 0 here.
        progress = round(loaded / self.size, 4)
        # Field order: name, loaded/total, percentage, then the speed.
        print(u'{0}下载进度{1:.2f}{2}/{3:.2f}{4} {5:.2%} 下载速度{6:.2f}{7}/s'.format(
            self.file_name, loaded, self.unit,
            self.size, self.unit, progress, speed, self.unit))
        # Right-aligned bar of slashes that shrinks as the download advances.
        print('%50s' % ('/' * int((1 - progress) * 50)))
运行:
1 2 3 4 5 | 下载开始 下载进度 10.24Kb / 120174.05Kb 0.01 % 下载速度 4.75Kb / s / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / 下载进度 20.48Kb / 120174.05Kb 0.02 % 下载速度 32.93Kb / s / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / |
看上去舒服多了。
下面要做的就是多线程同时下载了,主线程生产url放入队列,下载线程获取url:
# -*- coding: utf-8 -*-
import requests
from contextlib import closing
import time
try:
    import Queue
except ImportError:  # the module is named ``queue`` on Python 3
    import queue as Queue
import hashlib
import threading
import os


def download_file(url, path):
    """Stream ``url`` into ``path`` unless a complete copy already exists.

    :param url: address of the file to download.
    :param path: local file that receives the bytes (overwritten).
    :raises KeyError: if the server sends no ``content-length`` header.
    """
    with closing(requests.get(url, stream=True)) as r:
        chunk_size = 1024 * 10
        content_size = int(r.headers['content-length'])
        # A local file at least as large as the remote one counts as done.
        if os.path.exists(path) and os.path.getsize(path) >= content_size:
            print('已下载')
            return
        print('下载开始')
        with open(path, "wb") as f:
            p = ProgressData(size=content_size, unit='Kb',
                             block=chunk_size, file_name=path)
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
                p.output()


class ProgressData(object):
    """Print download progress, one report per finished block.

    :param block: size of one block in bytes.
    :param size: total size of the download in bytes.
    :param unit: label printed after every amount (e.g. ``'Kb'``).
    :param file_name: optional name printed in front of every report.
    """

    def __init__(self, block, size, unit, file_name='', ):
        self.file_name = file_name
        # Both byte counts are divided by 1000 before being displayed.
        self.block = block / 1000.0
        self.size = size / 1000.0
        self.unit = unit
        self.count = 0
        self.start = time.time()

    def output(self):
        """Record one more finished block and print the current state."""
        self.end = time.time()
        self.count += 1
        elapsed = self.end - self.start
        # Speed of the block that just finished; 0 if the clock did not move.
        speed = self.block / elapsed if elapsed > 0 else 0
        self.start = time.time()
        loaded = self.count * self.block

        if loaded >= self.size:
            print(u'%s下载完成\r\n' % self.file_name)
            return

        # Only reached while loaded < size, so size is never 0 here.
        progress = round(loaded / self.size, 4)
        print(u'{0}下载进度{1:.2f}{2}/{3:.2f}{4} {5:.2%} 下载速度{6:.2f}{7}/s'.format(
            self.file_name, loaded, self.unit,
            self.size, self.unit, progress, speed, self.unit))
        # Right-aligned bar of slashes that shrinks as the download advances.
        print('%50s' % ('/' * int((1 - progress) * 50)))


queue = Queue.Queue()


def run():
    """Worker loop: download queued URLs until the ``None`` sentinel shows up."""
    while True:
        try:
            url = queue.get(timeout=100)
        except Queue.Empty:
            # Nothing arrived for 100 seconds: stop instead of dying with
            # an uncaught exception.
            break
        if url is None:
            # Hand the sentinel on so every other worker stops as well.
            queue.put(None)
            break
        # md5 needs bytes; text URLs are encoded first.
        raw = url if isinstance(url, bytes) else url.encode('utf-8')
        name = hashlib.md5(raw).hexdigest()
        path = 'e:/download/' + name + '.mp4'
        download_file(url, path)


def get_url():
    """Producer placeholder: queues only the end-of-work sentinel."""
    queue.put(None)


if __name__ == '__main__':
    get_url()
    workers = []
    for i in range(4):
        t = threading.Thread(target=run)
        t.daemon = True
        t.start()
        workers.append(t)
    # Wait for the workers; otherwise the main thread exits at once and the
    # daemon threads are killed in the middle of their downloads.
    for t in workers:
        t.join()
    print(u'全下完啦')
加了重复下载的判断,至于怎么源源不断地生产url,诸位摸索吧,保重身体!
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】博客园社区专享云产品让利特惠,阿里云新客6.5折上折
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 没有源码,如何修改代码逻辑?
· 一个奇形怪状的面试题:Bean中的CHM要不要加volatile?
· [.NET]调用本地 Deepseek 模型
· 一个费力不讨好的项目,让我损失了近一半的绩效!
· .NET Core 托管堆内存泄露/CPU异常的常见思路
· 微软正式发布.NET 10 Preview 1:开启下一代开发框架新篇章
· 没有源码,如何修改代码逻辑?
· NetPad:一个.NET开源、跨平台的C#编辑器
· PowerShell开发游戏 · 打蜜蜂
· 在鹅厂做java开发是什么体验