使用python网络库下载
下载1000次网页资源
1,普通循环方式下载1000次,非常慢
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Download one URL ``total_times`` times in a plain sequential loop.

This is the baseline of the comparison: every request blocks until the
previous one has completely finished.
"""
import sys
import os
import time
import urllib
import urllib2
from contextlib import closing

# Number of downloads performed by the script.
total_times = 1000


def worker(url):
    """Fetch ``url`` and read the complete response body.

    Returns 1 when the download succeeded and 0 when it failed.  On failure
    the current ``sys.exc_info()`` tuple is printed.
    """
    try:
        # closing() releases the connection even when read() raises.
        # The timeout is given in seconds (10800 s = 3 h).
        with closing(urllib2.urlopen(url, timeout=10800)) as response:
            response.read()  # drain the body; its content is not needed
    except Exception:
        # Deliberately not a bare ``except:`` so that Ctrl-C and SystemExit
        # still abort the whole run instead of being counted as a failure.
        print(sys.exc_info())
        return 0
    return 1


if __name__ == "__main__":
    url = "http://web.kuaipan.cn/static/images/pc.png"
    for _ in range(total_times):
        worker(url)

# Measured run:
# root:~/test # time ./c.py
# real 4m6.700s
# user 0m1.192s
# sys  0m1.736s
2,使用进程池下载,有点慢
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Download one URL ``total_times`` times using a multiprocessing pool.

The pool has twice as many worker processes as there are CPUs; every
download is submitted as an independent asynchronous task.
"""
import sys
import os
import time
import urllib
import urllib2
import multiprocessing
from contextlib import closing

# Number of downloads performed by the script.
total_times = 1000


def worker(url):
    """Fetch ``url`` and read the complete response body.

    Runs inside a pool process.  Returns 1 when the download succeeded and
    0 when it failed.  On failure the current ``sys.exc_info()`` tuple is
    printed.
    """
    try:
        # closing() releases the connection even when read() raises.
        # The timeout is given in seconds (10800 s = 3 h).
        with closing(urllib2.urlopen(url, timeout=10800)) as response:
            response.read()  # drain the body; its content is not needed
    except Exception:
        # Deliberately not a bare ``except:`` so that KeyboardInterrupt and
        # SystemExit are not swallowed and reported as an ordinary failure.
        print(sys.exc_info())
        return 0
    return 1


if __name__ == "__main__":
    url = "http://web.kuaipan.cn/static/images/pc.png"
    pool_size = multiprocessing.cpu_count() * 2
    pool = multiprocessing.Pool(processes=pool_size)
    for _ in range(total_times):
        pool.apply_async(worker, (url,))
    # No more tasks will be submitted; wait until every queued one is done.
    pool.close()
    pool.join()

# Measured run:
# root:~/test # time ./pc.py
# real 1m43.668s
# user 0m1.480s
# sys  0m1.628s
3,使用twisted网络库,同样发起1000次请求,耗时减少为15s左右,性能提升很多,很快
#!/usr/bin/python
"""Issue ``total_times`` GET requests through Twisted's ``Agent``.

Every request is started before the reactor runs; the reactor is stopped
once each request has finished, whether it succeeded or failed.
"""
from sys import argv
from pprint import pformat

from twisted.internet import reactor
from twisted.web.client import Agent, readBody
from twisted.web.http_headers import Headers

# Number of requests to issue.
total_times = 1000
# Number of requests that have finished so far (success or failure).
times = 0


def cbBody(body):
    """Receive the complete response body; its content is discarded."""
    return None


def cbRequest(response):
    """Read the body of ``response``; return the Deferred of that read."""
    body_deferred = readBody(response)
    body_deferred.addCallback(cbBody)
    return body_deferred


def cbShutdown(ignored):
    """Count one finished request and stop the reactor after the last one.

    Attached with ``addBoth``, so it runs for results and failures alike.
    """
    global times
    times += 1
    if times >= total_times:
        reactor.stop()


def curl(url):
    """Start one GET request for ``url`` and return its Deferred."""
    headers = Headers({'User-Agent': ['Twisted Web Client Example']})
    request = Agent(reactor).request('GET', url, headers, None)
    request.addCallback(cbRequest)
    request.addBoth(cbShutdown)
    return request


if __name__ == '__main__':
    for _ in range(total_times):
        curl("http://web.kuaipan.cn/static/images/pc.png")
    reactor.run()

# Measured run:
# root:~/test # time ./tc.py
# real 0m15.480s
# user 0m3.596s
# sys  0m0.720s
4,使用twisted网络库长连接,耗时也是很少,很快
#!/usr/bin/python
"""Issue ``total_times`` GET requests through one pooled Twisted ``Agent``.

All requests share a single ``HTTPConnectionPool`` so that connections can
be kept alive and reused instead of being opened per request.
"""
from sys import argv
from pprint import pformat

from twisted.internet import reactor
from twisted.internet.defer import Deferred, DeferredList
from twisted.internet.protocol import Protocol
from twisted.web.client import Agent, HTTPConnectionPool
from twisted.web.http_headers import Headers

# Number of requests to issue.
total_times = 1000
# Number of requests that have finished so far (success or failure).
times = 0


class IgnoreBody(Protocol):
    """Body consumer that throws the data away and signals completion."""

    def __init__(self, deferred):
        # Deferred fired (with None) once the body delivery has ended.
        self.deferred = deferred

    def dataReceived(self, data):
        # The payload is irrelevant for the benchmark.
        pass

    def connectionLost(self, reason):
        # Fired for every end-of-body reason; ``reason`` is not inspected.
        self.deferred.callback(None)


def cbRequest(response):
    """Consume the body of ``response``; return a Deferred firing when done."""
    finished = Deferred()
    response.deliverBody(IgnoreBody(finished))
    return finished


# Shared pool and agent: every request must go through this agent, otherwise
# the pool is bypassed and no connection is ever reused.
# NOTE(review): all requests are started at once, so many of them may still
# open their own connection; check the pool's per-host limit of cached
# connections if real reuse is the goal.
pool = HTTPConnectionPool(reactor)
agent = Agent(reactor, pool=pool)


def requestGet(url):
    """Start a GET for ``url`` on the pooled agent without shutdown tracking."""
    d = agent.request('GET', url)
    d.addCallback(cbRequest)
    return d


def cbShutdown(ignored):
    """Count one finished request and stop the reactor after the last one.

    Attached with ``addBoth``, so it runs for results and failures alike.
    """
    global times
    times += 1
    if times >= total_times:
        reactor.stop()


def curl(url):
    """Start one GET for ``url`` on the pooled agent and return its Deferred.

    Uses the module-level ``agent``; creating a new ``Agent(reactor)`` here
    would ignore ``pool`` and defeat the persistent-connection setup.
    """
    d = agent.request(
        'GET',
        url,
        Headers({'User-Agent': ['Twisted Web Client Example']}),
        None)
    d.addCallback(cbRequest)
    d.addBoth(cbShutdown)
    return d


for _ in range(total_times):
    curl("http://web.kuaipan.cn/static/images/pc.png")
reactor.run()

# Measured run (taken before the pooled agent was actually used by curl):
# root:~/test # time ./tpc.py
# real 0m12.817s
# user 0m3.508s
# sys  0m0.528s
更多twisted参考:https://twistedmatrix.com/documents/current/web/howto/client.html#auto4
golang使用循环方式顺序下载,耗时和python的循环下载方式差不多(都在4分多钟),瓶颈应该在网络
package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"os"
)

// totaltimes is the number of sequential downloads to perform.
var totaltimes = 1000

// worker downloads url, reads the whole body and prints its length in bytes
// to stdout. Request and read errors are reported on stderr so that stdout
// only ever contains the sizes of completely downloaded bodies.
func worker(url string) {
	response, err := http.Get(url)
	if err != nil {
		fmt.Fprintln(os.Stderr, "request failed:", err)
		return
	}
	defer response.Body.Close()

	body, err := ioutil.ReadAll(response.Body)
	if err != nil {
		// A partial body must not be counted as a successful download.
		fmt.Fprintln(os.Stderr, "read failed:", err)
		return
	}
	fmt.Println(len(body))
}

func main() {
	for i := 0; i < totaltimes; i++ {
		worker("http://web.kuaipan.cn/static/images/pc.png")
	}
}

// Measured run:
// root:~/test # time ./got > goresult
//
// real 4m45.257s
// user 0m0.628s
// sys  0m0.632s
golang使用协程池方式模拟下载1000次,耗时约25s,比twisted方案(约15s)慢不少(而且容易出现网络错误;最近出的go version go1.2rc4 linux/amd64要好一点,go1.1问题很多)
package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"os"
	"sync"
)

// totaltimes is the number of downloads to perform.
var totaltimes = 1000

// poolsize is the number of worker goroutines consuming the URL channel.
var poolsize = 250

// fetch downloads one URL and prints the body length in bytes to stdout.
// It is a separate function so that the deferred Close runs after every
// single download instead of piling up until the worker exits.
func fetch(url string) {
	response, err := http.Get(url)
	if err != nil {
		fmt.Fprintln(os.Stderr, "request failed:", err)
		return
	}
	defer response.Body.Close()

	body, err := ioutil.ReadAll(response.Body)
	if err != nil {
		// A partial body must not be counted as a successful download.
		fmt.Fprintln(os.Stderr, "read failed:", err)
		return
	}
	fmt.Println(len(body))
}

// worker processes URLs from linkChan until the channel is closed, then
// marks itself done on wg. A failed download does not end the worker:
// if workers quit on errors, the unbuffered send in main could block
// forever once none of them is left to receive.
func worker(linkChan chan string, wg *sync.WaitGroup) {
	// Decrease the wait-group counter as soon as the goroutine finishes.
	defer wg.Done()

	for url := range linkChan {
		fetch(url)
	}
}

func main() {
	lCh := make(chan string)
	wg := new(sync.WaitGroup)

	// Start the pool of workers and register each one with the wait group.
	for i := 0; i < poolsize; i++ {
		wg.Add(1)
		go worker(lCh, wg)
	}

	for i := 0; i < totaltimes; i++ {
		lCh <- "http://web.kuaipan.cn/static/images/pc.png"
	}
	close(lCh)

	// Wait for all goroutines to finish (otherwise they die together with
	// the main goroutine).
	wg.Wait()
}

// Measured run (taken before the per-request Close fix):
// root:~/test # time ./gotest > goresult
//
// real 0m25.250s
// user 0m0.772s
// sys  0m0.380s
twisted支持定时器,我们可以用来动态添加任务
"""Periodically queue page downloads on a running Twisted reactor."""
from twisted.web.client import getPage
from twisted.internet import reactor


class Getter(object):
    """Collect the results and errors of pages requested through ``add``."""

    def __init__(self):
        self._sequence = 0   # requests started but not yet finished
        self._results = []   # one argument tuple per successful download
        self._errors = []    # one argument tuple per failed download

    def _on_success(self, *res):
        """Record the callback arguments of a successful download."""
        self._results.append(res)

    def _on_error(self, *err):
        """Record the errback arguments of a failed download."""
        self._errors.append(err)

    def _on_finish(self, *narg):
        """Runs after either handler: update the counter and print totals."""
        self._sequence -= 1
        print("%d %d" % (len(self._results), len(self._errors)))
        # Disabled: stop the reactor once no request is in flight.
        # if not self._sequence:
        #     reactor.stop()

    def add(self, url):
        """Start downloading ``url`` and hook up the bookkeeping callbacks."""
        page = getPage(url)
        page.addCallbacks(self._on_success, self._on_error)
        page.addCallback(self._on_finish)
        self._sequence += 1

    def run(self):
        """Run the reactor; return ``(results, errors)`` once it has stopped."""
        reactor.run()
        return self._results, self._errors


def jobtimer():
    """Queue the three sample URLs on ``g`` and re-arm the timer for 1 s."""
    sample_urls = (
        'http://www.google.com',
        'http://www.yahoo.com',
        'http://www.baidu.com',
    )
    for url in sample_urls:
        g.add(url)
    reactor.callLater(1, jobtimer)


# Add jobs on a timer: the first batch is queued 2 s after the reactor starts.
reactor.callLater(2, jobtimer)

g = Getter()
results, errors = g.run()
#print len(results)
#print len(errors)