Downloading with Python network libraries

Download a web resource 1000 times

1. Plain loop, 1000 sequential downloads: very slow

#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import urllib2

total_times = 1000

def worker(url):
        try:
                f = urllib2.urlopen(url,timeout=10800)
                body = f.read()
        except:
                print sys.exc_info()
                return 0
        return 1

if __name__ == "__main__":

        for i in range(total_times):
                url = "http://web.kuaipan.cn/static/images/pc.png"
                worker(url)

#root:~/test # time ./c.py
#real    4m6.700s
#user    0m1.192s
#sys     0m1.736s
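
For reference, 4m6s for 1000 requests works out to roughly 0.25 s per request, so nearly all of the time is spent waiting on the network, one request at a time. A minimal timing sketch (not part of the original script) to check the per-request latency:

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Quick per-request timing sketch (illustrative, not part of the original script)
import time
import urllib2

url = "http://web.kuaipan.cn/static/images/pc.png"
start = time.time()
body = urllib2.urlopen(url, timeout=10800).read()
print "%d bytes in %.3f seconds" % (len(body), time.time() - start)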

2. Download with a process pool: still a bit slow

#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import urllib2
import multiprocessing

total_times = 1000

def worker(url):
        try:
                f = urllib2.urlopen(url,timeout=10800)
                body = f.read()
        except:
                print sys.exc_info()
                return 0
        return 1

if __name__ == "__main__":

        pool_size = multiprocessing.cpu_count() * 2
        pool = multiprocessing.Pool(processes=pool_size)

        for i in range(total_times):
                url = "http://web.kuaipan.cn/static/images/pc.png"
                pool.apply_async(worker, (url,))
                
        pool.close()
        pool.join()

#root:~/test # time ./pc.py
#real    1m43.668s
#user    0m1.480s
#sys     0m1.628s
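
Since the job is I/O-bound rather than CPU-bound, a thread pool would serve the same purpose with less overhead than processes. The original post does not benchmark this, but a rough sketch using multiprocessing.dummy (the same Pool API backed by threads) could look like this; the thread count of 50 is an arbitrary choice:

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Thread-pool variant (illustrative sketch, not benchmarked in the original post)
import sys
import urllib2
from multiprocessing.dummy import Pool  # same Pool API as multiprocessing, backed by threads

total_times = 1000

def worker(url):
    try:
        body = urllib2.urlopen(url, timeout=10800).read()
    except:
        print sys.exc_info()
        return 0
    return 1

if __name__ == "__main__":
    pool = Pool(50)  # thread count is an arbitrary choice; tune for the target server
    pool.map(worker, ["http://web.kuaipan.cn/static/images/pc.png"] * total_times)
    pool.close()
    pool.join()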

3. Using the Twisted network library, the same 1000 requests finish in about 15 s, a big performance improvement

#!/usr/bin/python

from pprint import pformat

#from twisted.internet.task import react
from twisted.internet import reactor
from twisted.web.client import Agent, readBody
from twisted.web.http_headers import Headers

total_times = 1000
times = 0

def cbRequest(response):
    #print 'Response version:', response.version
    #print 'Response code:', response.code
    #print 'Response phrase:', response.phrase
    #print 'Response headers:'
    #print pformat(list(response.headers.getAllRawHeaders()))
    d = readBody(response)
    d.addCallback(cbBody)
    return d

def cbBody(body):
    #print 'Response body:'
    #print body
    data = body

def cbShutdown(ignored):
    global times
    times = times + 1
    if times >= total_times:
        reactor.stop()

def curl(url):
    agent = Agent(reactor)
    d = agent.request(
        'GET', url,
        Headers({'User-Agent': ['Twisted Web Client Example']}),
        None)
    d.addCallback(cbRequest)
    d.addBoth(cbShutdown)
    return d

if __name__ == '__main__':

    for i in range(total_times):
        curl("http://web.kuaipan.cn/static/images/pc.png")
    
    reactor.run()

#root:~/test # time ./tc.py
#real    0m15.480s
#user    0m3.596s
#sys     0m0.720s
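
Note that the loop above hands all 1000 requests to the reactor at once. That works here, but if the concurrency ever needs to be capped, Twisted's DeferredSemaphore can limit how many requests are in flight; a minimal sketch reusing the curl() function above (the limit of 50 is an arbitrary choice):

from twisted.internet.defer import DeferredSemaphore

# Cap in-flight requests (sketch; 50 is an arbitrary limit). This would
# replace the plain for loop in the __main__ block above.
sem = DeferredSemaphore(50)

for i in range(total_times):
    # sem.run() waits for a free slot, calls curl(url), and releases the
    # slot when the Deferred returned by curl() fires
    sem.run(curl, "http://web.kuaipan.cn/static/images/pc.png")

reactor.run()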

4. Using Twisted with persistent (keep-alive) connections: also very fast

#!/usr/bin/python

from twisted.internet import reactor
from twisted.internet.defer import Deferred
from twisted.internet.protocol import Protocol
from twisted.web.client import Agent, HTTPConnectionPool
from twisted.web.http_headers import Headers

total_times = 1000
times = 0

class IgnoreBody(Protocol):
    def __init__(self, deferred):
        self.deferred = deferred

    def dataReceived(self, bytes):
        pass

    def connectionLost(self, reason):
        self.deferred.callback(None)


def cbRequest(response):
    #print 'Response code:', response.code
    finished = Deferred()
    response.deliverBody(IgnoreBody(finished))
    return finished

# One shared connection pool and agent, so keep-alive connections are reused
pool = HTTPConnectionPool(reactor)
agent = Agent(reactor, pool=pool)

def cbShutdown(ignored):
    global times
    times = times + 1
    if times >= total_times:
        reactor.stop()

def curl(url):
    # Reuse the shared pooled agent instead of building a new Agent per
    # request, otherwise the connection pool is never actually exercised
    d = agent.request(
        'GET', url,
        Headers({'User-Agent': ['Twisted Web Client Example']}),
        None)
    d.addCallback(cbRequest)
    d.addBoth(cbShutdown)
    return d

for i in range(total_times):
    curl("http://web.kuaipan.cn/static/images/pc.png")

reactor.run()

#root:~/test # time ./tpc.py
#real    0m12.817s
#user    0m3.508s
#sys     0m0.528s
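
All 1000 requests here go to the same host, and HTTPConnectionPool only keeps a small number of persistent connections per host by default. If more parallel keep-alive connections are wanted, the pool in the script above could be tuned before building the Agent (the value 10 is an arbitrary choice):

pool = HTTPConnectionPool(reactor)
pool.maxPersistentPerHost = 10   # default is 2; 10 is an arbitrary choice
agent = Agent(reactor, pool=pool)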

More on the Twisted web client: https://twistedmatrix.com/documents/current/web/howto/client.html#auto4

Go with a plain download loop takes about as long as the Python loop, around 4 minutes, so the bottleneck is most likely the network.

package main

import (
    "fmt"
    "net/http"
    "io/ioutil"
)

var totaltimes = 1000

func worker(url string) {
    response, err := http.Get(url)
    if err != nil {
        return
    }
    defer response.Body.Close()
    body, _ := ioutil.ReadAll(response.Body)
    fmt.Println(len(body))
}

func main() {
    for i := 0; i < totaltimes; i++ {
        worker("http://web.kuaipan.cn/static/images/pc.png")
    }
}

//root:~/test # time ./got > goresult
//
//real    4m45.257s
//user    0m0.628s
//sys     0m0.632s

Go with a goroutine pool, simulating the same 1000 downloads, is still a lot slower (and network errors come up easily; the recently released go1.2rc4 linux/amd64 behaves a bit better, while go1.1 has many problems).

package main

import (
    "fmt"
    "net/http"
    "io/ioutil"
    "sync"
)

var totaltimes = 1000
var poolsize = 250

func worker(linkChan chan string, wg *sync.WaitGroup) {
    // Decrease the wait-group counter as soon as this goroutine finishes
    defer wg.Done()

    for url := range linkChan {
        response, err := http.Get(url)
        if err != nil {
            // Skip this URL instead of returning, otherwise the worker dies,
            // the pool shrinks, and senders on the unbuffered channel can block
            fmt.Println(err)
            continue
        }
        body, _ := ioutil.ReadAll(response.Body)
        // Close the body each iteration; a defer inside the loop would only
        // run when the goroutine exits and would hold connections open
        response.Body.Close()
        fmt.Println(len(body))
        //fmt.Println("Resp code", response.StatusCode)
    }
}

func main() {
    lCh := make(chan string)
    wg := new(sync.WaitGroup)
    // Add workers to the wait group and start them
    for i := 0; i < poolsize; i++ {
        wg.Add(1)
        go worker(lCh, wg)
    }

    for i := 0; i < totaltimes; i++ {
        lCh <- "http://web.kuaipan.cn/static/images/pc.png"
    }
    close(lCh)
    // Wait for all goroutines to finish (otherwise they die when main exits)
    wg.Wait()
}

//root:~/test # time ./gotest > goresult
//
//real    0m25.250s
//user    0m0.772s
//sys     0m0.380s

Twisted also supports timers, which can be used to add jobs dynamically.

from twisted.web.client import getPage
from twisted.internet import reactor

class Getter(object):

    def __init__(self):
        self._sequence = 0
        self._results = []
        self._errors = []

    def add(self, url):
        d = getPage(url)
        d.addCallbacks(self._on_success, self._on_error)
        d.addCallback(self._on_finish)
        self._sequence += 1

    def _on_finish(self, *narg):
        self._sequence -= 1
        print len(self._results), len(self._errors)
     #   if not self._sequence:
     #       reactor.stop()

    _on_success = lambda self, *res: self._results.append(res)
    _on_error = lambda self, *err: self._errors.append(err)

    def run(self):
        reactor.run()
        return self._results, self._errors

def jobtimer():
    for url in ('http://www.google.com', 'http://www.yahoo.com', 'http://www.baidu.com'):
        g.add(url)
    reactor.callLater(1,jobtimer)

reactor.callLater(2, jobtimer)  # keep adding jobs on a timer
g = Getter()
results, errors = g.run()

#print len(results)
#print len(errors)
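
The jobtimer() above re-schedules itself with reactor.callLater. The same effect can be had with twisted.internet.task.LoopingCall, which handles the re-scheduling on its own; a minimal sketch that would replace the callLater-based scheduling:

from twisted.internet.task import LoopingCall

def add_jobs():
    # Same batch of URLs as jobtimer() above
    for url in ('http://www.google.com', 'http://www.yahoo.com', 'http://www.baidu.com'):
        g.add(url)

# Replaces reactor.callLater(2, jobtimer) and the self-rescheduling call
# inside jobtimer(); LoopingCall repeats add_jobs every second on its own
lc = LoopingCall(add_jobs)
lc.start(1, now=False)
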
posted @ 2013-12-02 21:51  ciaos