【Python Learning】PycURL

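A small multithreaded downloader built on PycURL (Python 2). CurlResponse wraps a Curl handle: it performs the request and exposes the parsed headers, the (gunzipped) body, and libcurl's transfer statistics. Worker threads drain a shared list of LinkJob URLs under a lock, and Hunter drives the pool.
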
#encoding=utf-8
import StringIO
import gzip
import re
import threading

import pycurl


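# Browser-like request headers. "Accept-Encoding: gzip, deflate" invites a
# compressed reply, which CurlResponse.body() gunzips when needed.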
DOWNLOAD_HEADER = [
        'User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
        'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language: en-us,en;q=0.5',
        'Accept-Encoding: gzip, deflate',
        'Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Keep-Alive: 115',
        'Connection: keep-alive',
        'Cache-Control: max-age=0'
        ]
WORKER_SIZE = 4  # number of concurrent downloader threads


class CurlResponse:
    def __init__(self, curl_instance, url, request_header=DOWNLOAD_HEADER, proxy=None):
        self.handle = curl_instance
        self.headers = {}
        self.request_url = url
        
        curl_instance.setopt(pycurl.URL, url)                                     # target URL
        curl_instance.setopt(pycurl.HTTPHEADER, request_header)                   # request headers
        curl_instance.setopt(pycurl.FOLLOWLOCATION, 1)                            # follow 301/302 redirects
        curl_instance.setopt(pycurl.MAXREDIRS, 5)                                 # at most 5 redirects
        
        self.payload = ""                                                         #Nothing past here should be modified by the caller
        self.hdf = ""
        def payload_callback(x): self.payload += x                                #Set up a callback to capture the payload
        def hdf_callback(x): self.hdf += x                                        #Set up a callback to capture the header
        curl_instance.setopt(curl_instance.WRITEFUNCTION, payload_callback)       
        curl_instance.setopt(curl_instance.HEADERFUNCTION, hdf_callback)          

        if proxy:
            ip, port = proxy.split(":", 1)
            curl_instance.setopt(curl_instance.PROXY, ip)
            curl_instance.setopt(curl_instance.PROXYPORT, int(port))
            curl_instance.setopt(curl_instance.PROXYTYPE, curl_instance.PROXYTYPE_HTTP)

        curl_instance.setopt(curl_instance.COOKIEFILE, "COOKIES")
        curl_instance.setopt(curl_instance.COOKIEJAR, "COOKIES")
        try:
            curl_instance.perform()
        except pycurl.error, e:
            # 28: Operation timed out
            # 47: Maximum (5) redirects followed
            # 56: Failure when receiving data from the peer
            if e.args[0] in (28, 47, 56):
                print "TimeoutError"
            # 1: Protocol not supported or disabled in libcurl
            # 6: Could not resolve host
            # 7: Failed to connect to host
            elif e.args[0] in (1, 6, 7):
                print "HostResolvedError"
            else:
                print e
    
    def match_header(self, header):
        last_modified_regexp = re.compile(r'Last-Modified:\s*(.+)', re.I)
        content_type_regexp = re.compile(r'Content-Type:\s*([^;\r\n]+)', re.I)
        charset_header_regexp = re.compile(r'Content-Type:.*charset=(.+)', re.I)
        content_encoding_header_regexp = re.compile(r'Content-Encoding:\s*(.*)', re.I)
        def _match_header(name, pattern):
            match = pattern.search(header)
            if match:
                value = match.group(1)
                if value:
                    self.headers[name] = value.strip().lower()
        _match_header("Last-Modified", last_modified_regexp)
        _match_header("Content-Type", content_type_regexp)
        _match_header("charset", charset_header_regexp)
        _match_header("Content-Encoding", content_encoding_header_regexp)
        return self.headers

    def header(self):
        # print self.hdf  # uncomment to dump the raw response headers
        return self.match_header(self.hdf)

    def body(self):
        # self.headers is populated by header(); call it before body().
        if self.headers.get("Content-Encoding") == "gzip":
            return gzip.GzipFile(fileobj=StringIO.StringIO(self.payload)).read()
        return self.payload

    def status(self):
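        # libcurl's per-transfer statistics, read back via getinfo():
        # timings, byte counts, response codes, redirect and socket details.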
        m = {}
        m['effective-url'] = self.handle.getinfo(pycurl.EFFECTIVE_URL)
        m['http-code'] = self.handle.getinfo(pycurl.HTTP_CODE)
        m['total-time'] = self.handle.getinfo(pycurl.TOTAL_TIME)
        m['namelookup-time'] = self.handle.getinfo(pycurl.NAMELOOKUP_TIME)
        m['connect-time'] = self.handle.getinfo(pycurl.CONNECT_TIME)
        m['pretransfer-time'] = self.handle.getinfo(pycurl.PRETRANSFER_TIME)
        m['redirect-time'] = self.handle.getinfo(pycurl.REDIRECT_TIME)
        m['redirect-count'] = self.handle.getinfo(pycurl.REDIRECT_COUNT)
        m['size-upload'] = self.handle.getinfo(pycurl.SIZE_UPLOAD)
        m['size-download'] = self.handle.getinfo(pycurl.SIZE_DOWNLOAD)
        m['speed-upload'] = self.handle.getinfo(pycurl.SPEED_UPLOAD)
        m['header-size'] = self.handle.getinfo(pycurl.HEADER_SIZE)
        m['request-size'] = self.handle.getinfo(pycurl.REQUEST_SIZE)
        m['content-length-download'] = self.handle.getinfo(pycurl.CONTENT_LENGTH_DOWNLOAD)
        m['content-length-upload'] = self.handle.getinfo(pycurl.CONTENT_LENGTH_UPLOAD)
        m['content-type'] = self.handle.getinfo(pycurl.CONTENT_TYPE)
        m['response-code'] = self.handle.getinfo(pycurl.RESPONSE_CODE)
        m['speed-download'] = self.handle.getinfo(pycurl.SPEED_DOWNLOAD)
        m['ssl-verifyresult'] = self.handle.getinfo(pycurl.SSL_VERIFYRESULT)
        m['filetime'] = self.handle.getinfo(pycurl.INFO_FILETIME)
        m['starttransfer-time'] = self.handle.getinfo(pycurl.STARTTRANSFER_TIME)
        m['http-connectcode'] = self.handle.getinfo(pycurl.HTTP_CONNECTCODE)
        m['httpauth-avail'] = self.handle.getinfo(pycurl.HTTPAUTH_AVAIL)
        m['proxyauth-avail'] = self.handle.getinfo(pycurl.PROXYAUTH_AVAIL)
        m['os-errno'] = self.handle.getinfo(pycurl.OS_ERRNO)
        m['num-connects'] = self.handle.getinfo(pycurl.NUM_CONNECTS)
        m['ssl-engines'] = self.handle.getinfo(pycurl.SSL_ENGINES)
        m['cookielist'] = self.handle.getinfo(pycurl.INFO_COOKIELIST)
        m['lastsocket'] = self.handle.getinfo(pycurl.LASTSOCKET)
        m['ftp-entry-path'] = self.handle.getinfo(pycurl.FTP_ENTRY_PATH)
        return m

def get(url, timeout=200, conn_timeout=30, request_headers=None, debug=False, proxy=None):
    crl = pycurl.Curl()
    if debug:
        crl.setopt(pycurl.VERBOSE, 1)
    crl.setopt(pycurl.TIMEOUT, timeout)
    crl.setopt(pycurl.CONNECTTIMEOUT, conn_timeout)
    if isinstance(url, unicode):
        url = url.encode('utf-8')  # pycurl expects byte strings
    return CurlResponse(crl, url, request_headers or DOWNLOAD_HEADER, proxy)
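
# A minimal single-request sketch (example.com is a placeholder URL):
#
#   resp = get("http://example.com/", timeout=30)
#   resp.header()                      # parse headers first, so that...
#   print resp.body()[:200]            # ...body() knows whether to gunzip
#   print resp.status()["http-code"]   # final status code after redirects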

def downloader(link_job):
    return get(link_job.url, proxy=link_job.proxy)

class LinkJob:
    def __init__(self, url, proxy=None):
        self.url = url
        self.proxy = proxy
  
class Worker(threading.Thread):
    def __init__(self, links_queue, pages_queue, mutex):
        threading.Thread.__init__(self)
        self.links_queue = links_queue
        self.pages_queue = pages_queue
        self.mutex = mutex

    def run(self):
        while True:
            # Check and pop under the lock so two workers never race for the
            # same job or pop from an already-empty list.
            self.mutex.acquire()
            link_job = self.links_queue.pop() if self.links_queue else None
            self.mutex.release()
            if link_job is None:
                break
            if not link_job.url:
                continue
            try:
                response = downloader(link_job)
                print "[%s] - [%s] -- [%s]" % (self.getName(), response.status().get("response-code"), link_job.url)
                self.mutex.acquire()
                self.pages_queue.append(response)
                self.mutex.release()
            except Exception, e:
                print e


class Hunter:
    def __init__(self, links):
        self.workers = []
        self.pages_queue = []
        self.links_queue = links
        self.mutex = threading.Lock()

    def start(self):
        for _ in range(WORKER_SIZE):
            worker = Worker(self.links_queue, self.pages_queue, self.mutex)
            worker.setDaemon(True)
            worker.start()
            self.workers.append(worker)

    def exit(self):
        for worker in self.workers:
            worker.join()

    def work(self):
        self.start()
        self.exit()
        return self
    
    def show(self):
        if self.pages_queue:
            for page in self.pages_queue:
                print "[%s] - [%s] - [%s] - [%s]"%(page.status().get("response-code"), page.request_url, page.status().get("effective-url"), page.status().get("total-time"))
            print len(self.pages_queue)

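# A Hunter usage sketch (the proxy address is a hypothetical example):
#
#   jobs = [LinkJob("http://example.com/", proxy="127.0.0.1:8080")]
#   Hunter(jobs).work().show()
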
if __name__ == '__main__':
    url1 = "http://theater.mtime.com/China_Beijing_Chaoyang/1009/"
    url2 = "http://www.jd.com/"
    url3 = "http://blog.chinaunix.net/uid-22920230-id-3074716.html"
    url4 = "http://www.dianping.com/hangzhou"
    url5 = "http://t.cn/Rhevig4"
    url6 = "http://esf.fang.com/agenthome/"
    urls = [url1, url2, url3, url4, url5, url6]
    links = [LinkJob(urls[i % len(urls)]) for i in range(500)]

    Hunter(links).work().show()

posted on 2015-05-07 20:03 by 有个姑娘叫小芳