[Python Learning] PycURL
#encoding=utf-8
import sys
import string
import time
import StringIO
import pycurl
import re
import pdb
import gzip
import threading
DOWNLOAD_HEADER = [
    'User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
    'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language: en-us,en;q=0.5',
    'Accept-Encoding: gzip, deflate',
    'Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Keep-Alive: 115',
    'Connection: keep-alive',
    'Cache-Control: max-age=0'
]
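# Note: Accept-Encoding above advertises gzip/deflate, so servers may answer
# with a compressed body; CurlResponse.body() further down gunzips gzip
# responses before returning them.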
WORKER_SIZE = 4
class CurlResponse:
    def __init__(self, curl_instance, url, request_header=DOWNLOAD_HEADER, proxy=None):
        self.handle = curl_instance
        self.headers = {}
        self.request_url = url
        curl_instance.setopt(pycurl.URL, url)                    # target url
        curl_instance.setopt(pycurl.HTTPHEADER, request_header)  # request headers
        curl_instance.setopt(pycurl.FOLLOWLOCATION, 1)           # follow 301/302 redirects
        curl_instance.setopt(pycurl.MAXREDIRS, 5)                # max redirect count
        self.payload = ""  # nothing past here should be modified by the caller
        self.hdf = ""
        def payload_callback(x): self.payload += x  # callback capturing the body
        def hdf_callback(x): self.hdf += x          # callback capturing the raw header
        curl_instance.setopt(curl_instance.WRITEFUNCTION, payload_callback)
        curl_instance.setopt(curl_instance.HEADERFUNCTION, hdf_callback)
        if proxy:
            ip, port = proxy.split(":", 1)
            curl_instance.setopt(curl_instance.PROXY, ip)
            curl_instance.setopt(curl_instance.PROXYPORT, int(port))
            curl_instance.setopt(curl_instance.PROXYTYPE, curl_instance.PROXYTYPE_HTTP)
        # one shared cookie jar file, read and written on every request
        curl_instance.setopt(curl_instance.COOKIEFILE, "COOKIES")
        curl_instance.setopt(curl_instance.COOKIEJAR, "COOKIES")
        try:
            curl_instance.perform()
        except pycurl.error, e:
            # 28: Operation timed out
            # 47: Maximum (5) redirects followed
            # 56: Failure when receiving data from the peer
            if e.args[0] in (28, 47, 56):
                print "TimeoutError"
            # 1: Protocol not supported or disabled in libcurl
            # 6: Could not resolve host
            # 7: Failed to connect to host
            elif e.args[0] in (1, 6, 7):
                print "HostResolvedError"
            else:
                print e
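    # Design note: the HTTP request fires inside __init__ via perform(), so
    # constructing a CurlResponse is itself the download; header(), body() and
    # status() below only read what perform() already captured.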
    def match_header(self, header):
        last_modified_regexp = re.compile(r'Last-Modified:\s*(.+)', re.I)
        # stop at ';' so a trailing "; charset=..." is not swallowed into Content-Type
        content_type_regexp = re.compile(r'Content-Type:\s*([^;\r\n]+)', re.I)
        charset_header_regexp = re.compile(r'Content-Type:.*charset=([^\s;]+)', re.I)
        content_encoding_header_regexp = re.compile(r'Content-Encoding:\s*(.*)', re.I)
        def _match_header(name, pattern):
            match = pattern.search(header)
            if match:
                value = match.groups()[0]
                if value:
                    self.headers[name] = value.strip().lower()
        _match_header("Last-Modified", last_modified_regexp)
        _match_header("Content-Type", content_type_regexp)
        _match_header("charset", charset_header_regexp)
        _match_header("Content-Encoding", content_encoding_header_regexp)
        return self.headers
    def header(self):
        print self.hdf
        return self.match_header(self.hdf)
    def body(self):
        #pdb.set_trace()
        if not self.headers:  # make sure the raw header has been parsed
            self.match_header(self.hdf)
        # only gunzip when the server actually answered with gzip
        if self.headers.get("Content-Encoding") == "gzip":
            return gzip.GzipFile(fileobj=StringIO.StringIO(self.payload)).read()
        return self.payload
    def status(self):
        m = {}
        m['effective-url'] = self.handle.getinfo(pycurl.EFFECTIVE_URL)
        m['http-code'] = self.handle.getinfo(pycurl.HTTP_CODE)
        m['total-time'] = self.handle.getinfo(pycurl.TOTAL_TIME)
        m['namelookup-time'] = self.handle.getinfo(pycurl.NAMELOOKUP_TIME)
        m['connect-time'] = self.handle.getinfo(pycurl.CONNECT_TIME)
        m['pretransfer-time'] = self.handle.getinfo(pycurl.PRETRANSFER_TIME)
        m['redirect-time'] = self.handle.getinfo(pycurl.REDIRECT_TIME)
        m['redirect-count'] = self.handle.getinfo(pycurl.REDIRECT_COUNT)
        m['size-upload'] = self.handle.getinfo(pycurl.SIZE_UPLOAD)
        m['size-download'] = self.handle.getinfo(pycurl.SIZE_DOWNLOAD)
        m['speed-upload'] = self.handle.getinfo(pycurl.SPEED_UPLOAD)
        m['header-size'] = self.handle.getinfo(pycurl.HEADER_SIZE)
        m['request-size'] = self.handle.getinfo(pycurl.REQUEST_SIZE)
        m['content-length-download'] = self.handle.getinfo(pycurl.CONTENT_LENGTH_DOWNLOAD)
        m['content-length-upload'] = self.handle.getinfo(pycurl.CONTENT_LENGTH_UPLOAD)
        m['content-type'] = self.handle.getinfo(pycurl.CONTENT_TYPE)
        m['response-code'] = self.handle.getinfo(pycurl.RESPONSE_CODE)
        m['speed-download'] = self.handle.getinfo(pycurl.SPEED_DOWNLOAD)
        m['ssl-verifyresult'] = self.handle.getinfo(pycurl.SSL_VERIFYRESULT)
        m['filetime'] = self.handle.getinfo(pycurl.INFO_FILETIME)
        m['starttransfer-time'] = self.handle.getinfo(pycurl.STARTTRANSFER_TIME)
        m['http-connectcode'] = self.handle.getinfo(pycurl.HTTP_CONNECTCODE)
        m['httpauth-avail'] = self.handle.getinfo(pycurl.HTTPAUTH_AVAIL)
        m['proxyauth-avail'] = self.handle.getinfo(pycurl.PROXYAUTH_AVAIL)
        m['os-errno'] = self.handle.getinfo(pycurl.OS_ERRNO)
        m['num-connects'] = self.handle.getinfo(pycurl.NUM_CONNECTS)
        m['ssl-engines'] = self.handle.getinfo(pycurl.SSL_ENGINES)
        m['cookielist'] = self.handle.getinfo(pycurl.INFO_COOKIELIST)
        m['lastsocket'] = self.handle.getinfo(pycurl.LASTSOCKET)
        m['ftp-entry-path'] = self.handle.getinfo(pycurl.FTP_ENTRY_PATH)
        return m
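# A single-fetch sketch (assumes network access; example.com stands in for any
# reachable URL):
#   r = CurlResponse(pycurl.Curl(), "http://example.com/")
#   print r.header().get("Content-Type")
#   print r.status()["http-code"], len(r.body())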
def get(url, TIMEOUT=200, CONNTIMEOUT=30, request_headers=None, debug=False, proxy=None):
    crl = pycurl.Curl()
    if debug:
        crl.setopt(pycurl.VERBOSE, 1)
    crl.setopt(pycurl.TIMEOUT, TIMEOUT)
    crl.setopt(pycurl.CONNECTTIMEOUT, CONNTIMEOUT)
    if isinstance(url, unicode):
        url = url.encode('utf-8')  # pycurl wants a byte string
    # fall back to the default browser-like headers when none are given
    return CurlResponse(crl, url, request_headers or DOWNLOAD_HEADER, proxy)
def downloader(link_job):
    return get(link_job.url, proxy=link_job.proxy)
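# Usage sketch for get() (timeout values and the URL are illustrative):
#   resp = get("http://example.com/", TIMEOUT=30, CONNTIMEOUT=5)
#   print resp.status().get("response-code")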
class LinkJob:
    def __init__(self, url, proxy=None):
        self.url = url
        self.proxy = proxy
class Worker(threading.Thread):
    def __init__(self, links_queue, pages_queue, mutex):
        threading.Thread.__init__(self)
        self.links_queue = links_queue
        self.pages_queue = pages_queue
        self.mutex = mutex
    def run(self):
        try:
            while True:
                # check-and-pop must happen under the lock, otherwise another
                # worker can drain the queue between the test and the pop()
                self.mutex.acquire()
                if not self.links_queue:
                    self.mutex.release()
                    break
                link_job = self.links_queue.pop()
                self.mutex.release()
                if link_job and link_job.url:
                    response = downloader(link_job)
                    print "[%s] - [%s] -- [%s]" % (self.getName(), response.status().get("response-code"), link_job.url)
                    self.mutex.acquire()
                    self.pages_queue.append(response)
                    self.mutex.release()
        except Exception, e:
            print e
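# Design note: plain lists guarded by one Lock work here, but the standard
# Queue.Queue would give the same safety without manual locking; the lists are
# kept to stay close to the original structure.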
class Hunter():
    def __init__(self, links):
        self.workers = []
        self.pages_queue = []
        self.links_queue = links
        self.mutex = threading.Lock()
    def start(self):
        for _ in range(WORKER_SIZE):
            worker = Worker(self.links_queue, self.pages_queue, self.mutex)
            worker.setDaemon(True)
            worker.start()
            self.workers.append(worker)
    def exit(self):
        for worker in self.workers:
            worker.join()
    def work(self):
        self.start()
        self.exit()
        return self
    def show(self):
        if self.pages_queue:
            for page in self.pages_queue:
                print "[%s] - [%s] - [%s] - [%s]" % (page.status().get("response-code"), page.request_url, page.status().get("effective-url"), page.status().get("total-time"))
        print len(self.pages_queue)
if __name__ == '__main__':
    url1 = "http://theater.mtime.com/China_Beijing_Chaoyang/1009/"
    url2 = "http://www.jd.com/"
    url3 = "http://blog.chinaunix.net/uid-22920230-id-3074716.html"
    url4 = "http://www.dianping.com/hangzhou"
    url5 = "http://t.cn/Rhevig4"
    url6 = "http://esf.fang.com/agenthome/"
    urls = [url1, url2, url3, url4, url5, url6]
    # 500 jobs, cycling through the six urls round-robin
    links = [LinkJob(urls[i % 6]) for i in range(500)]
    Hunter(links).work().show()
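# A proxied run would look like the sketch below (the proxy address is a
# placeholder, not a live server):
#   jobs = [LinkJob("http://www.jd.com/", proxy="127.0.0.1:8080")]
#   Hunter(jobs).work().show()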