urllib源码简单分析
对下面这段代码做分析
import urllib params = urllib.urlencode({'wd': 'python'}) f = urllib.urlopen("http://www.baidu.com/s?%s" % params) print f.read()
这是一段简单读取url内容的代码
此处最关键的是urlopen,通过查看,可以看到urlopen的代码如下
def urlopen(url, data=None, proxies=None): """Create a file-like object for the specified URL to read from.""" from warnings import warnpy3k warnpy3k("urllib.urlopen() has been removed in Python 3.0 in " "favor of urllib2.urlopen()", stacklevel=2) global _urlopener if proxies is not None: opener = FancyURLopener(proxies=proxies) elif not _urlopener: opener = FancyURLopener() _urlopener = opener else: opener = _urlopener if data is None: return opener.open(url) else: return opener.open(url, data)
通过一个FancyURLopener的opener实例,因为这里没有proxies参数,所以调用到opener = FancyURLopener()这一句。
然后返回opener.open(url),绑定到f实例上。在这里,有两个关键,一个是opener实例,一个是open方法。
先来说说opener,opener是FancyURLopener()的对象,而FancyURLopener的父类是URLopener基类,而FancyURLopener这个类本身只做了一些http的异常响应处理,因此我们需要了解核心的基类,也就是看看URLopener到底做了什么?
URLopener:通过查看源码,发现URLopener的主要处理方法是open。
def open(self, fullurl, data=None): """Use URLopener().open(file) instead of open(file, 'r').""" fullurl = unwrap(toBytes(fullurl)) # percent encode url, fixing lame server errors for e.g, like space # within url paths. fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|") if self.tempcache and fullurl in self.tempcache: filename, headers = self.tempcache[fullurl] fp = open(filename, 'rb') return addinfourl(fp, headers, fullurl) urltype, url = splittype(fullurl) if not urltype: urltype = 'file' if urltype in self.proxies: proxy = self.proxies[urltype] urltype, proxyhost = splittype(proxy) host, selector = splithost(proxyhost) url = (host, fullurl) # Signal special case to open_*() else: proxy = None name = 'open_' + urltype self.type = urltype name = name.replace('-', '_') if not hasattr(self, name): if proxy: return self.open_unknown_proxy(proxy, fullurl, data) else: return self.open_unknown(fullurl, data) try: if data is None: return getattr(self, name)(url) else: return getattr(self, name)(url, data) except socket.error, msg: raise IOError, ('socket error', msg), sys.exc_info()[2]
open通过处理url,将name拼接成name = 'open_' + urltype的格式,也就是说,如果是http请求,则name为open_http。在上文那段代码里,最后调用返回的是getattr(self, name)(url),而由于name变成了open_http,则继续调用open_http方法。在这里可以看出,urllib根据你的type来给出不同的方法作处理。
那么又要看看open_http干了什么.
通过debug发现,open_http其实是用httplib来做底层处理的
def open_http(self, url, data=None): """Use HTTP protocol.""" import httplib user_passwd = None proxy_passwd= None if isinstance(url, str): host, selector = splithost(url) if host: user_passwd, host = splituser(host) host = unquote(host) realhost = host else: host, selector = url # check whether the proxy contains authorization information proxy_passwd, host = splituser(host) # now we proceed with the url we want to obtain urltype, rest = splittype(selector) url = rest user_passwd = None if urltype.lower() != 'http': realhost = None else: realhost, rest = splithost(rest) if realhost: user_passwd, realhost = splituser(realhost) if user_passwd: selector = "%s://%s%s" % (urltype, realhost, rest) if proxy_bypass(realhost): host = realhost #print "proxy via http:", host, selector if not host: raise IOError, ('http error', 'no host given') if proxy_passwd: proxy_passwd = unquote(proxy_passwd) proxy_auth = base64.b64encode(proxy_passwd).strip() else: proxy_auth = None if user_passwd: user_passwd = unquote(user_passwd) auth = base64.b64encode(user_passwd).strip() else: auth = None h = httplib.HTTP(host) if data is not None: h.putrequest('POST', selector) h.putheader('Content-Type', 'application/x-www-form-urlencoded') h.putheader('Content-Length', '%d' % len(data)) else: h.putrequest('GET', selector) if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth) if auth: h.putheader('Authorization', 'Basic %s' % auth) if realhost: h.putheader('Host', realhost) for args in self.addheaders: h.putheader(*args) h.endheaders(data) errcode, errmsg, headers = h.getreply() fp = h.getfile()
红色加粗为核心处理部分。可以看出,这是通过切割host和请求参数后来对服务器发起请求并处理response的过程。
到目前为止,我们可以发现,真正向服务器发起请求的是这一句:h.putrequest('GET', selector).
那么继续追踪定位,
hdr = '%s %s %s' % (method, url, self._http_vsn_str),实际hdr为:'GET /s?wd=python HTTP/1.0',然后输出self._output(hdr),而这个_output的作用是向当前请求缓冲区添加一行输出。然后通过以下方法返回buffer中的内容放置在一个fp的对象里。
try: if not buffering: response = self._conn.getresponse() else: #only add this keyword if non-default for compatibility #with other connection classes response = self._conn.getresponse(buffering) except BadStatusLine, e: ### hmm. if getresponse() ever closes the socket on a bad request, ### then we are going to have problems with self.sock ### should we keep this behavior? do people use it? # keep the socket open (as a file), and return it self.file = self._conn.sock.makefile('rb', 0) # close our socket -- we want to restart after any protocol error self.close() self.headers = None return -1, e.line, None
最后通过一个迭代器不断读回文件内容。
class addbase: """Base class for addinfo and addclosehook.""" def __init__(self, fp): self.fp = fp self.read = self.fp.read self.readline = self.fp.readline if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines if hasattr(self.fp, "fileno"): self.fileno = self.fp.fileno else: self.fileno = lambda: None if hasattr(self.fp, "__iter__"): self.__iter__ = self.fp.__iter__ if hasattr(self.fp, "next"): self.next = self.fp.next ...