python标准库之urllib, httplib, urllib2
urllib.urlencode()
urllib2.urlopen()
urlib2学习指南:
查看源码可知是这样的路线。
数据流向
urllib2.urlopen() ->opener.open() -> Request() -> _open() -> 等等 -> httplib库
由此大致可以得出,要访问一个网址,可以使用如下几种方法。
urlopen()
opener.open()
Request()之后使用urlopen或者operner.open()
也就是说Request()这个类不提供open的操作,只是对数据的封装。
class Request: def __init__(self, url, data=None, headers={}, origin_req_host=None, unverifiable=False): # unwrap('<URL:type://host/path>') --> 'type://host/path' self.__original = unwrap(url) self.__original, self.__fragment = splittag(self.__original) self.type = None # self.__r_type is what's left after doing the splittype self.host = None self.port = None self._tunnel_host = None self.data = data self.headers = {} for key, value in headers.items(): self.add_header(key, value) self.unredirected_hdrs = {} if origin_req_host is None: origin_req_host = request_host(self) self.origin_req_host = origin_req_host self.unverifiable = unverifiable def __getattr__(self, attr): # XXX this is a fallback mechanism to guard against these # methods getting called in a non-standard order. this may be # too complicated and/or unnecessary. # XXX should the __r_XXX attributes be public? if attr[:12] == '_Request__r_': name = attr[12:] if hasattr(Request, 'get_' + name): getattr(self, 'get_' + name)() return getattr(self, attr) raise AttributeError, attr def get_method(self): if self.has_data(): return "POST" else: return "GET" # XXX these helper methods are lame def add_data(self, data): self.data = data def has_data(self): return self.data is not None def get_data(self): return self.data def get_full_url(self): if self.__fragment: return '%s#%s' % (self.__original, self.__fragment) else: return self.__original def get_type(self): if self.type is None: self.type, self.__r_type = splittype(self.__original) if self.type is None: raise ValueError, "unknown url type: %s" % self.__original return self.type def get_host(self): if self.host is None: self.host, self.__r_host = splithost(self.__r_type) if self.host: self.host = unquote(self.host) return self.host def get_selector(self): return self.__r_host def set_proxy(self, host, type): if self.type == 'https' and not self._tunnel_host: self._tunnel_host = self.host else: self.type = type self.__r_host = self.__original self.host = host def has_proxy(self): return self.__r_host == self.__original def get_origin_req_host(self): return self.origin_req_host def is_unverifiable(self): return self.unverifiable def add_header(self, key, val): # useful for something like authentication self.headers[key.capitalize()] = val def add_unredirected_header(self, key, val): # will not be added to a redirected request self.unredirected_hdrs[key.capitalize()] = val def has_header(self, header_name): return (header_name in self.headers or header_name in self.unredirected_hdrs) def get_header(self, header_name, default=None): return self.headers.get( header_name, self.unredirected_hdrs.get(header_name, default)) def header_items(self): hdrs = self.unredirected_hdrs.copy() hdrs.update(self.headers) return hdrs.items()
opener是OpenDirector()的实例。
可以通过build_opener()函数, 也可以通过OpenerDirector实例化一个opener,然后调用add_handler方法。正所谓条条大道通罗马。
install_opener()可以让opener对urlopen()也生效
def install_opener(opener): global _opener _opener = opener
global
语句被用来声明
xxx
是全局的——因此,当我们在函数内把值赋给
xxx
的时候,这个变化也反映在我们在主块中使用
x
的值的时候
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, cafile=None, capath=None, cadefault=False, context=None): global _opener
something