python3下urlopen解析中文url编码错误
这是在ipython下测试的结果:
In [24]: x Out[24]: 'http://127.0.0.1:8000/xxx/?id=a45ex0bad3c9&game=五子棋' In [25]: urlopen(x) --------------------------------------------------------------------------- UnicodeEncodeError Traceback (most recent call last) <ipython-input-25-441e6d61f53c> in <module>() ----> 1 urlopen(x) /Library/Frameworks/Python.framework/Versions/3.3/lib/python3.3/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault) 154 else: 155 opener = _opener --> 156 return opener.open(url, data, timeout) 157 158 def install_opener(opener): /Library/Frameworks/Python.framework/Versions/3.3/lib/python3.3/urllib/request.py in open(self, fullurl, data, timeout) 467 req = meth(req) 468 --> 469 response = self._open(req, data) 470 471 # post-process response /Library/Frameworks/Python.framework/Versions/3.3/lib/python3.3/urllib/request.py in _open(self, req, data) 485 protocol = req.type 486 result = self._call_chain(self.handle_open, protocol, protocol + --> 487 '_open', req) 488 if result: 489 return result /Library/Frameworks/Python.framework/Versions/3.3/lib/python3.3/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args) 445 for handler in handlers: 446 func = getattr(handler, meth_name) --> 447 result = func(*args) 448 if result is not None: 449 return result /Library/Frameworks/Python.framework/Versions/3.3/lib/python3.3/urllib/request.py in http_open(self, req) 1272 1273 def http_open(self, req): -> 1274 return self.do_open(http.client.HTTPConnection, req) 1275 1276 http_request = AbstractHTTPHandler.do_request_ /Library/Frameworks/Python.framework/Versions/3.3/lib/python3.3/urllib/request.py in do_open(self, http_class, req, **http_conn_args) 1246 1247 try: -> 1248 h.request(req.get_method(), req.selector, req.data, headers) 1249 except socket.error as err: # timeout error 1250 h.close() /Library/Frameworks/Python.framework/Versions/3.3/lib/python3.3/http/client.py in request(self, method, url, body, headers) 1063 def request(self, method, url, 
body=None, headers={}): 1064 """Send a complete request to the server.""" -> 1065 self._send_request(method, url, body, headers) 1066 1067 def _set_content_length(self, body): /Library/Frameworks/Python.framework/Versions/3.3/lib/python3.3/http/client.py in _send_request(self, method, url, body, headers) 1091 skips['skip_accept_encoding'] = 1 1092 -> 1093 self.putrequest(method, url, **skips) 1094 1095 if body is not None and ('content-length' not in header_names): /Library/Frameworks/Python.framework/Versions/3.3/lib/python3.3/http/client.py in putrequest(self, method, url, skip_host, skip_accept_encoding) 955 956 # Non-ASCII characters should have been eliminated earlier --> 957 self._output(request.encode('ascii')) 958 959 if self._http_vsn == 11: UnicodeEncodeError: 'ascii' codec can't encode characters in position 46-48: ordinal not in range(128)
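原因是 URL 中的中文字符无法编码为 ASCII:从上面的 traceback 可以看到,http.client 的 putrequest 在发送请求行时会执行 request.encode('ascii'),一旦遇到非 ASCII 字符就会抛出 UnicodeEncodeError。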
观察实际提交到后台的含中文 URL 可以发现,其中的中文最终是先转换成百分号编码(percent-encoding)再提交的。所以我们只需要先对中文部分进行百分号编码,得到的 URL 就只包含 ASCII 字符,可以正常 encode 为 ASCII 了。
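Python 3 中提供了完成这种编码的函数 quote。它定义在 urllib.parse 模块中(urllib.request 只是在内部导入了它,所以下面的写法也能用,但更推荐 from urllib.parse import quote)。注意只需对 URL 中的中文部分(例如参数值“五子棋”)调用 quote,不要对整个 URL 调用,否则 “:”、“?”、“=”、“&” 等分隔符也会被一并编码: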
In [36]: from urllib.request import quote In [37]: quote('你好') Out[37]: '%E4%BD%A0%E5%A5%BD'