Python requests 模块:模拟请求时响应内容乱码问题(源码分析)
def request(url, data=None, get_or_post=None):
    """Send an HTTP request and return the response body as decoded text.

    Args:
        url: Target URL.
        data: Optional parameters. Sent as the form body for a POST, or as
            the URL query string for a GET.
        get_or_post: Truthy to send a POST; falsy (the default) to send a GET.

    Returns:
        The decoded body as ``str`` when the status code is 200. ``None`` for
        any other status code, or when the request raises RequestException.

    Notes:
        Why ``response.text`` is not returned directly (requests source
        analysis):

        * ``HTTPAdapter.build_response`` sets ``response.encoding`` from
          ``get_encoding_from_headers(response.headers)``. That helper
          returns the ``charset`` parameter of ``Content-Type`` when present;
          otherwise, if the content type contains ``text``, it returns the
          default ``'ISO-8859-1'``.
        * ``Response.text`` is ``str(self.content, encoding,
          errors='replace')``, i.e. the raw bytes decoded with that encoding.
        * The JD search page observed here answered with
          ``Content-Type: text/html`` (no charset), so ``response.encoding``
          was ``ISO-8859-1`` while the body was really UTF-8
          (``response.apparent_encoding`` reported ``utf-8`` and
          ``requests.utils.get_encodings_from_content(response.text)``
          reported ``['utf-8']`` from the HTML itself). Decoding UTF-8 bytes
          as ISO-8859-1 yields mojibake.

        ``response.apparent_encoding`` is detected from the body bytes (by
        chardet / charset_normalizer); it is not read from the headers.

        The fix is to decode the raw bytes (``response.content``) with the
        detected encoding. This gives the same result as the round trip
        ``response.text.encode(response.encoding).decode(
        response.apparent_encoding)`` in the ISO-8859-1 case, but does not
        fail when either encoding is ``None`` or when ``response.text``
        already contains replacement characters that cannot be re-encoded.
    """
    try:
        if get_or_post:
            response = requests.post(url=url, data=data, headers=headers)
        else:
            # Let requests build the query string: this works whether or not
            # ``url`` already ends with '?' or already carries a query, and
            # adds nothing when ``data`` is None or empty.
            response = requests.get(url=url, params=data, headers=headers)
    except RequestException:
        print('请求' + url + '出错')
        return None

    if response.status_code != 200:
        return None

    # Prefer the encoding detected from the body; fall back to the
    # header-derived one, then to UTF-8, so a None value never reaches decode().
    encoding = response.apparent_encoding or response.encoding or 'utf-8'
    try:
        # errors='replace' mirrors Response.text: a wrong guess degrades to
        # replacement characters instead of raising UnicodeDecodeError.
        return response.content.decode(encoding, errors='replace')
    except LookupError:
        # The reported encoding name is unknown to Python's codec registry.
        return response.content.decode('utf-8', errors='replace')
def search(keyword, page):
    """Request one page of search results from search.jd.com.

    Args:
        keyword: Search term, sent as the ``keyword`` query parameter.
        page: Page value, sent as the ``page`` query parameter.

    Returns:
        Whatever ``request`` returns for the search URL and these parameters.
    """
    # ``enc`` is always sent as "utf-8" alongside the caller's values.
    query = dict(keyword=keyword, enc="utf-8", page=page)
    return request("https://search.jd.com/Search?", query)
# Example run: fetch page 2 of the results for '显卡' (graphics card).
html = search(keyword='显卡', page=2)