该方法实现网页编码的自动识别和转换
"""
该方法实现网页编码的自动识别和转换
"""
# NOTE: the third-party chardet library is not fully reliable -- it has been
# seen to report GBK-encoded pages as Windows-1254. Such pages match none of
# the branches below and are returned unchanged.
@retry(stop_max_attempt_number=5, wait_random_min=2000, wait_random_max=20000)
def page_trancode(content):
    """Detect the encoding of a fetched page and convert it towards UTF-8.

    The encoding label is taken from ``chardet.detect(content)['encoding']``
    and compared case-insensitively.

    Args:
        content: Raw page body as received from the network.
            NOTE(review): assumed to be a byte string -- confirm with callers.

    Returns:
        * ``content`` unchanged when the label is ``utf-8``, when no encoding
          could be detected, or when the label is not handled here.
        * ``content`` decoded as GBK (undecodable bytes dropped) and
          re-encoded as UTF-8 when the label is ``gbk``.
        * ``str()`` of the document parsed by BeautifulSoup with GBK as the
          source encoding when the label is ``gb2312``.
        * ``content`` passed through ``unicode_escape`` decoding when the
          label is ``unicode``.

    NOTE(review): ``retry`` is presumably the ``retrying`` decorator (up to
    five attempts with a random 2-20 s pause) -- confirm against the imports.
    """
    encoding = chardet.detect(content)['encoding']
    if not encoding:
        # chardet reports None for empty or undecidable input. The previous
        # substring test (`None in "GB2312"`) raised TypeError in that case.
        return content

    # Normalise once so every comparison below is an exact, case-insensitive
    # match instead of a mix of case-sensitive equality and substring tests.
    encoding = encoding.lower()

    if encoding == 'utf-8':
        return content
    if encoding == 'gbk':
        return content.decode('gbk', 'ignore').encode('utf-8')
    if encoding == 'gb2312':
        # GBK is a superset of GB2312, so parse with the wider codec.
        # `from_encoding` is the BeautifulSoup 4 spelling of the old
        # `fromEncoding` keyword.
        return str(BeautifulSoup(content, 'html.parser', from_encoding='GBK'))
    if encoding == 'unicode':
        # NOTE(review): `.encode` only exists on text strings (or Python 2
        # byte strings); verify this branch is reachable with real input.
        return content.encode('utf-8').decode('unicode_escape')
    return content
如果觉得对您有帮助,麻烦您点一下推荐,谢谢!
好记忆不如烂笔头
好记忆不如烂笔头