Python HTML特殊符号的转义与反转义
需求:在 Web 开发中,经常需要把文本里的 HTML 特殊符号(<、>、&、双引号、单引号)转义为对应的实体,使浏览器将其当作普通字符串显示,而不是解析成标签或脚本,从而减少 XSS 等前端注入攻击。
注意:此代码来源于 Tornado 源码(tornado.escape 模块)。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""HTML/XHTML escaping and unescaping helpers (adapted from Tornado)."""
import re
import html.entities
import typing
from typing import Union, Optional, Dict

# Types that ``to_unicode`` returns unchanged.
_TO_UNICODE_TYPES = (str, type(None))


def to_unicode(value: Union[None, str, bytes]) -> Optional[str]:  # noqa: F811
    """Convert a byte string to ``str``.

    :param value: ``str``, ``bytes`` or ``None``.
    :return: ``value`` itself when it is already ``str`` or ``None``;
        otherwise the bytes decoded as UTF-8.
    :raises TypeError: if ``value`` is not ``str``, ``bytes`` or ``None``.
    """
    if isinstance(value, _TO_UNICODE_TYPES):
        return value
    if not isinstance(value, bytes):
        raise TypeError("Expected bytes, unicode, or None; got %r" % type(value))
    return value.decode("utf-8")


_XHTML_ESCAPE_RE = re.compile("[&<>\"']")
# Each special character maps to its entity. The values must be the entity
# text, not the raw character, otherwise escaping is a no-op.
_XHTML_ESCAPE_DICT = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
}


def xhtml_escape(value: Union[str, bytes]) -> str:
    """Escape ``<``, ``>``, ``"``, ``'`` and ``&`` as HTML entities.

    :param value: text to escape; ``bytes`` are decoded as UTF-8 first.
    :return: the text with the five special characters replaced.
    """
    return _XHTML_ESCAPE_RE.sub(
        lambda match: _XHTML_ESCAPE_DICT[match.group(0)], to_unicode(value)
    )


def _build_unicode_map() -> Dict[str, str]:
    """Build a mapping from HTML entity name to the character it denotes.

    :return: dict such as ``{"lt": "<", "amp": "&", ...}`` derived from
        ``html.entities.name2codepoint``.
    """
    return {
        name: chr(codepoint)
        for name, codepoint in html.entities.name2codepoint.items()
    }


_HTML_UNICODE_MAP = _build_unicode_map()


def _convert_entity(m: typing.Match) -> str:
    """``re.sub`` callback that turns one matched entity into its character.

    Group 1 is ``"#"`` for numeric references and empty for named ones;
    group 2 is the number (decimal, or hex with an ``x`` prefix) or the name.
    Anything that cannot be resolved is returned unchanged.
    """
    if m.group(1) == "#":
        try:
            if m.group(2)[:1].lower() == "x":
                return chr(int(m.group(2)[1:], 16))
            else:
                return chr(int(m.group(2)))
        except (ValueError, OverflowError):
            # ValueError: not a number, or outside the Unicode range.
            # OverflowError: chr() is given a number too large for a C int.
            return "&#%s;" % m.group(2)
    try:
        return _HTML_UNICODE_MAP[m.group(2)]
    except KeyError:
        return "&%s;" % m.group(2)


def xhtml_unescape(value: Union[str, bytes]) -> str:
    """Replace HTML entities in ``value`` with the characters they denote.

    :param value: text to unescape; ``bytes`` are decoded as UTF-8 first.
    :return: the text with named and numeric entities resolved; entities
        that cannot be resolved are left as they are.
    """
    return re.sub(r"&(#?)(\w+?);", _convert_entity, to_unicode(value))


if __name__ == '__main__':
    src_text = '<script>alert(1)</script>'
    ret_escape = xhtml_escape(src_text)
    print(ret_escape)
    reback = xhtml_unescape(ret_escape)
    print(reback)
    # Expected output:
    #   &lt;script&gt;alert(1)&lt;/script&gt;
    #   <script>alert(1)</script>