html--特殊字符过滤

最近碰到了部分特殊 HTML 字符实体导致的乱码问题，所以整理了一份比较全的 PHP 版本过滤代码，记录下来，也算是纪念毕业吧。

class HtmlFilter
{

    /**
     * Reduce a piece of HTML to plain text.
     *
     * Whitespace is removed first, then <script> blocks are dropped and the
     * known character entities are decoded, and finally any remaining tags
     * are removed with strip_tags().
     *
     * @static
     * @param mixed $content HTML fragment; null, false and arrays yield ''
     * @return string
     */
    public static function filterSpacialHtmlChar($content)
    {
        $text = self::toText($content);
        if ($text === '') {
            return '';
        }

        $text = self::delAllSpace($text);
        $text = self::replaceHtmlAndJs($text);

        return strip_tags($text);
    }


    /**
     * Build a plain-text summary of at most $len characters.
     *
     * The length check is made on the filtered text (not on the raw markup),
     * so '...' is appended only when text was actually cut off.
     *
     * @static
     * @param mixed  $content HTML fragment to summarise
     * @param int    $len     maximum number of characters to keep
     * @param string $char    character encoding handed to the mb_* functions
     * @return string
     */
    public static function getSummary($content, $len = 100, $char = 'UTF-8')
    {
        $text = self::filterSpacialHtmlChar($content);
        if ($text === '') {
            return '';
        }

        if (mb_strlen($text, $char) <= $len) {
            return $text;
        }

        return mb_substr($text, 0, $len, $char) . '...';
    }

    /**
     * Remove <script>...</script> blocks and decode the character entities
     * listed by getSpecialHtmlArr().
     *
     * Entities are decoded in a single pass, so text produced by one
     * replacement is never decoded a second time ("&amp;lt;" becomes "&lt;").
     * Names are matched case-sensitively first ("&alpha;" and "&Alpha;" are
     * different characters); a case-insensitive lookup is kept only as a
     * fallback for sloppy input such as "&NBSP;".
     *
     * @static
     * @param string $document
     * @return string the cleaned text; '' when the script-removal regex fails
     */
    public static function replaceHtmlAndJs($document)
    {
        $document = trim((string) $document);
        if ($document === '') {
            return $document;
        }

        $withoutScripts = preg_replace("'<script[^>]*?>.*?</script>'si", '', $document);
        if ($withoutScripts === null) {
            // PCRE failed (e.g. backtrack limit): do not hand back unfiltered markup.
            return '';
        }

        list($exact, $folded) = self::buildEntityMaps();

        $decoded = preg_replace_callback(
            '/&(#?[A-Za-z0-9]+);/',
            function ($match) use ($exact, $folded) {
                $code = $match[1];
                if (isset($exact[$code])) {
                    return $exact[$code];
                }
                $lower = strtolower($code);

                // Unknown entities are left untouched.
                return isset($folded[$lower]) ? $folded[$lower] : $match[0];
            },
            $withoutScripts
        );

        return $decoded === null ? $withoutScripts : $decoded;
    }


    /**
     * Delete every whitespace character from $str.
     *
     * Removes the ASCII space, tab, line feed and carriage return, plus the
     * non-breaking space (U+00A0) and the ideographic space (U+3000), both
     * spelled as UTF-8 byte escapes so they stay visible in the source.
     *
     * @static
     * @param string|array $str
     * @return string|array
     */
    public static function delAllSpace($str)
    {
        $blanks = array(' ', "\xC2\xA0", "\xE3\x80\x80", "\t", "\n", "\r");

        return str_replace($blanks, '', $str);
    }


    /**
     * 2012-07-05 by long
     * Table of special HTML character entities.
     *
     * Each row is array(
     *   0 => the replacement text (normally the character itself),
     *   1 => the entity name,
     *   2 => the numeric character reference (decimal, with leading '#'),
     *   3 => a human-readable description,
     * ).
     *
     * @static
     * @return array
     */
    public static function getSpecialHtmlArr()
    {
        $sh = array();

        // nbsp is replaced by the empty string, i.e. it is deleted.
        $sh[] = array("", "nbsp", "#160", "no-break space = non-breaking space");
        $sh[] = array("¡", "iexcl", "#161", "inverted exclamation mark");
        $sh[] = array("¢", "cent", "#162", "cent sign");
        $sh[] = array("£", "pound", "#163", "pound sign");
        $sh[] = array("¤", "curren", "#164", "currency sign");
        $sh[] = array("¥", "yen", "#165", "yen sign = yuan sign");
        $sh[] = array("¦", "brvbar", "#166", "broken bar = broken vertical bar");
        $sh[] = array("§", "sect", "#167", "section sign");
        $sh[] = array("¨", "uml", "#168", "diaeresis = spacing diaeresis");
        $sh[] = array("©", "copy", "#169", "copyright sign");
        $sh[] = array("ª", "ordf", "#170", "feminine ordinal indicator");
        $sh[] = array("«", "laquo", "#171", "left-pointing double angle quotation mark = left pointing guillemet");
        $sh[] = array("¬", "not", "#172", "not sign");
        // Soft hyphen (U+00AD) is invisible, so it is written as UTF-8 bytes.
        $sh[] = array("\xC2\xAD", "shy", "#173", "soft hyphen = discretionary hyphen");
        $sh[] = array("®", "reg", "#174", "registered sign = registered trade mark sign");
        $sh[] = array("¯", "macr", "#175", "macron = spacing macron = overline = APL overbar");
        $sh[] = array("°", "deg", "#176", "degree sign");
        $sh[] = array("±", "plusmn", "#177", "plus-minus sign = plus-or-minus sign");
        $sh[] = array("²", "sup2", "#178", "superscript two = superscript digit two = squared");
        $sh[] = array("³", "sup3", "#179", "superscript three = superscript digit three = cubed");
        $sh[] = array("´", "acute", "#180", "acute accent = spacing acute");
        $sh[] = array("µ", "micro", "#181", "micro sign");
        $sh[] = array("¶", "para", "#182", "pilcrow sign = paragraph sign");
        $sh[] = array("·", "middot", "#183", "middle dot = Georgian comma = Greek middle dot");
        $sh[] = array("¸", "cedil", "#184", "cedilla = spacing cedilla");
        $sh[] = array("¹", "sup1", "#185", "superscript one = superscript digit one");
        $sh[] = array("º", "ordm", "#186", "masculine ordinal indicator");
        $sh[] = array("»", "raquo", "#187", "right-pointing double angle quotation mark = right pointing guillemet");
        $sh[] = array("¼", "frac14", "#188", "vulgar fraction one quarter = fraction one quarter");
        $sh[] = array("½", "frac12", "#189", "vulgar fraction one half = fraction one half");
        $sh[] = array("¾", "frac34", "#190", "vulgar fraction three quarters = fraction three quarters");
        $sh[] = array("¿", "iquest", "#191", "inverted question mark = turned question mark");
        $sh[] = array("À", "Agrave", "#192", "latin capital letter A with grave = latin capital letter A grave");
        $sh[] = array("Á", "Aacute", "#193", "latin capital letter A with acute");
        $sh[] = array("Â", "Acirc", "#194", "latin capital letter A with circumflex");
        $sh[] = array("Ã", "Atilde", "#195", "latin capital letter A with tilde");
        $sh[] = array("Ä", "Auml", "#196", "latin capital letter A with diaeresis");
        $sh[] = array("Å", "Aring", "#197", "latin capital letter A with ring above = latin capital letter A ring");
        $sh[] = array("Æ", "AElig", "#198", "latin capital letter AE = latin capital ligature AE");
        $sh[] = array("Ç", "Ccedil", "#199", "latin capital letter C with cedilla");
        $sh[] = array("È", "Egrave", "#200", "latin capital letter E with grave");
        $sh[] = array("É", "Eacute", "#201", "latin capital letter E with acute");
        $sh[] = array("Ê", "Ecirc", "#202", "latin capital letter E with circumflex");
        $sh[] = array("Ë", "Euml", "#203", "latin capital letter E with diaeresis");
        $sh[] = array("Ì", "Igrave", "#204", "latin capital letter I with grave");
        $sh[] = array("Í", "Iacute", "#205", "latin capital letter I with acute");
        $sh[] = array("Î", "Icirc", "#206", "latin capital letter I with circumflex");
        $sh[] = array("Ï", "Iuml", "#207", "latin capital letter I with diaeresis");
        $sh[] = array("Ð", "ETH", "#208", "latin capital letter ETH");
        $sh[] = array("Ñ", "Ntilde", "#209", "latin capital letter N with tilde");
        $sh[] = array("Ò", "Ograve", "#210", "latin capital letter O with grave");
        $sh[] = array("Ó", "Oacute", "#211", "latin capital letter O with acute");
        $sh[] = array("Ô", "Ocirc", "#212", "latin capital letter O with circumflex");
        $sh[] = array("Õ", "Otilde", "#213", "latin capital letter O with tilde");
        $sh[] = array("Ö", "Ouml", "#214", "latin capital letter O with diaeresis");
        $sh[] = array("×", "times", "#215", "multiplication sign");
        $sh[] = array("Ø", "Oslash", "#216", "latin capital letter O with stroke = latin capital letter O slash");
        $sh[] = array("Ù", "Ugrave", "#217", "latin capital letter U with grave");
        $sh[] = array("Ú", "Uacute", "#218", "latin capital letter U with acute");
        $sh[] = array("Û", "Ucirc", "#219", "latin capital letter U with circumflex");
        $sh[] = array("Ü", "Uuml", "#220", "latin capital letter U with diaeresis");
        $sh[] = array("Ý", "Yacute", "#221", "latin capital letter Y with acute");
        $sh[] = array("Þ", "THORN", "#222", "latin capital letter THORN");
        $sh[] = array("ß", "szlig", "#223", "latin small letter sharp s = ess-zed");
        $sh[] = array("à", "agrave", "#224", "latin small letter a with grave = latin small letter a grave");
        $sh[] = array("á", "aacute", "#225", "latin small letter a with acute");
        $sh[] = array("â", "acirc", "#226", "latin small letter a with circumflex");
        $sh[] = array("ã", "atilde", "#227", "latin small letter a with tilde");
        $sh[] = array("ä", "auml", "#228", "latin small letter a with diaeresis");
        $sh[] = array("å", "aring", "#229", "latin small letter a with ring above = latin small letter a ring");
        $sh[] = array("æ", "aelig", "#230", "latin small letter ae = latin small ligature ae");
        $sh[] = array("ç", "ccedil", "#231", "latin small letter c with cedilla");
        $sh[] = array("è", "egrave", "#232", "latin small letter e with grave");
        $sh[] = array("é", "eacute", "#233", "latin small letter e with acute");
        $sh[] = array("ê", "ecirc", "#234", "latin small letter e with circumflex");
        $sh[] = array("ë", "euml", "#235", "latin small letter e with diaeresis");
        $sh[] = array("ì", "igrave", "#236", "latin small letter i with grave");
        $sh[] = array("í", "iacute", "#237", "latin small letter i with acute");
        $sh[] = array("î", "icirc", "#238", "latin small letter i with circumflex");
        $sh[] = array("ï", "iuml", "#239", "latin small letter i with diaeresis");
        $sh[] = array("ð", "eth", "#240", "latin small letter eth");
        $sh[] = array("ñ", "ntilde", "#241", "latin small letter n with tilde");
        $sh[] = array("ò", "ograve", "#242", "latin small letter o with grave");
        $sh[] = array("ó", "oacute", "#243", "latin small letter o with acute");
        $sh[] = array("ô", "ocirc", "#244", "latin small letter o with circumflex");
        $sh[] = array("õ", "otilde", "#245", "latin small letter o with tilde");
        $sh[] = array("ö", "ouml", "#246", "latin small letter o with diaeresis");
        $sh[] = array("÷", "divide", "#247", "division sign");
        $sh[] = array("ø", "oslash", "#248", "latin small letter o with stroke, = latin small letter o slash");
        $sh[] = array("ù", "ugrave", "#249", "latin small letter u with grave");
        $sh[] = array("ú", "uacute", "#250", "latin small letter u with acute");
        $sh[] = array("û", "ucirc", "#251", "latin small letter u with circumflex");
        $sh[] = array("ü", "uuml", "#252", "latin small letter u with diaeresis");
        $sh[] = array("ý", "yacute", "#253", "latin small letter y with acute");
        $sh[] = array("þ", "thorn", "#254", "latin small letter thorn");
        $sh[] = array("ÿ", "yuml", "#255", "latin small letter y with diaeresis");
        $sh[] = array("ƒ", "fnof", "#402", "latin small f with hook = function = florin");
        $sh[] = array("Α", "Alpha", "#913", "greek capital letter alpha");
        $sh[] = array("Β", "Beta", "#914", "greek capital letter beta");
        $sh[] = array("Γ", "Gamma", "#915", "greek capital letter gamma");
        $sh[] = array("Δ", "Delta", "#916", "greek capital letter delta");
        $sh[] = array("Ε", "Epsilon", "#917", "greek capital letter epsilon");
        $sh[] = array("Ζ", "Zeta", "#918", "greek capital letter zeta");
        $sh[] = array("Η", "Eta", "#919", "greek capital letter eta");
        $sh[] = array("Θ", "Theta", "#920", "greek capital letter theta");
        $sh[] = array("Ι", "Iota", "#921", "greek capital letter iota");
        $sh[] = array("Κ", "Kappa", "#922", "greek capital letter kappa");
        $sh[] = array("Λ", "Lambda", "#923", "greek capital letter lambda");
        $sh[] = array("Μ", "Mu", "#924", "greek capital letter mu");
        $sh[] = array("Ν", "Nu", "#925", "greek capital letter nu");
        $sh[] = array("Ξ", "Xi", "#926", "greek capital letter xi");
        $sh[] = array("Ο", "Omicron", "#927", "greek capital letter omicron");
        $sh[] = array("Π", "Pi", "#928", "greek capital letter pi");
        $sh[] = array("Ρ", "Rho", "#929", "greek capital letter rho");
        $sh[] = array("Σ", "Sigma", "#931", "greek capital letter sigma");
        $sh[] = array("Τ", "Tau", "#932", "greek capital letter tau");
        $sh[] = array("Υ", "Upsilon", "#933", "greek capital letter upsilon");
        $sh[] = array("Φ", "Phi", "#934", "greek capital letter phi");
        $sh[] = array("Χ", "Chi", "#935", "greek capital letter chi");
        $sh[] = array("Ψ", "Psi", "#936", "greek capital letter psi");
        $sh[] = array("Ω", "Omega", "#937", "greek capital letter omega");
        $sh[] = array("α", "alpha", "#945", "greek small letter alpha");
        $sh[] = array("β", "beta", "#946", "greek small letter beta");
        $sh[] = array("γ", "gamma", "#947", "greek small letter gamma");
        $sh[] = array("δ", "delta", "#948", "greek small letter delta");
        $sh[] = array("ε", "epsilon", "#949", "greek small letter epsilon");
        $sh[] = array("ζ", "zeta", "#950", "greek small letter zeta");
        $sh[] = array("η", "eta", "#951", "greek small letter eta");
        $sh[] = array("θ", "theta", "#952", "greek small letter theta");
        $sh[] = array("ι", "iota", "#953", "greek small letter iota");
        $sh[] = array("κ", "kappa", "#954", "greek small letter kappa");
        $sh[] = array("λ", "lambda", "#955", "greek small letter lambda");
        $sh[] = array("μ", "mu", "#956", "greek small letter mu");
        $sh[] = array("ν", "nu", "#957", "greek small letter nu");
        $sh[] = array("ξ", "xi", "#958", "greek small letter xi");
        $sh[] = array("ο", "omicron", "#959", "greek small letter omicron");
        $sh[] = array("π", "pi", "#960", "greek small letter pi");
        $sh[] = array("ρ", "rho", "#961", "greek small letter rho");
        $sh[] = array("ς", "sigmaf", "#962", "greek small letter final sigma");
        $sh[] = array("σ", "sigma", "#963", "greek small letter sigma");
        $sh[] = array("τ", "tau", "#964", "greek small letter tau");
        $sh[] = array("υ", "upsilon", "#965", "greek small letter upsilon");
        $sh[] = array("φ", "phi", "#966", "greek small letter phi");
        $sh[] = array("χ", "chi", "#967", "greek small letter chi");
        $sh[] = array("ψ", "psi", "#968", "greek small letter psi");
        $sh[] = array("ω", "omega", "#969", "greek small letter omega");
        $sh[] = array("ϑ", "thetasym", "#977", "greek small letter theta symbol");
        $sh[] = array("ϒ", "upsih", "#978", "greek upsilon with hook symbol");
        $sh[] = array("ϖ", "piv", "#982", "greek pi symbol");
        $sh[] = array("•", "bull", "#8226", "bullet = black small circle");
        $sh[] = array("…", "hellip", "#8230", "horizontal ellipsis = three dot leader");
        $sh[] = array("′", "prime", "#8242", "prime = minutes = feet");
        $sh[] = array("″", "Prime", "#8243", "double prime = seconds = inches");
        $sh[] = array("‾", "oline", "#8254", "overline = spacing overscore");
        $sh[] = array("⁄", "frasl", "#8260", "fraction slash");
        $sh[] = array("℘", "weierp", "#8472", "script capital P = power set = Weierstrass p");
        $sh[] = array("ℑ", "image", "#8465", "blackletter capital I = imaginary part");
        $sh[] = array("ℜ", "real", "#8476", "blackletter capital R = real part symbol");
        $sh[] = array("™", "trade", "#8482", "trade mark sign");
        $sh[] = array("ℵ", "alefsym", "#8501", "alef symbol = first transfinite cardinal");
        $sh[] = array("←", "larr", "#8592", "leftwards arrow");
        $sh[] = array("↑", "uarr", "#8593", "upwards arrow");
        $sh[] = array("→", "rarr", "#8594", "rightwards arrow");
        $sh[] = array("↓", "darr", "#8595", "downwards arrow");
        $sh[] = array("↔", "harr", "#8596", "left right arrow");
        $sh[] = array("↵", "crarr", "#8629", "downwards arrow with corner leftwards = carriage return");
        $sh[] = array("⇐", "lArr", "#8656", "leftwards double arrow");
        $sh[] = array("⇑", "uArr", "#8657", "upwards double arrow");
        $sh[] = array("⇒", "rArr", "#8658", "rightwards double arrow");
        $sh[] = array("⇓", "dArr", "#8659", "downwards double arrow");
        $sh[] = array("⇔", "hArr", "#8660", "left right double arrow");
        $sh[] = array("∀", "forall", "#8704", "for all");
        $sh[] = array("∂", "part", "#8706", "partial differential");
        $sh[] = array("∃", "exist", "#8707", "there exists");
        $sh[] = array("∅", "empty", "#8709", "empty set = null set = diameter");
        $sh[] = array("∇", "nabla", "#8711", "nabla = backward difference");
        $sh[] = array("∈", "isin", "#8712", "element of");
        $sh[] = array("∉", "notin", "#8713", "not an element of");
        $sh[] = array("∋", "ni", "#8715", "contains as member");
        $sh[] = array("∏", "prod", "#8719", "n-ary product = product sign");
        $sh[] = array("∑", "sum", "#8721", "n-ary sumation");
        $sh[] = array("−", "minus", "#8722", "minus sign");
        $sh[] = array("∗", "lowast", "#8727", "asterisk operator");
        $sh[] = array("√", "radic", "#8730", "square root = radical sign");
        $sh[] = array("∝", "prop", "#8733", "proportional to");
        $sh[] = array("∞", "infin", "#8734", "infinity");
        $sh[] = array("∠", "ang", "#8736", "angle");
        $sh[] = array("∧", "and", "#8743", "logical and = wedge");
        $sh[] = array("∨", "or", "#8744", "logical or = vee");
        $sh[] = array("∩", "cap", "#8745", "intersection = cap");
        $sh[] = array("∪", "cup", "#8746", "union = cup");
        $sh[] = array("∫", "int", "#8747", "integral");
        $sh[] = array("∴", "there4", "#8756", "therefore");
        $sh[] = array("∼", "sim", "#8764", "tilde operator = varies with = similar to");
        $sh[] = array("≅", "cong", "#8773", "approximately equal to");
        $sh[] = array("≈", "asymp", "#8776", "almost equal to = asymptotic to");
        $sh[] = array("≠", "ne", "#8800", "not equal to");
        $sh[] = array("≡", "equiv", "#8801", "identical to");
        $sh[] = array("≤", "le", "#8804", "less-than or equal to");
        $sh[] = array("≥", "ge", "#8805", "greater-than or equal to");
        $sh[] = array("⊂", "sub", "#8834", "subset of");
        $sh[] = array("⊃", "sup", "#8835", "superset of");
        $sh[] = array("⊄", "nsub", "#8836", "not a subset of");
        $sh[] = array("⊆", "sube", "#8838", "subset of or equal to");
        $sh[] = array("⊇", "supe", "#8839", "superset of or equal to");
        $sh[] = array("⊕", "oplus", "#8853", "circled plus = direct sum");
        $sh[] = array("⊗", "otimes", "#8855", "circled times = vector product");
        $sh[] = array("⊥", "perp", "#8869", "up tack = orthogonal to = perpendicular");
        $sh[] = array("⋅", "sdot", "#8901", "dot operator");
        $sh[] = array("⌈", "lceil", "#8968", "left ceiling = apl upstile");
        $sh[] = array("⌉", "rceil", "#8969", "right ceiling");
        $sh[] = array("⌊", "lfloor", "#8970", "left floor = apl downstile");
        $sh[] = array("⌋", "rfloor", "#8971", "right floor");
        $sh[] = array("⟨", "lang", "#9001", "left-pointing angle bracket = bra");
        $sh[] = array("⟩", "rang", "#9002", "right-pointing angle bracket = ket");
        $sh[] = array("◊", "loz", "#9674", "lozenge");
        $sh[] = array("♠", "spades", "#9824", "black spade suit");
        $sh[] = array("♣", "clubs", "#9827", "black club suit = shamrock");
        $sh[] = array("♥", "hearts", "#9829", "black heart suit = valentine");
        $sh[] = array("♦", "diams", "#9830", "black diamond suit");
        $sh[] = array('"', "quot", "#34", "quotation mark = APL quote");
        $sh[] = array("&", "amp", "#38", "ampersand");
        $sh[] = array("<", "lt", "#60", "less-than sign");
        $sh[] = array(">", "gt", "#62", "greater-than sign");
        $sh[] = array("Œ", "OElig", "#338", "latin capital ligature OE");
        $sh[] = array("œ", "oelig", "#339", "latin small ligature oe");
        $sh[] = array("Š", "Scaron", "#352", "latin capital letter S with caron");
        $sh[] = array("š", "scaron", "#353", "latin small letter s with caron");
        $sh[] = array("Ÿ", "Yuml", "#376", "latin capital letter Y with diaeresis");
        $sh[] = array("ˆ", "circ", "#710", "modifier letter circumflex accent");
        // The small tilde is replaced by the empty string, i.e. it is deleted.
        $sh[] = array('', "tilde", "#732", "small tilde");
        // NOTE(review): the three replacements below are whitespace literals;
        // confirm whether plain spaces or the real U+2002/U+2003/U+2009
        // characters are wanted here.
        $sh[] = array(" ", "ensp", "#8194", "en space");
        $sh[] = array(" ", "emsp", "#8195", "em space");
        $sh[] = array(" ", "thinsp", "#8201", "thin space");
        $sh[] = array("–", "ndash", "#8211", "en dash");
        $sh[] = array("—", "mdash", "#8212", "em dash");
        $sh[] = array("‘", "lsquo", "#8216", "left single quotation mark");
        $sh[] = array("’", "rsquo", "#8217", "right single quotation mark");
        $sh[] = array("‚", "sbquo", "#8218", "single low-9 quotation mark");
        $sh[] = array("“", "ldquo", "#8220", "left double quotation mark");
        $sh[] = array("”", "rdquo", "#8221", "right double quotation mark");
        $sh[] = array("„", "bdquo", "#8222", "double low-9 quotation mark");
        $sh[] = array("†", "dagger", "#8224", "dagger");
        $sh[] = array("‡", "Dagger", "#8225", "double dagger");
        $sh[] = array("‰", "permil", "#8240", "per mille sign");
        $sh[] = array("‹", "lsaquo", "#8249", "single left-pointing angle quotation mark");
        $sh[] = array("›", "rsaquo", "#8250", "single right-pointing angle quotation mark");
        $sh[] = array("€", "euro", "#8364", "euro sign");
        // Non-standard spelling of bdquo together with the Windows-1252 code
        // position; both are kept as legacy aliases.
        $sh[] = array("„", "dbquo", "#132", "double low-9 quotation mark (legacy alias)");

        return $sh;
    }

    /**
     * Normalise the input of the public filters to a string.
     *
     * Unlike empty(), this keeps the literal text "0".
     *
     * @static
     * @param mixed $content
     * @return string '' for null, false and arrays; the string cast otherwise
     */
    private static function toText($content)
    {
        if ($content === null || $content === false || is_array($content)) {
            return '';
        }

        return (string) $content;
    }

    /**
     * Index the entity table by entity code for direct lookup.
     *
     * @static
     * @return array array(0 => exact-case map, 1 => lower-cased map); in both
     *               maps the first row that defines a code wins
     */
    private static function buildEntityMaps()
    {
        $exact = array();
        $folded = array();

        foreach (self::getSpecialHtmlArr() as $row) {
            foreach (array(1, 2) as $column) {
                if (!isset($row[$column])) {
                    continue;
                }

                // Tolerate rows that carry a trailing ';' in the code column.
                $code = rtrim($row[$column], ';');
                if ($code === '') {
                    continue;
                }

                if (!isset($exact[$code])) {
                    $exact[$code] = $row[0];
                }

                $lower = strtolower($code);
                if (!isset($folded[$lower])) {
                    $folded[$lower] = $row[0];
                }
            }
        }

        return array($exact, $folded);
    }
}

 

posted on 2012-07-06 20:37  空山幽泉  阅读(2683)  评论(0)  编辑  收藏  举报

导航