提取HTML代码中文字的C#函数

/// <summary>
/// 去除HTML标记
/// </summary>
/// <param name="strHtml">包括HTML的源码 </param>
/// <returns>已经去除后的文字</returns>
public static string StripHTML(string strHtml)
{
string [] aryReg ={
          @"<script[^>]*?>.*?</script>",
          @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",
          @"([\r\n])[\s]+",
          @"&(quot|#34);",
          @"&(amp|#38);",
          @"&(lt|#60);",
          @"&(gt|#62);",
          @"&(nbsp|#160);",
          @"&(iexcl|#161);",
          @"&(cent|#162);",
          @"&(pound|#163);",
          @"&(copy|#169);",
          @"&#(\d+);",
          @"-->",
          @"<!--.*\n"
        };
string [] aryRep = {
          "",
          "",
          "",
          "\"",
          "&",
          "<",
          ">",
          " ",
          "\xa1",//chr(161),
          "\xa2",//chr(162),
          "\xa3",//chr(163),
          "\xa9",//chr(169),
          "",
          "\r\n",
          ""
          };
string newReg =aryReg[0];
string strOutput=strHtml;
for(int i = 0;i<aryReg.Length;i++)
{
    Regex regex = new Regex(aryReg[i],RegexOptions.IgnoreCase );
    strOutput = regex.Replace(strOutput,aryRep[i]);
}
strOutput.Replace("<","");
strOutput.Replace(">","");
strOutput.Replace("\r\n","");
return strOutput;
}

posted @ 2008-07-03 11:09 superfang 阅读(284) 评论(0) 编辑收藏举报

快乐程序员

编程的快乐来自过程而非结果。

提取HTML代码中文字的C#函数

公告