提取HTML代码中文字的C#函数

/// <summary>
/// Removes HTML markup from a string and decodes a small set of character entities,
/// returning the remaining text.
/// </summary>
/// <remarks>
/// The rewrite rules are applied one after another, each to the output of the previous
/// rule, so their order matters. After the rules have run, any remaining angle brackets
/// and CR/LF pairs are removed from the result.
/// </remarks>
/// <param name="strHtml">The HTML source to clean. May be null or empty.</param>
/// <returns>
/// The text with markup removed; an empty string when <paramref name="strHtml"/> is null or empty.
/// </returns>
public static string StripHTML(string strHtml)
{
    // Nothing to strip; also avoids passing null to Regex.Replace.
    if (string.IsNullOrEmpty(strHtml))
    {
        return string.Empty;
    }

    // Each row is { pattern, replacement }. Keeping the pair together means the
    // two can never drift out of step the way two parallel arrays can.
    string[,] rules =
    {
        // Whole script blocks. [\s\S] is used instead of '.' so that scripts
        // spanning several lines are removed together with their content.
        { @"<script[^>]*?>[\s\S]*?</script>", "" },
        // Any opening, closing or self-closing tag, including its attributes.
        { @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>", "" },
        // A line break followed by further whitespace.
        { @"([\r\n])[\s]+", "" },
        // Named / numeric entities that are decoded to their character.
        { @"&(quot|#34);", "\"" },
        { @"&(amp|#38);", "&" },
        { @"&(lt|#60);", "<" },
        { @"&(gt|#62);", ">" },
        { @"&(nbsp|#160);", " " },
        { @"&(iexcl|#161);", "\u00A1" },
        { @"&(cent|#162);", "\u00A2" },
        { @"&(pound|#163);", "\u00A3" },
        { @"&(copy|#169);", "\u00A9" },
        // Every other numeric entity is dropped rather than decoded.
        { @"&#(\d+);", "" },
        // HTML comments: the closing marker becomes a line break, and an opening
        // marker is removed together with the rest of its line.
        { @"-->", "\r\n" },
        { @"<!--.*\n", "" }
    };

    string strOutput = strHtml;
    for (int i = 0; i < rules.GetLength(0); i++)
    {
        strOutput = Regex.Replace(strOutput, rules[i, 0], rules[i, 1], RegexOptions.IgnoreCase);
    }

    // string.Replace returns a new string; the result must be assigned back,
    // otherwise these clean-up steps have no effect.
    strOutput = strOutput.Replace("<", "");
    strOutput = strOutput.Replace(">", "");
    strOutput = strOutput.Replace("\r\n", "");
    return strOutput;
}

posted @ 2008-07-03 11:09  superfang  阅读(284)  评论(0)  编辑  收藏  举报