将Html代码转换为Text

在抓取 HTML 页面时,往往需要过滤掉 HTML 标签,只保留源代码中的文本(Text)。使用正则表达式可以解决这个问题:
VB.NET
    ''' -----------------------------------------------------------------------------
    ''' <summary>
    ''' 移除所有的html标签
    ''' </summary>
    ''' <param name="HTML">html代码</param>
    ''' <returns></returns>
    ''' <remarks>
    ''' </remarks>
    ''' <history>
    '''     [Administrator]    2004-9-25    Created
    ''' </history>
    ''' -----------------------------------------------------------------------------
    Public Function ParseTags(ByVal HTML As StringAs String
        
' 使用正则表达式识别并移除所有的html标签,返回过滤掉Html标签的文本
        Dim objRegEx As System.Text.RegularExpressions.Regex
        
Return objRegEx.Replace(HTML, "<[^>]*>""")
    
End Function

C#
        /// <summary>
        
/// 移除所有的html标签
        
/// </summary>
        
/// <param name="HTML">html源代码</param>
        
/// <returns></returns>

        public string ParseTags(string HTML) 
        

            
return System.Text.RegularExpressions.Regex.Replace(HTML, "<[^>]*>"""); 
        }
提供一简单示例如下:
VB.NET
    Private Sub Page_Load(ByVal sender As System.ObjectByVal e As System.EventArgs) Handles MyBase.Load
        
Dim oStringBuilder As System.Text.StringBuilder

        oStringBuilder 
= New System.Text.StringBuilder
        oStringBuilder.Append(ControlChars.CrLf 
+ "<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Transitional//EN"">")
        oStringBuilder.Append(ControlChars.CrLf 
+ "<HTML>")
        oStringBuilder.Append(ControlChars.CrLf 
+ "    <HEAD>")
        oStringBuilder.Append(ControlChars.CrLf 
+ "        <title>WebForm1</title>")
        oStringBuilder.Append(ControlChars.CrLf 
+ "        <meta name=""GENERATOR"" content=""Microsoft Visual Studio .NET 7.1"">")
        oStringBuilder.Append(ControlChars.CrLf 
+ "        <meta name=""CODE_LANGUAGE"" content=""Visual Basic .NET 7.1"">")
        oStringBuilder.Append(ControlChars.CrLf 
+ "        <meta name=""vs_defaultClientScript"" content=""JavaScript"">")
        oStringBuilder.Append(ControlChars.CrLf 
+ "        <meta name=""vs_targetSchema"" content=""http://schemas.microsoft.com/intellisense/ie5"">")
        oStringBuilder.Append(ControlChars.CrLf 
+ "    </HEAD>")
        oStringBuilder.Append(ControlChars.CrLf 
+ "    <body MS_POSITIONING=""GridLayout"">")
        oStringBuilder.Append(ControlChars.CrLf 
+ "        <form id=""Form1"" method=""post"" runat=""server"">")
        oStringBuilder.Append(ControlChars.CrLf 
+ "            <FONT face=""宋体"">测试</FONT>")
        oStringBuilder.Append(ControlChars.CrLf 
+ "        </form>")
        oStringBuilder.Append(ControlChars.CrLf 
+ "    </body>")
        oStringBuilder.Append(ControlChars.CrLf 
+ "</HTML>")
        Response.
Write(ParseTags(oStringBuilder.ToString))
    
End Sub

C#
        private void Page_Load(object sender, System.EventArgs e)
        
{
            System.Text.StringBuilder oStringBuilder; 
            oStringBuilder 
= new System.Text.StringBuilder(); 
            oStringBuilder.Append(Microsoft.VisualBasic.ControlChars.CrLf 
+ "<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">"); 
            oStringBuilder.Append(Microsoft.VisualBasic.ControlChars.CrLf 
+ "<HTML>"); 
            oStringBuilder.Append(Microsoft.VisualBasic.ControlChars.CrLf 
+ "  <HEAD>"); 
            oStringBuilder.Append(Microsoft.VisualBasic.ControlChars.CrLf 
+ "    <title>WebForm1</title>"); 
            oStringBuilder.Append(Microsoft.VisualBasic.ControlChars.CrLf 
+ "    <meta name="GENERATOR" content="Microsoft Visual Studio .NET 7.1">"); 
            oStringBuilder.Append(Microsoft.VisualBasic.ControlChars.CrLf 
+ "    <meta name="CODE_LANGUAGE" content="Visual Basic .NET 7.1">"); 
            oStringBuilder.Append(Microsoft.VisualBasic.ControlChars.CrLf 
+ "    <meta name="vs_defaultClientScript" content="JavaScript">"); 
            oStringBuilder.Append(Microsoft.VisualBasic.ControlChars.CrLf 
+ "    <meta name="vs_targetSchema" content="http://schemas.microsoft.com/intellisense/ie5">"); 
            oStringBuilder.Append(Microsoft.VisualBasic.ControlChars.CrLf 
+ "  </HEAD>"); 
            oStringBuilder.Append(Microsoft.VisualBasic.ControlChars.CrLf 
+ "  <body MS_POSITIONING="GridLayout">"); 
            oStringBuilder.Append(Microsoft.VisualBasic.ControlChars.CrLf 
+ "    <form id="Form1" method="post" runat="server">"); 
            oStringBuilder.Append(Microsoft.VisualBasic.ControlChars.CrLf 
+ "      <FONT face="宋体">测试</FONT>"); 
            oStringBuilder.Append(Microsoft.VisualBasic.ControlChars.CrLf 
+ "    </form>"); 
            oStringBuilder.Append(Microsoft.VisualBasic.ControlChars.CrLf 
+ "  </body>"); 
            oStringBuilder.Append(Microsoft.VisualBasic.ControlChars.CrLf 
+ "</HTML>"); 
            Response.Write(ParseTags(oStringBuilder.ToString()));
        }

输出结果为:
WebForm1 测试 

posted on 2004-09-25 11:20  小牛哥  阅读(5500)  评论(10)  编辑  收藏  举报

导航