将Html代码转换为Text
在抓取 HTML 页面时,需要过滤掉其中的 HTML 标签,只保留源代码中的文本(Text)。可以使用正则表达式解决这个问题:
VB.NET
C#
提供一简单示例如下:
VB.NET
C#
输出结果为:
VB.NET
''' -----------------------------------------------------------------------------
''' <summary>
''' Removes every HTML tag from the supplied markup.
''' </summary>
''' <param name="HTML">HTML source text.</param>
''' <returns>
''' The input with every substring matching the pattern <c>&lt;[^&gt;]*&gt;</c>
''' replaced by an empty string.
''' </returns>
''' <remarks>
''' Only the tags themselves are removed; text between tags (including the
''' content of elements such as title) is kept, and character entities are
''' not decoded.
''' </remarks>
''' <history>
''' [Administrator] 2004-9-25 Created
''' </history>
''' -----------------------------------------------------------------------------
Public Function ParseTags(ByVal HTML As String) As String
    ' Regex.Replace is a Shared method, so it is called on the type itself
    ' instead of through an uninitialised Regex variable.
    Return System.Text.RegularExpressions.Regex.Replace(HTML, "<[^>]*>", "")
End Function
''' -----------------------------------------------------------------------------
''' <summary>
''' Strips all HTML tags out of a piece of markup and returns what is left.
''' </summary>
''' <param name="HTML">HTML source text.</param>
''' <returns>
''' A copy of the input in which each match of <c>&lt;[^&gt;]*&gt;</c> has been
''' replaced by an empty string.
''' </returns>
''' <remarks>
''' Text located between tags is left untouched.
''' </remarks>
''' <history>
''' [Administrator] 2004-9-25 Created
''' </history>
''' -----------------------------------------------------------------------------
Public Function ParseTags(ByVal HTML As String) As String
    ' Call the Shared Regex.Replace directly on the type; no Regex instance
    ' is required for a one-off replacement.
    Return System.Text.RegularExpressions.Regex.Replace(HTML, "<[^>]*>", "")
End Function
C#
/// <summary>
/// Removes every HTML tag from the supplied markup.
/// </summary>
/// <param name="HTML">HTML source text.</param>
/// <returns>
/// The input with every substring matching <c>&lt;[^&gt;]*&gt;</c> replaced by
/// an empty string.
/// </returns>
public string ParseTags(string HTML)
{
    // A tag is a '<', any run of characters other than '>', then '>'.
    const string tagPattern = "<[^>]*>";

    string textOnly = System.Text.RegularExpressions.Regex.Replace(HTML, tagPattern, string.Empty);
    return textOnly;
}
/// <summary>
/// Strips all HTML tags out of a piece of markup and returns what is left.
/// </summary>
/// <param name="HTML">HTML source text.</param>
/// <returns>
/// A copy of the input in which each match of <c>&lt;[^&gt;]*&gt;</c> has been
/// replaced by an empty string.
/// </returns>
public string ParseTags(string HTML)
{
    // Everything from a '<' up to and including the next '>' is treated as a tag.
    string anyTag = "<[^>]*>";
    string replacement = "";
    return System.Text.RegularExpressions.Regex.Replace(HTML, anyTag, replacement);
}
VB.NET
' Page load handler for the demo: assembles a sample WebForm HTML document in
' memory, passes it through ParseTags and writes the result to the response.
Private Sub Page_Load(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles MyBase.Load
    Dim markup As New System.Text.StringBuilder

    With markup
        ' Each fragment is preceded by a CR/LF pair, so the text starts with a line break.
        .Append(ControlChars.CrLf).Append("<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Transitional//EN"">")
        .Append(ControlChars.CrLf).Append("<HTML>")
        .Append(ControlChars.CrLf).Append(" <HEAD>")
        .Append(ControlChars.CrLf).Append(" <title>WebForm1</title>")
        .Append(ControlChars.CrLf).Append(" <meta name=""GENERATOR"" content=""Microsoft Visual Studio .NET 7.1"">")
        .Append(ControlChars.CrLf).Append(" <meta name=""CODE_LANGUAGE"" content=""Visual Basic .NET 7.1"">")
        .Append(ControlChars.CrLf).Append(" <meta name=""vs_defaultClientScript"" content=""JavaScript"">")
        .Append(ControlChars.CrLf).Append(" <meta name=""vs_targetSchema"" content=""http://schemas.microsoft.com/intellisense/ie5"">")
        .Append(ControlChars.CrLf).Append(" </HEAD>")
        .Append(ControlChars.CrLf).Append(" <body MS_POSITIONING=""GridLayout"">")
        .Append(ControlChars.CrLf).Append(" <form id=""Form1"" method=""post"" runat=""server"">")
        .Append(ControlChars.CrLf).Append(" <FONT face=""宋体"">测试</FONT>")
        .Append(ControlChars.CrLf).Append(" </form>")
        .Append(ControlChars.CrLf).Append(" </body>")
        .Append(ControlChars.CrLf).Append("</HTML>")
    End With

    ' Only the text outside the tags reaches the browser.
    Response.Write(ParseTags(markup.ToString()))
End Sub
' Page load handler for the demo: builds a sample WebForm HTML document in a
' StringBuilder, strips its tags with ParseTags and writes the remaining text
' to the response.
Private Sub Page_Load(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles MyBase.Load
    Dim oStringBuilder As System.Text.StringBuilder
    oStringBuilder = New System.Text.StringBuilder
    ' Every line of the sample document is prefixed with a CR/LF pair.
    oStringBuilder.Append(ControlChars.CrLf + "<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Transitional//EN"">")
    oStringBuilder.Append(ControlChars.CrLf + "<HTML>")
    oStringBuilder.Append(ControlChars.CrLf + " <HEAD>")
    oStringBuilder.Append(ControlChars.CrLf + " <title>WebForm1</title>")
    oStringBuilder.Append(ControlChars.CrLf + " <meta name=""GENERATOR"" content=""Microsoft Visual Studio .NET 7.1"">")
    oStringBuilder.Append(ControlChars.CrLf + " <meta name=""CODE_LANGUAGE"" content=""Visual Basic .NET 7.1"">")
    oStringBuilder.Append(ControlChars.CrLf + " <meta name=""vs_defaultClientScript"" content=""JavaScript"">")
    oStringBuilder.Append(ControlChars.CrLf + " <meta name=""vs_targetSchema"" content=""http://schemas.microsoft.com/intellisense/ie5"">")
    oStringBuilder.Append(ControlChars.CrLf + " </HEAD>")
    oStringBuilder.Append(ControlChars.CrLf + " <body MS_POSITIONING=""GridLayout"">")
    oStringBuilder.Append(ControlChars.CrLf + " <form id=""Form1"" method=""post"" runat=""server"">")
    oStringBuilder.Append(ControlChars.CrLf + " <FONT face=""宋体"">测试</FONT>")
    oStringBuilder.Append(ControlChars.CrLf + " </form>")
    oStringBuilder.Append(ControlChars.CrLf + " </body>")
    oStringBuilder.Append(ControlChars.CrLf + "</HTML>")
    ' Write the document with all tags removed.
    Response.Write(ParseTags(oStringBuilder.ToString))
End Sub
C#
/// <summary>
/// Page load handler for the demo: builds a sample WebForm HTML document in a
/// StringBuilder, strips its tags with ParseTags and writes the remaining
/// text to the response.
/// </summary>
/// <param name="sender">Event source; not used by this handler.</param>
/// <param name="e">Event data; not used by this handler.</param>
private void Page_Load(object sender, System.EventArgs e)
{
    // Same value as Microsoft.VisualBasic.ControlChars.CrLf, without needing
    // a reference to the Visual Basic runtime assembly.
    const string CrLf = "\r\n";

    System.Text.StringBuilder oStringBuilder = new System.Text.StringBuilder();

    // Double quotes inside the markup are escaped so each literal is a valid C# string.
    oStringBuilder.Append(CrLf + "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">");
    oStringBuilder.Append(CrLf + "<HTML>");
    oStringBuilder.Append(CrLf + " <HEAD>");
    oStringBuilder.Append(CrLf + " <title>WebForm1</title>");
    oStringBuilder.Append(CrLf + " <meta name=\"GENERATOR\" content=\"Microsoft Visual Studio .NET 7.1\">");
    oStringBuilder.Append(CrLf + " <meta name=\"CODE_LANGUAGE\" content=\"Visual Basic .NET 7.1\">");
    oStringBuilder.Append(CrLf + " <meta name=\"vs_defaultClientScript\" content=\"JavaScript\">");
    oStringBuilder.Append(CrLf + " <meta name=\"vs_targetSchema\" content=\"http://schemas.microsoft.com/intellisense/ie5\">");
    oStringBuilder.Append(CrLf + " </HEAD>");
    oStringBuilder.Append(CrLf + " <body MS_POSITIONING=\"GridLayout\">");
    oStringBuilder.Append(CrLf + " <form id=\"Form1\" method=\"post\" runat=\"server\">");
    oStringBuilder.Append(CrLf + " <FONT face=\"宋体\">测试</FONT>");
    oStringBuilder.Append(CrLf + " </form>");
    oStringBuilder.Append(CrLf + " </body>");
    oStringBuilder.Append(CrLf + "</HTML>");

    // Write the document with all tags removed.
    Response.Write(ParseTags(oStringBuilder.ToString()));
}
/// <summary>
/// Page load handler for the demo: assembles a sample WebForm HTML document in
/// memory, passes it through ParseTags and writes the result to the response.
/// </summary>
/// <param name="sender">Event source; not used by this handler.</param>
/// <param name="e">Event data; not used by this handler.</param>
private void Page_Load(object sender, System.EventArgs e)
{
    // Same value as Microsoft.VisualBasic.ControlChars.CrLf, without needing
    // a reference to the Visual Basic runtime assembly.
    const string CrLf = "\r\n";

    System.Text.StringBuilder oStringBuilder = new System.Text.StringBuilder();

    // Each fragment is preceded by a line break; embedded double quotes are escaped.
    oStringBuilder.Append(CrLf).Append("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">");
    oStringBuilder.Append(CrLf).Append("<HTML>");
    oStringBuilder.Append(CrLf).Append(" <HEAD>");
    oStringBuilder.Append(CrLf).Append(" <title>WebForm1</title>");
    oStringBuilder.Append(CrLf).Append(" <meta name=\"GENERATOR\" content=\"Microsoft Visual Studio .NET 7.1\">");
    oStringBuilder.Append(CrLf).Append(" <meta name=\"CODE_LANGUAGE\" content=\"Visual Basic .NET 7.1\">");
    oStringBuilder.Append(CrLf).Append(" <meta name=\"vs_defaultClientScript\" content=\"JavaScript\">");
    oStringBuilder.Append(CrLf).Append(" <meta name=\"vs_targetSchema\" content=\"http://schemas.microsoft.com/intellisense/ie5\">");
    oStringBuilder.Append(CrLf).Append(" </HEAD>");
    oStringBuilder.Append(CrLf).Append(" <body MS_POSITIONING=\"GridLayout\">");
    oStringBuilder.Append(CrLf).Append(" <form id=\"Form1\" method=\"post\" runat=\"server\">");
    oStringBuilder.Append(CrLf).Append(" <FONT face=\"宋体\">测试</FONT>");
    oStringBuilder.Append(CrLf).Append(" </form>");
    oStringBuilder.Append(CrLf).Append(" </body>");
    oStringBuilder.Append(CrLf).Append("</HTML>");

    // Only the text outside the tags reaches the browser.
    Response.Write(ParseTags(oStringBuilder.ToString()));
}
输出结果为:
WebForm1 测试