Html2Text C# Source
using System;
public class Html2Text
{
public static void Main(string[] args)
{
if (args.Length < 1)
{
Console.WriteLine("usage: Htm2Txt xxxx.htm [target.txt]");
}
else
{
Console.WriteLine(StripHTML("<html><head><title>测试</title></head><body>哈哈</body></html>"));
}
char name1=Convert.ToChar("");
Console.WriteLine("'" + name1 + "'");
}
private static string StripHTML(string source)
{
try
{
string result;
// Remove HTML Development formatting
result = source.Replace("\r", " "); // Replace line breaks with space because browsers inserts space
result = result.Replace("\n", " "); // Replace line breaks with space because browsers inserts space
result = result.Replace("\t", string.Empty); // Remove step-formatting
result = System.Text.RegularExpressions.Regex.Replace(result, @"( )+", " "); // Remove repeating speces becuase browsers ignore them
// Remove the header (prepare first by clearing attributes)
result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*head([^>])*>","<head>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, @"(<( )*(/)( )*head( )*>)","</head>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, "(<head>).*(</head>)",string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// remove all scripts (prepare first by clearing attributes)
result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*script([^>])*>","<script>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, @"(<( )*(/)( )*script( )*>)","</script>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
//result = System.Text.RegularExpressions.Regex.Replace(result, @"(<script>)([^(<script>\.</script>)])*(</script>)",string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, @"(<script>).*(</script>)",string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// remove all styles (prepare first by clearing attributes)
result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*style([^>])*>","<style>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, @"(<( )*(/)( )*style( )*>)","</style>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, "(<style>).*(</style>)",string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// insert tabs in spaces of <td> tags
result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*td([^>])*>","\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// insert line breaks in places of <BR> and <LI> tags
result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*br( )*>","\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*li( )*>","\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// insert line paragraphs (double line breaks) in place if <P>, <DIV> and <TR> tags
result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*div([^>])*>","\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*tr([^>])*>","\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*p([^>])*>","\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Remove remaining tags like <a>, links, images, comments etc - anything thats enclosed inside < >
result = System.Text.RegularExpressions.Regex.Replace(result, @"<[^>]*>",string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// replace special characters:
result = System.Text.RegularExpressions.Regex.Replace(result, @" "," ", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, @"•"," * ", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, @"‹","<", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, @"›",">", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, @"™","(tm)", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, @"⁄","/", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, @"<","<", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, @">",">", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, @"©","(c)", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, @"®","(r)", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Remove all others. More can be added, see http://hotwired.lycos.com/webmonkey/reference/special_characters/
result = System.Text.RegularExpressions.Regex.Replace(result, @"&(.{2,6});",string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// for testng
//System.Text.RegularExpressions.Regex.Replace(result, this.txtRegex.Text,string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// make line breaking consistent
result = result.Replace("\n", "\r");
// Remove extra line breaks and tabs: replace over 2 breaks with 2 and over 4 tabs with 4.
// Prepare first to remove any whitespaces inbetween the escaped characters and remove redundant tabs inbetween linebreaks
result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)( )+(\r)","\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, "(\t)( )+(\t)","\t\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, "(\t)( )+(\r)","\t\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)( )+(\t)","\r\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)(\t)+(\r)","\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // Remove redundant tabs
result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)(\t)+","\r\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // Remove multible tabs followind a linebreak with just one tab
string breaks = "\r\r\r"; // Initial replacement target string for linebreaks
string tabs = "\t\t\t\t\t"; // Initial replacement target string for tabs
for (int index=0; index<result.Length; index++)
{
result = result.Replace(breaks, "\r\r");
result = result.Replace(tabs, "\t\t\t\t");
breaks = breaks + "\r";
tabs = tabs + "\t";
}
return result;
}
catch
{
}
return source;
}
};
public class Html2Text
{
public static void Main(string[] args)
{
if (args.Length < 1)
{
Console.WriteLine("usage: Htm2Txt xxxx.htm [target.txt]");
}
else
{
Console.WriteLine(StripHTML("<html><head><title>测试</title></head><body>哈哈</body></html>"));
}
char name1=Convert.ToChar("");
Console.WriteLine("'" + name1 + "'");
}
private static string StripHTML(string source)
{
try
{
string result;
// Remove HTML Development formatting
result = source.Replace("\r", " "); // Replace line breaks with space because browsers inserts space
result = result.Replace("\n", " "); // Replace line breaks with space because browsers inserts space
result = result.Replace("\t", string.Empty); // Remove step-formatting
result = System.Text.RegularExpressions.Regex.Replace(result, @"( )+", " "); // Remove repeating speces becuase browsers ignore them
// Remove the header (prepare first by clearing attributes)
result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*head([^>])*>","<head>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, @"(<( )*(/)( )*head( )*>)","</head>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, "(<head>).*(</head>)",string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// remove all scripts (prepare first by clearing attributes)
result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*script([^>])*>","<script>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, @"(<( )*(/)( )*script( )*>)","</script>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
//result = System.Text.RegularExpressions.Regex.Replace(result, @"(<script>)([^(<script>\.</script>)])*(</script>)",string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, @"(<script>).*(</script>)",string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// remove all styles (prepare first by clearing attributes)
result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*style([^>])*>","<style>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, @"(<( )*(/)( )*style( )*>)","</style>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, "(<style>).*(</style>)",string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// insert tabs in spaces of <td> tags
result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*td([^>])*>","\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// insert line breaks in places of <BR> and <LI> tags
result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*br( )*>","\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*li( )*>","\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// insert line paragraphs (double line breaks) in place if <P>, <DIV> and <TR> tags
result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*div([^>])*>","\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*tr([^>])*>","\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*p([^>])*>","\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Remove remaining tags like <a>, links, images, comments etc - anything thats enclosed inside < >
result = System.Text.RegularExpressions.Regex.Replace(result, @"<[^>]*>",string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// replace special characters:
result = System.Text.RegularExpressions.Regex.Replace(result, @" "," ", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, @"•"," * ", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, @"‹","<", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, @"›",">", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, @"™","(tm)", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, @"⁄","/", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, @"<","<", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, @">",">", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, @"©","(c)", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, @"®","(r)", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Remove all others. More can be added, see http://hotwired.lycos.com/webmonkey/reference/special_characters/
result = System.Text.RegularExpressions.Regex.Replace(result, @"&(.{2,6});",string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// for testng
//System.Text.RegularExpressions.Regex.Replace(result, this.txtRegex.Text,string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// make line breaking consistent
result = result.Replace("\n", "\r");
// Remove extra line breaks and tabs: replace over 2 breaks with 2 and over 4 tabs with 4.
// Prepare first to remove any whitespaces inbetween the escaped characters and remove redundant tabs inbetween linebreaks
result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)( )+(\r)","\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, "(\t)( )+(\t)","\t\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, "(\t)( )+(\r)","\t\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)( )+(\t)","\r\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)(\t)+(\r)","\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // Remove redundant tabs
result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)(\t)+","\r\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // Remove multible tabs followind a linebreak with just one tab
string breaks = "\r\r\r"; // Initial replacement target string for linebreaks
string tabs = "\t\t\t\t\t"; // Initial replacement target string for tabs
for (int index=0; index<result.Length; index++)
{
result = result.Replace(breaks, "\r\r");
result = result.Replace(tabs, "\t\t\t\t");
breaks = breaks + "\r";
tabs = tabs + "\t";
}
return result;
}
catch
{
}
return source;
}
};