几个用于解析 HTML 的 C# 类
命名空间里有两个类：1. HtmlUtil；2. Htmlpage。两个类都引用了别人写好的代码，我在此基础上做了不少改动。一开始是用 HtmlUtil 解析网页，它使用正则表达式解析 HTML，后来发现某些情况下解析效果不太好。之后在 SourceForge 上发现了 MIL HTML Parser（命名空间 MIL.Html），拿过来用了一下，效果还不错。
1using System;
2using System.Collections.Generic;
3using System.Text;
4using System.Text.RegularExpressions;
5using MIL.Html;
6
7namespace Yuanso.Sitework.Crawler
8{
/// <summary>
/// Regular-expression based helpers that pull plain text and the page title
/// out of a raw HTML string.
/// </summary>
/// <remarks>
/// ExtractContent is based on a routine by Zhang Liu (1 Jun 2006, version 1.0),
/// originally published at http://www.mybask.net.
/// </remarks>
public class HtmlUtil
{
    /// <summary>Value returned by ExtractTitle when no title element is found.</summary>
    private const string DefaultTitle = "无标题";

    // Pattern / replacement pairs applied by ExtractContent, in this order.
    // Order matters: script and style blocks go first so their bodies never
    // reach the generic tag rule, and whole comments are removed before the
    // generic tag rule so a '>' inside a comment cannot cut the comment short.
    private static readonly string[,] ContentRules =
    {
        { @"<%=[\w\W]*?%>", "" },
        { @"<script[\w\W]*?</script>", "" },
        { @"<style[\w\W]*?</style>", "" },
        { @"<!--[\w\W]*?-->", "" },
        { @"<[/]?[\w\W]*?>", "" },
        { @"([\r\n])[\s]+", "" },
        { @"&(nbsp|#160);", " " },
        { @"&(iexcl|#161);", "" },
        { @"&(cent|#162);", "" },
        { @"&(pound|#163);", "" },
        { @"&(copy|#169);", "" },
        { @"&#(\d+);", "" },
        { @"-->", "" },
        { @"<!--.*\n", "" }
    };

    // Regex objects are built once and reused on every call.
    private static readonly Regex[] ContentRegexes = CompileContentRules();

    private static readonly Regex ScriptBlock =
        new Regex(@"<script[^>]*>[\w\W]*?</script[^>]*>", RegexOptions.IgnoreCase);
    private static readonly Regex StyleBlock =
        new Regex(@"<style[^>]*>[\w\W]*?</style[^>]*>", RegexOptions.IgnoreCase);
    private static readonly Regex SelectBlock =
        new Regex(@"<select[^>]*>[\w\W]*?</select[^>]*>", RegexOptions.IgnoreCase);

    // \b keeps this rule from matching <abbr>, <article>, <address>, ...
    private static readonly Regex AnchorBlock =
        new Regex(@"<a\b[^>]*>[\w\W]*?</a\b[^>]*>", RegexOptions.IgnoreCase);

    // The second alternative is the &nbsp; entity. The listing this class came
    // from showed a bare space there, which removed every space from the text
    // and glued neighbouring words together.
    private static readonly Regex TagOrNbsp =
        new Regex(@"(<[^>]+?>)|&nbsp;", RegexOptions.IgnoreCase);

    private static readonly Regex WhitespaceRun = new Regex(@"\s+");

    private static readonly Regex TitleElement =
        new Regex(@"<title[^>]*>(?<title>[\w\W]*?)</title[^>]*>", RegexOptions.IgnoreCase);

    /// <summary>Builds one case-insensitive Regex per row of ContentRules.</summary>
    private static Regex[] CompileContentRules()
    {
        Regex[] compiled = new Regex[ContentRules.GetLength(0)];
        for (int i = 0; i < compiled.Length; i++)
        {
            compiled[i] = new Regex(ContentRules[i, 0], RegexOptions.IgnoreCase);
        }
        return compiled;
    }

    /// <summary>
    /// Extracts the text content of an HTML document. Removes, in order:
    /// ASP &lt;%= %&gt; expressions, script blocks, style blocks, HTML comments,
    /// all remaining tags, line breaks followed by whitespace, and character
    /// entities (&amp;nbsp; becomes a single space, every other handled entity and
    /// every numeric entity is dropped). Finally any remaining "\r\n" pairs and
    /// tab characters are removed.
    /// </summary>
    /// <param name="strHtml">HTML markup to strip.</param>
    /// <returns>The stripped text; an empty string when <paramref name="strHtml"/> is null or empty.</returns>
    public static string ExtractContent(string strHtml)
    {
        if (string.IsNullOrEmpty(strHtml))
        {
            return string.Empty;
        }

        string strStripped = strHtml;
        for (int i = 0; i < ContentRegexes.Length; i++)
        {
            strStripped = ContentRegexes[i].Replace(strStripped, ContentRules[i, 1]);
        }

        // string.Replace returns a new string; the result has to be assigned,
        // otherwise the call has no effect.
        strStripped = strStripped.Replace("\r\n", "");
        strStripped = strStripped.Replace("\t", "");
        return strStripped;
    }

    /// <summary>
    /// Returns the inner text of the first title element, exactly as written
    /// (not trimmed). The element is matched case-insensitively, may carry
    /// attributes, and may span several lines.
    /// </summary>
    /// <param name="strHtml">HTML markup to search.</param>
    /// <returns>The title text, or "无标题" when there is no title element or the input is null or empty.</returns>
    public static string ExtractTitle(string strHtml)
    {
        if (string.IsNullOrEmpty(strHtml))
        {
            return DefaultTitle;
        }

        Match m = TitleElement.Match(strHtml);
        if (!m.Success)
        {
            return DefaultTitle;
        }

        // Take the captured inner text instead of deleting the literal strings
        // "<title>" and "</title>", which missed upper-case tags and tags with
        // attributes.
        return m.Groups["title"].Value;
    }

    /// <summary>
    /// Reduces a fragment of HTML to plain text: script, style and select
    /// elements are dropped together with their content, anchors are optionally
    /// dropped together with their link text, remaining tags and &amp;nbsp; entities
    /// are removed, and every run of whitespace is collapsed to a single space.
    /// </summary>
    /// <param name="instr">HTML markup.</param>
    /// <param name="firstN">
    /// Intended maximum number of characters to return. Currently ignored:
    /// truncation is disabled and the whole stripped text is returned.
    /// </param>
    /// <param name="withLink">true to keep the text inside links; false to remove anchors and their text.</param>
    /// <returns>The plain text; an empty string when <paramref name="instr"/> is null or empty.</returns>
    public static string getFirstNchar(string instr, int firstN, bool withLink)
    {
        if (string.IsNullOrEmpty(instr))
        {
            return string.Empty;
        }

        string strStripped = ScriptBlock.Replace(instr, "");
        strStripped = StyleBlock.Replace(strStripped, "");
        strStripped = SelectBlock.Replace(strStripped, "");
        if (!withLink)
        {
            strStripped = AnchorBlock.Replace(strStripped, "");
        }
        strStripped = TagOrNbsp.Replace(strStripped, "");
        strStripped = WhitespaceRun.Replace(strStripped, " ");
        return strStripped;
    }

    /// <summary>
    /// Returns the trimmed inner text of the first title element.
    /// </summary>
    /// <param name="strHtml">HTML markup to search.</param>
    /// <returns>The trimmed title, or an empty string when there is no title element or the input is null or empty.</returns>
    public static string getTitle(string strHtml)
    {
        if (string.IsNullOrEmpty(strHtml))
        {
            return "";
        }

        Match mc = TitleElement.Match(strHtml);
        return mc.Success ? mc.Groups["title"].Value.Trim() : "";
    }
}
/// <summary>
/// Text extraction that parses the markup with the MIL.Html parser
/// (HtmlDomainTreeParser) and reads the text nodes returned by
/// FindAllText(true).
/// </summary>
public class Htmlpage
{
    /// <summary>Reports whether the value contains a carriage return or a line feed.</summary>
    private static bool HasLineBreak(string value)
    {
        return value.Contains("\r") || value.Contains("\n");
    }

    /// <summary>
    /// Returns the text of the first text node whose text contains neither
    /// a carriage return nor a line feed.
    /// </summary>
    /// <param name="strHtml">HTML markup to parse.</param>
    /// <returns>The text of that node, or an empty string when no text node qualifies.</returns>
    public static string GetTitle(string strHtml)
    {
        HtmlParser htmlParser = new HtmlDomainTreeParser();
        MIL.Html.HtmlDocument parsed = htmlParser.Parse(strHtml);

        foreach (HtmlNode candidate in parsed.Nodes.FindAllText(true))
        {
            HtmlText piece = (HtmlText)candidate;
            if (HasLineBreak(piece.Text))
            {
                continue;
            }

            // First node without a line break wins; later nodes are not inspected.
            return piece.Text;
        }

        return string.Empty;
    }

    /// <summary>
    /// Concatenates, in the order returned by the parser, the text of every text
    /// node whose text contains neither a carriage return nor a line feed.
    /// </summary>
    /// <param name="strHtml">HTML markup to parse.</param>
    /// <returns>The concatenated text; an empty string when no text node qualifies.</returns>
    public static string GetContent(string strHtml)
    {
        HtmlParser htmlParser = new HtmlDomainTreeParser();
        MIL.Html.HtmlDocument parsed = htmlParser.Parse(strHtml);
        StringBuilder collected = new StringBuilder();

        foreach (HtmlNode candidate in parsed.Nodes.FindAllText(true))
        {
            string fragment = ((HtmlText)candidate).Text;
            if (!HasLineBreak(fragment))
            {
                collected.Append(fragment);
            }
        }

        return collected.ToString();
    }
}
147
148}
149
2using System.Collections.Generic;
3using System.Text;
4using System.Text.RegularExpressions;
5using MIL.Html;
6
7namespace Yuanso.Sitework.Crawler
8{
/// <summary>
/// Regular-expression based helpers that pull plain text and the page title
/// out of a raw HTML string.
/// </summary>
/// <remarks>
/// ExtractContent is based on a routine by Zhang Liu (1 Jun 2006, version 1.0),
/// originally published at http://www.mybask.net.
/// </remarks>
public class HtmlUtil
{
    /// <summary>Value returned by ExtractTitle when no title element is found.</summary>
    private const string DefaultTitle = "无标题";

    // Pattern / replacement pairs applied by ExtractContent, in this order.
    // Order matters: script and style blocks go first so their bodies never
    // reach the generic tag rule, and whole comments are removed before the
    // generic tag rule so a '>' inside a comment cannot cut the comment short.
    private static readonly string[,] ContentRules =
    {
        { @"<%=[\w\W]*?%>", "" },
        { @"<script[\w\W]*?</script>", "" },
        { @"<style[\w\W]*?</style>", "" },
        { @"<!--[\w\W]*?-->", "" },
        { @"<[/]?[\w\W]*?>", "" },
        { @"([\r\n])[\s]+", "" },
        { @"&(nbsp|#160);", " " },
        { @"&(iexcl|#161);", "" },
        { @"&(cent|#162);", "" },
        { @"&(pound|#163);", "" },
        { @"&(copy|#169);", "" },
        { @"&#(\d+);", "" },
        { @"-->", "" },
        { @"<!--.*\n", "" }
    };

    // Regex objects are built once and reused on every call.
    private static readonly Regex[] ContentRegexes = CompileContentRules();

    private static readonly Regex ScriptBlock =
        new Regex(@"<script[^>]*>[\w\W]*?</script[^>]*>", RegexOptions.IgnoreCase);
    private static readonly Regex StyleBlock =
        new Regex(@"<style[^>]*>[\w\W]*?</style[^>]*>", RegexOptions.IgnoreCase);
    private static readonly Regex SelectBlock =
        new Regex(@"<select[^>]*>[\w\W]*?</select[^>]*>", RegexOptions.IgnoreCase);

    // \b keeps this rule from matching <abbr>, <article>, <address>, ...
    private static readonly Regex AnchorBlock =
        new Regex(@"<a\b[^>]*>[\w\W]*?</a\b[^>]*>", RegexOptions.IgnoreCase);

    // The second alternative is the &nbsp; entity. The listing this class came
    // from showed a bare space there, which removed every space from the text
    // and glued neighbouring words together.
    private static readonly Regex TagOrNbsp =
        new Regex(@"(<[^>]+?>)|&nbsp;", RegexOptions.IgnoreCase);

    private static readonly Regex WhitespaceRun = new Regex(@"\s+");

    private static readonly Regex TitleElement =
        new Regex(@"<title[^>]*>(?<title>[\w\W]*?)</title[^>]*>", RegexOptions.IgnoreCase);

    /// <summary>Builds one case-insensitive Regex per row of ContentRules.</summary>
    private static Regex[] CompileContentRules()
    {
        Regex[] compiled = new Regex[ContentRules.GetLength(0)];
        for (int i = 0; i < compiled.Length; i++)
        {
            compiled[i] = new Regex(ContentRules[i, 0], RegexOptions.IgnoreCase);
        }
        return compiled;
    }

    /// <summary>
    /// Extracts the text content of an HTML document. Removes, in order:
    /// ASP &lt;%= %&gt; expressions, script blocks, style blocks, HTML comments,
    /// all remaining tags, line breaks followed by whitespace, and character
    /// entities (&amp;nbsp; becomes a single space, every other handled entity and
    /// every numeric entity is dropped). Finally any remaining "\r\n" pairs and
    /// tab characters are removed.
    /// </summary>
    /// <param name="strHtml">HTML markup to strip.</param>
    /// <returns>The stripped text; an empty string when <paramref name="strHtml"/> is null or empty.</returns>
    public static string ExtractContent(string strHtml)
    {
        if (string.IsNullOrEmpty(strHtml))
        {
            return string.Empty;
        }

        string strStripped = strHtml;
        for (int i = 0; i < ContentRegexes.Length; i++)
        {
            strStripped = ContentRegexes[i].Replace(strStripped, ContentRules[i, 1]);
        }

        // string.Replace returns a new string; the result has to be assigned,
        // otherwise the call has no effect.
        strStripped = strStripped.Replace("\r\n", "");
        strStripped = strStripped.Replace("\t", "");
        return strStripped;
    }

    /// <summary>
    /// Returns the inner text of the first title element, exactly as written
    /// (not trimmed). The element is matched case-insensitively, may carry
    /// attributes, and may span several lines.
    /// </summary>
    /// <param name="strHtml">HTML markup to search.</param>
    /// <returns>The title text, or "无标题" when there is no title element or the input is null or empty.</returns>
    public static string ExtractTitle(string strHtml)
    {
        if (string.IsNullOrEmpty(strHtml))
        {
            return DefaultTitle;
        }

        Match m = TitleElement.Match(strHtml);
        if (!m.Success)
        {
            return DefaultTitle;
        }

        // Take the captured inner text instead of deleting the literal strings
        // "<title>" and "</title>", which missed upper-case tags and tags with
        // attributes.
        return m.Groups["title"].Value;
    }

    /// <summary>
    /// Reduces a fragment of HTML to plain text: script, style and select
    /// elements are dropped together with their content, anchors are optionally
    /// dropped together with their link text, remaining tags and &amp;nbsp; entities
    /// are removed, and every run of whitespace is collapsed to a single space.
    /// </summary>
    /// <param name="instr">HTML markup.</param>
    /// <param name="firstN">
    /// Intended maximum number of characters to return. Currently ignored:
    /// truncation is disabled and the whole stripped text is returned.
    /// </param>
    /// <param name="withLink">true to keep the text inside links; false to remove anchors and their text.</param>
    /// <returns>The plain text; an empty string when <paramref name="instr"/> is null or empty.</returns>
    public static string getFirstNchar(string instr, int firstN, bool withLink)
    {
        if (string.IsNullOrEmpty(instr))
        {
            return string.Empty;
        }

        string strStripped = ScriptBlock.Replace(instr, "");
        strStripped = StyleBlock.Replace(strStripped, "");
        strStripped = SelectBlock.Replace(strStripped, "");
        if (!withLink)
        {
            strStripped = AnchorBlock.Replace(strStripped, "");
        }
        strStripped = TagOrNbsp.Replace(strStripped, "");
        strStripped = WhitespaceRun.Replace(strStripped, " ");
        return strStripped;
    }

    /// <summary>
    /// Returns the trimmed inner text of the first title element.
    /// </summary>
    /// <param name="strHtml">HTML markup to search.</param>
    /// <returns>The trimmed title, or an empty string when there is no title element or the input is null or empty.</returns>
    public static string getTitle(string strHtml)
    {
        if (string.IsNullOrEmpty(strHtml))
        {
            return "";
        }

        Match mc = TitleElement.Match(strHtml);
        return mc.Success ? mc.Groups["title"].Value.Trim() : "";
    }
}
/// <summary>
/// Text extraction that parses the markup with the MIL.Html parser
/// (HtmlDomainTreeParser) and reads the text nodes returned by
/// FindAllText(true).
/// </summary>
public class Htmlpage
{
    /// <summary>Reports whether the value contains a carriage return or a line feed.</summary>
    private static bool HasLineBreak(string value)
    {
        return value.Contains("\r") || value.Contains("\n");
    }

    /// <summary>
    /// Returns the text of the first text node whose text contains neither
    /// a carriage return nor a line feed.
    /// </summary>
    /// <param name="strHtml">HTML markup to parse.</param>
    /// <returns>The text of that node, or an empty string when no text node qualifies.</returns>
    public static string GetTitle(string strHtml)
    {
        HtmlParser htmlParser = new HtmlDomainTreeParser();
        MIL.Html.HtmlDocument parsed = htmlParser.Parse(strHtml);

        foreach (HtmlNode candidate in parsed.Nodes.FindAllText(true))
        {
            HtmlText piece = (HtmlText)candidate;
            if (HasLineBreak(piece.Text))
            {
                continue;
            }

            // First node without a line break wins; later nodes are not inspected.
            return piece.Text;
        }

        return string.Empty;
    }

    /// <summary>
    /// Concatenates, in the order returned by the parser, the text of every text
    /// node whose text contains neither a carriage return nor a line feed.
    /// </summary>
    /// <param name="strHtml">HTML markup to parse.</param>
    /// <returns>The concatenated text; an empty string when no text node qualifies.</returns>
    public static string GetContent(string strHtml)
    {
        HtmlParser htmlParser = new HtmlDomainTreeParser();
        MIL.Html.HtmlDocument parsed = htmlParser.Parse(strHtml);
        StringBuilder collected = new StringBuilder();

        foreach (HtmlNode candidate in parsed.Nodes.FindAllText(true))
        {
            string fragment = ((HtmlText)candidate).Text;
            if (!HasLineBreak(fragment))
            {
                collected.Append(fragment);
            }
        }

        return collected.ToString();
    }
}
147
148}
149