几个C#关于Html解析的类

命名空间里有两个类：1. HtmlUtil；2. Htmlpage。两者都基于别人写好的类，我在其中做了不少改动。一开始用 HtmlUtil 解析网页，它使用正则表达式解析 HTML，后来发现某些情况下解析得不太好。之后在 SourceForge 上发现了 MIL HTML Parser（即代码中引用的 MIL.Html），拿来用了一下，效果还不错。

using System;
2

using System.Collections.Generic;
3

using System.Text;
4

using System.Text.RegularExpressions;
5

using MIL.Html;
6

namespace Yuanso.Sitework.Crawler
8

{
9

/// <summary>
/// Regular-expression based helpers for pulling plain text and the title out of an HTML string.
/// </summary>
public class HtmlUtil
{
    /// <summary>Value returned by <see cref="ExtractTitle"/> when no title element is found.</summary>
    private const string DefaultTitle = "无标题";

    /// <summary>Options shared by the tag-matching expressions used by getFirstNchar and the title lookups.</summary>
    private const RegexOptions TagOptions = RegexOptions.Multiline | RegexOptions.IgnoreCase;

    // The Regex instances are built once and shared; previously every call re-parsed every pattern.

    /// <summary>
    /// Patterns applied, in order, by <see cref="ExtractContent"/>. Each entry is paired with the
    /// replacement at the same index in <see cref="ContentReplacements"/>.
    /// </summary>
    private static readonly Regex[] ContentPatterns =
    {
        new Regex(@"<%=[\w\W]*?%>", RegexOptions.IgnoreCase),            // <%= ... %> blocks
        new Regex(@"<script[\w\W]*?</script>", RegexOptions.IgnoreCase), // script elements
        new Regex(@"<style[\w\W]*?</style>", RegexOptions.IgnoreCase),   // style elements
        new Regex(@"<[/]?[\w\W]*?>", RegexOptions.IgnoreCase),           // any remaining tag
        new Regex(@"([\r\n])[\s]+", RegexOptions.IgnoreCase),            // line break followed by whitespace
        new Regex(@"&#(\d+);", RegexOptions.IgnoreCase),                 // numeric character references
        new Regex(@"-->", RegexOptions.IgnoreCase),                      // left-over comment terminator
        new Regex(@"<!--.*\n", RegexOptions.IgnoreCase)                  // comment opener up to end of line
    };

    /// <summary>Replacement text for each entry of <see cref="ContentPatterns"/> (same length, same order).</summary>
    private static readonly string[] ContentReplacements = { "", "", "", "", "", " ", "", "" };

    /// <summary>Matches a title element and captures its inner text in the group named "title".</summary>
    private static readonly Regex TitleElement =
        new Regex(@"<title[^>]*>(?<title>[\w\W]*?)</title[^>]*>", TagOptions);

    private static readonly Regex ScriptBlock =
        new Regex(@"<script[^>]*>[\w\W]*?</script[^>]*>", TagOptions);

    private static readonly Regex StyleBlock =
        new Regex(@"<style[^>]*>[\w\W]*?</style[^>]*>", TagOptions);

    private static readonly Regex SelectBlock =
        new Regex(@"<select[^>]*>[\w\W]*?</select[^>]*>", TagOptions);

    // \b keeps this from matching other elements whose name merely starts with "a"
    // (abbr, address, article, ...), whose text the earlier pattern deleted as well.
    private static readonly Regex AnchorBlock =
        new Regex(@"<a\b[^>]*>[\w\W]*?</a\b[^>]*>", TagOptions);

    // NOTE(review): the second alternative is a literal blank exactly as found in the source.
    // It may originally have been an "&nbsp;" entity that was decoded when the code was
    // published - confirm before changing it. As written it removes every blank outside a tag.
    private static readonly Regex TagOrBlank =
        new Regex("(<[^>]+?>)| ", TagOptions);

    private static readonly Regex WhitespaceRun =
        new Regex("(\\s)+", TagOptions);

    /// <summary>
    /// Picks up the text content of an HTML document.
    /// Original routine written by Zhang Liu (China), 1 Jun 2006, version 1.0,
    /// support: MYBASK, http://www.mybask.net
    /// The following are removed, in this order:
    /// 1. &lt;%= ... %&gt; blocks
    /// 2. script elements
    /// 3. style elements
    /// 4. all remaining HTML tags
    /// 5. a line break together with the whitespace that follows it
    /// 6. numeric character references such as &amp;#160; (replaced by a single space)
    /// 7. HTML comment remnants ("--&gt;" and "&lt;!--" up to the end of the line)
    /// Finally every remaining "\r\n" pair and every tab character is deleted.
    /// </summary>
    /// <param name="strHtml">HTML text to strip; null or empty yields an empty string.</param>
    /// <returns>The stripped text.</returns>
    public static string ExtractContent(string strHtml)
    {
        if (string.IsNullOrEmpty(strHtml))
        {
            return string.Empty;
        }

        string stripped = strHtml;
        for (int i = 0; i < ContentPatterns.Length; i++)
        {
            stripped = ContentPatterns[i].Replace(stripped, ContentReplacements[i]);
        }

        // string.Replace returns a new string. The results used to be thrown away,
        // so this final clean-up silently never happened.
        stripped = stripped.Replace("\r\n", "");
        stripped = stripped.Replace("\t", "");

        return stripped;
    }

    /// <summary>
    /// Returns the inner text of the first title element, without trimming it.
    /// The element is matched case-insensitively, may carry attributes and may span several lines.
    /// </summary>
    /// <param name="strHtml">HTML text to search; may be null or empty.</param>
    /// <returns>The title text, or "无标题" when no title element is found.</returns>
    public static string ExtractTitle(string strHtml)
    {
        if (string.IsNullOrEmpty(strHtml))
        {
            return DefaultTitle;
        }

        Match match = TitleElement.Match(strHtml);

        // Use the captured group instead of stripping the literal "<title>" / "</title>" strings,
        // which failed for upper-case tags and for tags with attributes.
        return match.Success ? match.Groups["title"].Value : DefaultTitle;
    }

    /// <summary>
    /// Extracts plain text from a fragment of HTML: script, style and select elements are dropped
    /// together with their content, the remaining tags are removed and every run of whitespace is
    /// collapsed into a single space.
    /// </summary>
    /// <param name="instr">HTML text; null or empty yields an empty string.</param>
    /// <param name="firstN">
    /// Intended number of leading characters to keep. It is currently NOT applied: the truncation
    /// was already disabled in the original implementation, so the full text is returned.
    /// </param>
    /// <param name="withLink">When false, anchor elements are removed together with their text.</param>
    /// <returns>The plain text.</returns>
    public static string getFirstNchar(string instr, int firstN, bool withLink)
    {
        if (string.IsNullOrEmpty(instr))
        {
            return string.Empty;
        }

        string text = ScriptBlock.Replace(instr, "");
        text = StyleBlock.Replace(text, "");
        text = SelectBlock.Replace(text, "");

        if (!withLink)
        {
            text = AnchorBlock.Replace(text, "");
        }

        text = TagOrBlank.Replace(text, "");
        return WhitespaceRun.Replace(text, " ");
    }

    /// <summary>
    /// Returns the trimmed inner text of the first title element.
    /// </summary>
    /// <param name="strHtml">HTML text to search; may be null or empty.</param>
    /// <returns>The trimmed title, or an empty string when no title element is found.</returns>
    public static string getTitle(string strHtml)
    {
        if (string.IsNullOrEmpty(strHtml))
        {
            return string.Empty;
        }

        Match match = TitleElement.Match(strHtml);
        return match.Success ? match.Groups["title"].Value.Trim() : string.Empty;
    }
}
104

/// <summary>
/// Text extraction helpers built on the MIL.Html parser instead of regular expressions.
/// </summary>
public class Htmlpage
{
    /// <summary>
    /// Parses the HTML and returns the text of the first text node whose text contains neither
    /// a carriage return nor a line feed. No lookup of a title element is performed here; the
    /// result is simply the first single-line text node reported by the parser.
    /// </summary>
    /// <param name="strHtml">HTML text handed to the parser.</param>
    /// <returns>The text of the first single-line text node, or an empty string if there is none.</returns>
    public static string GetTitle(string strHtml)
    {
        HtmlParser domParser = new HtmlDomainTreeParser();
        MIL.Html.HtmlDocument parsed = domParser.Parse(strHtml);

        // NOTE(review): the meaning of the boolean passed to FindAllText is not visible here
        // (presumably "search recursively") - confirm against the MIL.Html sources.
        foreach (HtmlNode current in parsed.Nodes.FindAllText(true))
        {
            string fragment = ((HtmlText)current).Text;
            if (HasLineBreak(fragment))
            {
                continue;
            }

            // Only the first qualifying node is used.
            return fragment;
        }

        return string.Empty;
    }

    /// <summary>
    /// Parses the HTML and concatenates, in the order the parser reports them, the texts of all
    /// text nodes that contain neither a carriage return nor a line feed.
    /// </summary>
    /// <param name="strHtml">HTML text handed to the parser.</param>
    /// <returns>The concatenated text; empty when no text node qualifies.</returns>
    public static string GetContent(string strHtml)
    {
        HtmlParser domParser = new HtmlDomainTreeParser();
        MIL.Html.HtmlDocument parsed = domParser.Parse(strHtml);
        StringBuilder collected = new StringBuilder();

        foreach (HtmlNode current in parsed.Nodes.FindAllText(true))
        {
            string fragment = ((HtmlText)current).Text;
            if (!HasLineBreak(fragment))
            {
                collected.Append(fragment);
            }
        }

        return collected.ToString();
    }

    /// <summary>Tells whether the text contains a carriage return or a line feed.</summary>
    private static bool HasLineBreak(string text)
    {
        return text.Contains("\r") || text.Contains("\n");
    }
}
147

148

}
149

posted @ 2007-08-01 13:20 Jadepark 阅读(11126) 评论(43) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

思考搜索

思考搜索

几个C#关于Html解析的类

公告