HtmlParser.NET examples
HtmlParser.NET is hosted on SourceForge; the project code is licensed under the Common Public License.
example code:
The parser's fault tolerance is very good, as shown in the picture below (it could recover from malformed HTML more intelligently, but I think it is good enough for practical use):
example code:
using System;
using System.IO;
using Winista.Text.HtmlParser;
using Winista.Text.HtmlParser.Lex;
using Winista.Text.HtmlParser.Util;
using Winista.Text.HtmlParser.Tags;
// Click handler: parses the HTML typed into textBox1 and rebuilds treeView1
// so that it shows the parsed tags underneath a single "root" node.
private void button1_Click(object sender, EventArgs e)
{
    // Alternative input: wrap a stream (e.g. a local file) or a URI in a Page
    // and hand that to the Lexer instead of a plain string:
    //   byte[] htmlBytes = Encoding.UTF8.GetBytes(this.textBox1.Text);
    //   MemoryStream memsteam = new MemoryStream(htmlBytes);
    //   InputStreamSource input = new InputStreamSource(memsteam, "utf-8");
    //   Page page = new Page(input);
    //   Lexer lex = new Lexer(page);

    string html = this.textBox1.Text;
    if (html.Length == 0)
    {
        // Nothing to parse; leave the tree view untouched.
        return;
    }

    // Here the HTML comes straight from the textbox.
    Parser parser = new Parser(new Lexer(html));
    NodeList htmlNodes = parser.Parse(null);

    // Start from an empty tree whose only top-level entry is "root".
    this.treeView1.Nodes.Clear();
    TreeNode treeRoot = this.treeView1.Nodes.Add("root");

    // Each top-level HTML node walks its own subtree; siblings are covered
    // by this loop, so the recursion is told not to follow them.
    for (int index = 0; index < htmlNodes.Count; index++)
    {
        this.RecursionHtmlNode(treeRoot, htmlNodes[index], false);
    }
}
// Mirrors htmlNode (and its descendants) into the tree view underneath treeNode.
//   treeNode        - tree node that receives the entry created for htmlNode.
//   htmlNode        - parsed HTML node to display; null is ignored.
//   siblingRequired - when true, every following sibling of htmlNode is added
//                     under treeNode as well.
// Only tags that are not end tags get a tree entry of their own. For any other
// node no entry is created and its children are attached directly to treeNode.
private void RecursionHtmlNode(TreeNode treeNode, INode htmlNode, bool siblingRequired)
{
    if (treeNode == null || htmlNode == null)
    {
        return;
    }

    // Tree node that the children of htmlNode will hang from.
    TreeNode childParent = treeNode;

    // The current node: a start tag becomes a new tree entry.
    ITag tag = htmlNode as ITag;
    if (tag != null && !tag.IsEndTag())
    {
        string label = tag.TagName;

        if (tag.Attributes != null && tag.Attributes.Count > 0)
        {
            // Append the attributes of interest, in a fixed order, when present.
            object id = tag.Attributes["ID"];
            if (id != null)
            {
                label += " { id=\"" + id.ToString() + "\" }";
            }

            object cssClass = tag.Attributes["CLASS"];
            if (cssClass != null)
            {
                label += " { class=\"" + cssClass.ToString() + "\" }";
            }

            object style = tag.Attributes["STYLE"];
            if (style != null)
            {
                label += " { style=\"" + style.ToString() + "\" }";
            }

            object href = tag.Attributes["HREF"];
            if (href != null)
            {
                label += " { href=\"" + href.ToString() + "\" }";
            }
        }

        childParent = new TreeNode(label);
        treeNode.Nodes.Add(childParent);
    }

    // The children: start at the first child and let that call pull in the
    // remaining children through the sibling walk below.
    if (htmlNode.Children != null && htmlNode.Children.Count > 0)
    {
        this.RecursionHtmlNode(childParent, htmlNode.FirstChild, true);
    }

    // The siblings: each one handles its own subtree but must not walk
    // siblings again, since this loop already visits all of them.
    if (siblingRequired)
    {
        for (INode sibling = htmlNode.NextSibling; sibling != null; sibling = sibling.NextSibling)
        {
            this.RecursionHtmlNode(treeNode, sibling, false);
        }
    }
}
Screen snapshot for the example:
using System.IO;
using Winista.Text.HtmlParser;
using Winista.Text.HtmlParser.Lex;
using Winista.Text.HtmlParser.Util;
using Winista.Text.HtmlParser.Tags;
// Click handler: parses the HTML typed into textBox1 and rebuilds treeView1
// so that it shows the parsed tags underneath a single "root" node.
private void button1_Click(object sender, EventArgs e)
{
    // Alternative input: wrap a stream (e.g. a local file) or a URI in a Page
    // and hand that to the Lexer instead of a plain string:
    //   byte[] htmlBytes = Encoding.UTF8.GetBytes(this.textBox1.Text);
    //   MemoryStream memsteam = new MemoryStream(htmlBytes);
    //   InputStreamSource input = new InputStreamSource(memsteam, "utf-8");
    //   Page page = new Page(input);
    //   Lexer lex = new Lexer(page);

    string html = this.textBox1.Text;
    if (html.Length == 0)
    {
        // Nothing to parse; leave the tree view untouched.
        return;
    }

    // Here the HTML comes straight from the textbox.
    Parser parser = new Parser(new Lexer(html));
    NodeList htmlNodes = parser.Parse(null);

    // Start from an empty tree whose only top-level entry is "root".
    this.treeView1.Nodes.Clear();
    TreeNode treeRoot = this.treeView1.Nodes.Add("root");

    // Each top-level HTML node walks its own subtree; siblings are covered
    // by this loop, so the recursion is told not to follow them.
    for (int index = 0; index < htmlNodes.Count; index++)
    {
        this.RecursionHtmlNode(treeRoot, htmlNodes[index], false);
    }
}
// Mirrors htmlNode (and its descendants) into the tree view underneath treeNode.
//   treeNode        - tree node that receives the entry created for htmlNode.
//   htmlNode        - parsed HTML node to display; null is ignored.
//   siblingRequired - when true, every following sibling of htmlNode is added
//                     under treeNode as well.
// Only tags that are not end tags get a tree entry of their own. For any other
// node no entry is created and its children are attached directly to treeNode.
private void RecursionHtmlNode(TreeNode treeNode, INode htmlNode, bool siblingRequired)
{
    if (treeNode == null || htmlNode == null)
    {
        return;
    }

    // Tree node that the children of htmlNode will hang from.
    TreeNode childParent = treeNode;

    // The current node: a start tag becomes a new tree entry.
    ITag tag = htmlNode as ITag;
    if (tag != null && !tag.IsEndTag())
    {
        string label = tag.TagName;

        if (tag.Attributes != null && tag.Attributes.Count > 0)
        {
            // Append the attributes of interest, in a fixed order, when present.
            object id = tag.Attributes["ID"];
            if (id != null)
            {
                label += " { id=\"" + id.ToString() + "\" }";
            }

            object cssClass = tag.Attributes["CLASS"];
            if (cssClass != null)
            {
                label += " { class=\"" + cssClass.ToString() + "\" }";
            }

            object style = tag.Attributes["STYLE"];
            if (style != null)
            {
                label += " { style=\"" + style.ToString() + "\" }";
            }

            object href = tag.Attributes["HREF"];
            if (href != null)
            {
                label += " { href=\"" + href.ToString() + "\" }";
            }
        }

        childParent = new TreeNode(label);
        treeNode.Nodes.Add(childParent);
    }

    // The children: start at the first child and let that call pull in the
    // remaining children through the sibling walk below.
    if (htmlNode.Children != null && htmlNode.Children.Count > 0)
    {
        this.RecursionHtmlNode(childParent, htmlNode.FirstChild, true);
    }

    // The siblings: each one handles its own subtree but must not walk
    // siblings again, since this loop already visits all of them.
    if (siblingRequired)
    {
        for (INode sibling = htmlNode.NextSibling; sibling != null; sibling = sibling.NextSibling)
        {
            this.RecursionHtmlNode(treeNode, sibling, false);
        }
    }
}
The parser's fault tolerance is very good, as shown in the picture below (it could recover from malformed HTML more intelligently, but I think it is good enough for practical use):