菜鸟练习C#htmlparser----C#正则加htmlDOM进行网页解析腾讯新闻帖子列表相关信息提取 - 笨笨在成长

http://blog.csdn.net/finallyliuyu/archive/2009/05/06/4156071.aspx

适用网站有

7.奇闻异事：http://news.qq.com/newssh/qwqs/qwqs3j.htm（简单无其他连接）

<tr><td>·<a target="_blank" class="rlk1" href="/a/20090504/000868.htm">组图：地球上最像外星的九大地方</a>(2009年05月04日11:17)

</td></tr>

时间

8.社会观察：http://news.qq.com/newssh/sh-shgc3jy.htm（同上）
<tr><td>·<a target="_blank" class="rlk1" href="/a/20090427/000359.htm">齐鲁晚报：“三鹿”之后为何仍有“晨园”</a>(2009年04月27日08:01)

9.新闻云南http://news.qq.com/yn/ywy.htm（同上）
<tr><td>· <a target="_blank" href="/a/20090407/000619.htm">云铜腐案追踪：国资领导机制存巨大腐败漏洞</a>　(04/07 09:22)

</td></tr>

10.冷暖人间http://news.qq.com/newssh/sh-lnrj3jy.htm(同上)

<tr><td>·<a target="_blank" class="rlk1" href="/a/20090504/000525.htm">打工女子患肾衰竭希望捐眼角膜送他人光明</a>(2009年05月04日09:09)

</td></tr>

11.滚动新闻 http://news.qq.com/scroll/now.htm（同上）

14.港澳台http://news.qq.com/newsgn/x_gat3.htm(同上)

<tr><td>·<a target="_blank" class="rlk1" href="/a/20090502/001016.htm">香港万余人参加佛典佛教联合会长谈流感(图)</a>(2009年05月02日21:17)

15. 国内评论http://news.qq.com/newsgn/gnplgdy.htm（同上）
16. 社会万象http://news.qq.com/newssh/shwx_3jy.htm(同上)

<tr><td class="text01">·<a target="_blank" href="/a/20090504/001612.htm">一个白领的“五四”青年节</a>(04日

19:41)

</td></tr>

代码说明：

1.提取上面的网页信息完全可以用一个正则搞定。我是为了练习C#htmlparser的使用才搞得这么麻烦的。(项目需要涉及htmlDOM)

2.代码注释不详，很抱歉。这个代码要装C#htmlparser库才能运行（见我博客的上一篇文章）

3.我会把可执行代码放到我的csdn资源空间大家可以下载（绝对能够正常运行）但是再次为我不规范的代码给大家阅读上造成的困难道歉。

4.代码中调用htmlparser之前有一些用正则对字符串进行预处理的环节。这是因为对于<TD><A></A></TD>，htmlparser能够识别<A>是<TD>的孩子；但是对于<TD>(任意字符，空白或回车)<A></A></TD>，htmlparser就识别不出来了。它的智能实在太差啦！！！

using System;
using System.Collections.Generic;
using System.Text;
using System.Net;
using System.IO;
using System.Diagnostics;
using System.Text.RegularExpressions;
using Winista.Text.HtmlParser;
using Winista.Text.HtmlParser.Visitors;
using Winista.Text.HtmlParser.Util;
using Winista.Text.HtmlParser.Tags;
using Winista.Text.HtmlParser.Filters;
using Winista.Text.HtmlParser.Lex;

namespace ListRetrival
{
 class Program
 {
 /// <summary>
 /// Downloads one news list page, keeps the table rows whose first cell starts
 /// with a link that is followed by a span, then prints the extracted
 /// link target, link text and span text and appends them to C:\finally.txt.
 /// </summary>
 /// <param name="args">Command-line arguments (unused).</param>
 static void Main(string[] args)
 {
     string url = "http://news.qq.com/newsgn/x_gat3.htm";
     string rawtext = GetDataFromUrl(url);

     // Text pre-processing: remove whatever sits between adjacent tags so that
     // the <a> element becomes the first child of its <td> for the HTML parser.
     RegexOptions options = RegexOptions.Multiline | RegexOptions.IgnoreCase;
     rawtext = new Regex(@"<tr>\s+<td>", options).Replace(rawtext, @"<tr><td>");
     rawtext = new Regex(@"<td(.*?)>\s*·\s*<a", options).Replace(rawtext, "<td${1}><a");
     // The published listing shows a bare space as the pattern here, which would
     // delete every space in the page and glue tag names to their attributes
     // (e.g. "<atarget="). Presumably the "&nbsp;" entity was decoded when the
     // article was published, so the entity and the U+00A0 character are removed.
     rawtext = new Regex(@"&nbsp;|\u00A0", options).Replace(rawtext, "");
     rawtext = new Regex(@"<(?<first>.*?)>\s+<(?<second>.*?)>", options).Replace(rawtext, "<${first}><${second}>");

     // First pass: keep only the rows whose first grandchild is an <a> tag.
     Lexer lexer = new Lexer(rawtext);
     Parser parser = new Parser(lexer);
     NodeFilter filter = new TagNameFilter("TR");
     NodeList htmlNodes = parser.Parse(filter);
     NodeList pure = new NodeList();
     for (int j = 0; j < htmlNodes.Count; j++)
     {
         // Guard every level: a row without children would otherwise throw.
         if (htmlNodes[j].FirstChild != null
             && htmlNodes[j].FirstChild.FirstChild != null
             && htmlNodes[j].FirstChild.FirstChild.GetType().ToString() == "Winista.Text.HtmlParser.Tags.ATag")
         {
             pure.Add(htmlNodes[j]);
         }
     }
     string pureafter = pure.ToHtml();

     // Second pass: re-parse the kept rows and keep those where the link is
     // immediately followed by a <span>.
     Lexer lexer2 = new Lexer(pureafter);
     Parser parser2 = new Parser(lexer2);
     NodeFilter filter2 = new TagNameFilter("TR");
     NodeList htmlNodes2 = parser2.Parse(filter2);
     NodeList final = new NodeList();
     for (int i = 0; i < htmlNodes2.Count; i++)
     {
         if (htmlNodes2[i].FirstChild != null
             && htmlNodes2[i].FirstChild.FirstChild != null
             && htmlNodes2[i].FirstChild.FirstChild.NextSibling != null
             && htmlNodes2[i].FirstChild.FirstChild.NextSibling.GetType().ToString() == "Winista.Text.HtmlParser.Tags.Span")
         {
             final.Add(htmlNodes2[i]);
         }
     }

     if (final.Count == 0)
     {
         Console.Write("时间不是格式");
     }
     else
     {
         string finalstr = final.ToHtml();
         // The closing </span> anchors the last group; a trailing lazy "(.*?)"
         // with nothing after it always matches the empty string, so the span
         // text was never captured.
         Regex keyregex = new Regex(@"<a.*?href=""(.*?)"">(.*?)</a><span.*?>(.*?)</span>");
         StringBuilder keyinfo = new StringBuilder();
         foreach (Match mymatch in keyregex.Matches(finalstr))
         {
             string entry = mymatch.Groups[1].Value + mymatch.Groups[2].Value + mymatch.Groups[3].Value;
             Console.WriteLine(entry);
             keyinfo.Append(entry);
         }

         // "using" closes the file even if the write throws.
         using (StreamWriter sw = new StreamWriter("C:\\finally.txt", true, Encoding.GetEncoding("gb2312")))
         {
             sw.Write(keyinfo.ToString());
         }
     }

     // Wait for Enter before the process exits.
     Console.ReadLine();
 }

        /// <summary>
        /// Downloads the resource at <paramref name="url"/> and returns its body as text.
        /// </summary>
        /// <param name="url">Address to request; it is passed to WebRequest.Create.</param>
        /// <returns>
        /// The decoded response body. An empty string is returned when the status
        /// code is not 200 OK or when an exception is raised while requesting or
        /// reading; such exceptions are logged with Trace.TraceError and swallowed.
        /// </returns>
        public static string GetDataFromUrl(string url)
        {
            string str = string.Empty;
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            // Request options and HTTP headers.
            request.AllowAutoRedirect = true;
            request.AllowWriteStreamBuffering = true;
            request.Referer = "";
            request.Timeout = 10 * 1000;
            request.UserAgent = "";
            HttpWebResponse response = null;
            try
            {
                response = (HttpWebResponse)request.GetResponse();
                if (response.StatusCode == HttpStatusCode.OK)
                {
                    // Choose the text encoding from the charset in the response headers.
                    string characterSet = response.CharacterSet;
                    Encoding encode;
                    if (string.IsNullOrEmpty(characterSet))
                    {
                        // No charset reported (null or empty): use the system default.
                        encode = Encoding.Default;
                    }
                    else
                    {
                        // A reported ISO-8859-1 is treated as gb2312, as the original code did.
                        if (string.Equals(characterSet, "ISO-8859-1", StringComparison.OrdinalIgnoreCase))
                        {
                            characterSet = "gb2312";
                        }
                        encode = Encoding.GetEncoding(characterSet);
                    }

                    // Decode straight from the response stream. ReadToEnd returns exactly
                    // the characters read, so no stale buffer content is appended, and the
                    // using blocks close the stream and reader on every path.
                    using (Stream receiveStream = response.GetResponseStream())
                    using (StreamReader reader = new StreamReader(receiveStream, encode))
                    {
                        str = reader.ReadToEnd();
                    }
                }
            }
            catch (Exception ex)
            {
                // Best-effort download: log the failure and return what we have.
                Trace.TraceError(ex.ToString());
            }
            finally
            {
                if (response != null)
                    response.Close();
            }

            return str;
        }

}
}

本文来自CSDN博客，转载请标明出处：http://blog.csdn.net/finallyliuyu/archive/2009/05/06/4156071.aspx

发表于 2010-12-22 18:23 笨笨在成长阅读(3149) 评论(2) 编辑收藏举报