想看小说,自己写个采集类,读网页文章写入txt文件

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;

namespace allen
{
    /// <summary>
    /// Console scraper: downloads a fixed range of pages of one novel from txt66.com,
    /// strips everything except the article body, and appends the text to a local file.
    /// </summary>
    class Program
    {
        /// <summary>URL template of a reader page; {0} is the 1-based page number.</summary>
        const string PageUrlFormat = "http://www.txt66.com/read2.asp?id=8480&PageNum={0}";

        /// <summary>File the extracted text is appended to.</summary>
        const string OutputPath = @"F:\g.txt";

        /// <summary>First page number to download (inclusive).</summary>
        const int FirstPage = 1;

        /// <summary>Last page number to download (inclusive).</summary>
        const int LastPage = 123;

        /// <summary>Pause between two page requests, in milliseconds.</summary>
        const int DelayBetweenPagesMs = 600;

        /// <summary>
        /// Markup that immediately precedes the article body, written as it appears
        /// AFTER all whitespace has been removed from the page.
        /// </summary>
        const string BodyStartMarker = "点此下载封神演义.txt</font></font></a></div></td>";

        /// <summary>
        /// Markup that immediately follows the article body, written as it appears
        /// AFTER all whitespace has been removed from the page (hence "tdclass").
        /// </summary>
        const string BodyEndMarker = "</div></td></tr><tr><tdclass=";

        /// <summary>Matches any run of whitespace characters.</summary>
        static readonly Regex WhitespaceRegex = new Regex(@"\s+", RegexOptions.Compiled);

        /// <summary>
        /// Matches from the first start marker to the last end marker (greedy).
        /// The markers are escaped so that the '.' in ".txt" is matched literally and the
        /// pattern agrees exactly with the literal Replace calls in <see cref="MyFilter"/>.
        /// </summary>
        static readonly Regex BodyRegex = new Regex(
            Regex.Escape(BodyStartMarker) + ".*" + Regex.Escape(BodyEndMarker),
            RegexOptions.Compiled);

        /// <summary>
        /// Downloads the document at the given URL and returns it as a string.
        /// </summary>
        /// <param name="url">Absolute HTTP(S) URL of the page to fetch.</param>
        /// <returns>The response body decoded with <see cref="Encoding.Default"/>.</returns>
        /// <exception cref="InvalidCastException">The URL scheme is not HTTP/HTTPS.</exception>
        /// <exception cref="WebException">The request failed.</exception>
        static string GetHtml(string url)
        {
            // Direct casts: a non-HTTP URL fails here with a clear InvalidCastException
            // instead of a NullReferenceException on the following line.
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

            // Dispose response, stream and reader even when reading throws, so the
            // underlying connection is always released.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream stream = response.GetResponseStream())
            // NOTE(review): Encoding.Default depends on the machine/runtime; the page only
            // decodes correctly when it matches the site's encoding - confirm.
            using (StreamReader reader = new StreamReader(stream, Encoding.Default))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Extracts the article body from a page's HTML.
        /// </summary>
        /// <param name="htmlStr">Raw HTML of one reader page.</param>
        /// <returns>
        /// The text between the start and end markers with all original whitespace removed,
        /// the site watermark and leftover table tags stripped, and every "&lt;br&gt;"
        /// replaced by <see cref="Environment.NewLine"/>. Returns an empty string when the
        /// markers are not found.
        /// </returns>
        static string MyFilter(string htmlStr)
        {
            // Remove all whitespace first so the markers match regardless of page formatting.
            string compact = WhitespaceRegex.Replace(htmlStr, "");

            // Match.Value is "" when there is no match, so the replacements below are no-ops.
            string result = BodyRegex.Match(compact).Value;

            result = result.Replace(BodyStartMarker, "");
            result = result.Replace(BodyEndMarker, "");
            result = result.Replace("</tr></table>", "");
            result = result.Replace("本文章下载于www.Txt66.com", "");
            // Done last: the only line breaks in the output come from <br> tags.
            result = result.Replace("<br>", Environment.NewLine);
            return result;
        }

        /// <summary>
        /// Downloads pages <see cref="FirstPage"/> through <see cref="LastPage"/> in order,
        /// filters each one and appends the text to <see cref="OutputPath"/>.
        /// </summary>
        static void WriteFile()
        {
            // "using" guarantees the writer is flushed and closed even if a download fails
            // part-way through, so text already fetched is not lost.
            // The file is opened in append mode and written as UTF-16 (Encoding.Unicode).
            using (StreamWriter writer = new StreamWriter(OutputPath, true, Encoding.Unicode))
            {
                for (int pageNum = FirstPage; pageNum <= LastPage; pageNum++)
                {
                    string pageUrl = string.Format(PageUrlFormat, pageNum);
                    string text = MyFilter(GetHtml(pageUrl));
                    writer.Write(text);
                    Console.WriteLine("写入第{0}页", pageNum);

                    // Pause between requests to avoid hammering the server.
                    System.Threading.Thread.Sleep(DelayBetweenPagesMs);
                }
            }
        }

        /// <summary>
        /// Entry point: runs the download, then waits for a key press before exiting.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            WriteFile();
            Console.ReadKey();
        }
    }
}

posted @ 2010-03-06 14:31  liulun  阅读(1190)  评论(5)  编辑  收藏  举报