简简单单C#爬虫小记

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;

namespace 正则
{
    /// <summary>
    /// Minimal console crawler: downloads a listing page, follows every link whose
    /// href contains "article", extracts the page title and the first
    /// <c>&lt;div class="content"&gt;</c> block with regular expressions, and saves
    /// them as numbered text files in a "data" folder under the current directory.
    /// </summary>
    class Program
    {
        /// <summary>Listing page that is scanned for article links.</summary>
        private const string ListUrl = "http://www.admin5.com/browse/177/";

        // The patterns never change, so build them once instead of once per article.
        private static readonly Regex HrefRegex = new Regex("(?<=href=\").*?(?=\")");
        private static readonly Regex TitleRegex = new Regex("(?<=title>).*?(?=</title>)");
        private static readonly Regex ContentRegex = new Regex("<div class=\"content\">[\\s\\S]*?</div>");

        /// <summary>
        /// Entry point. Crawls the listing page and stores each article found.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            Uri listUri = new Uri(ListUrl);
            string html = GetHtml(ListUrl, Encoding.UTF8);
            string dataDirectory = Path.Combine(Directory.GetCurrentDirectory(), "data");

            int fileIndex = 1;
            foreach (Match link in HrefRegex.Matches(html))
            {
                if (!link.Value.Contains("article"))
                {
                    continue;
                }

                // hrefs may be relative; resolve them against the listing page so the
                // URL that is printed is the same URL that is actually requested.
                Uri articleUri;
                if (!Uri.TryCreate(listUri, link.Value, out articleUri)
                    || (articleUri.Scheme != Uri.UriSchemeHttp && articleUri.Scheme != Uri.UriSchemeHttps))
                {
                    Console.WriteLine("无效链接，跳过: " + link.Value);
                    continue;
                }

                Console.WriteLine(articleUri.AbsoluteUri);
                Console.WriteLine("抓取内容");

                string content;
                try
                {
                    content = GetHtml(articleUri.AbsoluteUri, Encoding.UTF8);
                }
                catch (WebException ex)
                {
                    // A single unreachable article should not abort the whole crawl.
                    Console.WriteLine("抓取失败: " + ex.Message);
                    continue;
                }

                Match titleMatch = TitleRegex.Match(content);
                Match bodyMatch = ContentRegex.Match(content);
                if (!titleMatch.Success || !bodyMatch.Success)
                {
                    // Page does not have the expected structure; nothing to save.
                    Console.WriteLine("未找到标题或正文，跳过");
                    continue;
                }

                Console.WriteLine("保存数据");
                // CreateDirectory does nothing when the directory already exists.
                Directory.CreateDirectory(dataDirectory);
                File.WriteAllText(
                    Path.Combine(dataDirectory, fileIndex + ".txt"),
                    titleMatch.Value + "\r\n" + bodyMatch.Value);
                fileIndex++;
                Console.WriteLine("保存成功");
            }

            Console.WriteLine("ok");
            Console.ReadKey();
        }

        /// <summary>
        /// Downloads the resource at <paramref name="url"/> and returns its body as text.
        /// </summary>
        /// <param name="url">Absolute HTTP/HTTPS URL to request.</param>
        /// <param name="encoding">
        /// Encoding used to decode the response body; UTF-8 is used when null.
        /// </param>
        /// <returns>The full response body.</returns>
        /// <exception cref="WebException">The request failed.</exception>
        private static string GetHtml(string url, Encoding encoding)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

            // Dispose the response, stream and reader so the connection is released
            // even when reading throws.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream stream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(stream, encoding ?? Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }
    }
}

  

 

posted @ 2015-09-04 12:06  hexd  阅读(263)  评论(1)  编辑  收藏  举报