佛山软件定制

正则表达式实例:取得普陀区所有的小区名字和地址

程序就是个好东西,人很难完成的任务,它只需很短时间就搞定。

下面我们来采集一个房产网站上的所有普陀区的小区列表

该地址为:http://sh.fangjia.com/xiaoqu/--e-{0}|r-%E6%99%AE%E9%99%80%E5%8C%BA

{0}为页码,共35页,C#实现代码如下:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;

namespace Hourse
{
    /// <summary>
    /// Console scraper that downloads every page of a paginated community
    /// listing, extracts each community's name and address with a regular
    /// expression, and appends one "name address" line per entry to data.txt
    /// next to the executable.
    /// </summary>
    class Program
    {
        // URL template; the literal "{0}" placeholder is replaced by the page number.
        private static string uri;

        // Full path of the output file that Save appends to.
        private static string file;

        // Matches one listing entry. Group 1 is the community name (link text),
        // group 2 is the raw text of the following "margin-bottom5" div.
        // The markup this was written against looks like:
        //
        //   <div class="fsize14 margin-bottom8">
        //       <strong>
        //       <a href="/xiaoqu-4796-..." target="_blank">
        //       曹杨二村</a>
        //       </strong>
        //   </div>
        //   <div class="margin-bottom5">
        //       普陀区
        //           曹杨路1107弄,</div>
        //
        // Built once so the pattern is not re-parsed for every page and match.
        private static readonly Regex EntryPattern = new Regex(
            "<div class=\"fsize14 margin-bottom8\">\\s+<strong>\\s+<a\\s+[^>]+>\\s+(.+?)</a>\\s+</strong>" +
            "\\s+</div>\\s+<div class=\"margin-bottom5\">([^<]+)</div>");

        // Removes whitespace, commas and parentheses from the raw address text.
        private static readonly Regex AddressNoise = new Regex("[\\s,( )]+");

        /// <summary>
        /// Entry point: prepares the output file, scrapes pages 1..35 in order,
        /// and prints the total number of entries collected.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            uri = "http://sh.fangjia.com/xiaoqu/--e-{0}|r-%E6%99%AE%E9%99%80%E5%8C%BA";
            file = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "data.txt");

            // File.Create returns an open FileStream. It must be disposed right
            // away, otherwise the handle keeps the file locked and the first
            // Save call fails with a sharing violation.
            if (!File.Exists(file))
            {
                using (File.Create(file)) { }
            }

            Console.WriteLine("--------------------------");
            Console.WriteLine("开始采集数据,请等待...");
            Console.WriteLine("--------------------------");

            int pages = 35;
            int counts = 0;
            for (int i = 1; i <= pages; i++)
            {
                counts += OperateInfo(i);
            }

            Console.WriteLine("采集完成!共" + counts + "条,文件存放在" + file);
            Console.ReadKey();
        }

        /// <summary>
        /// Downloads one listing page, extracts every community entry from it
        /// and appends each one to the output file.
        /// </summary>
        /// <param name="page">Page number substituted into the URL template.</param>
        /// <returns>
        /// Number of entries found on the page; 0 if the download failed.
        /// </returns>
        static int OperateInfo(int page)
        {
            string pageUri = uri.Replace("{0}", page.ToString());

            string html;
            try
            {
                // WebClient is IDisposable; release it as soon as the page is read.
                using (WebClient client = new WebClient())
                {
                    byte[] raw = client.DownloadData(pageUri);
                    html = Encoding.UTF8.GetString(raw);
                }
            }
            catch (WebException ex)
            {
                // One unreachable page should not abort the remaining pages.
                Console.WriteLine("第" + page + "页下载失败:" + ex.Message);
                return 0;
            }

            MatchCollection entries = EntryPattern.Matches(html);
            foreach (Match entry in entries)
            {
                // Read the capture groups directly instead of re-running the
                // pattern over the matched text.
                string name = entry.Groups[1].Value;
                string address = AddressNoise.Replace(entry.Groups[2].Value, "");
                Save(name + " " + address);
            }

            Console.WriteLine("第" + page + "页采集到" + entries.Count + "条!");
            return entries.Count;
        }

        /// <summary>
        /// Appends a single line of text to the output file using UTF-8.
        /// </summary>
        /// <param name="str">The line to append (a newline is added).</param>
        static void Save(string str)
        {
            // append: true keeps earlier lines; disposing the writer flushes it.
            using (StreamWriter sw = new StreamWriter(file, true, Encoding.UTF8))
            {
                sw.WriteLine(str);
            }
        }
    }
}

运行程序:

DEMO下载

posted on 2011-03-23 00:04  New.min  阅读(801)  评论(0)  编辑  收藏  举报

导航