关于怎么在10万个手机号码中选择重复号码的问题。

刚才看到这篇博客。

http://www.cnblogs.com/WindBlog/archive/2011/07/21/2112452.html大家一般都认为用Hash的办法。不过其实还有更高效的算法。

计算机图形学中，有个八叉树量化法，是用来从24颜色中查找重复颜色，并且进行计数归并的算法。它的算法思想是八叉树一共8层，每层都有8个节点，每一条路径从根到页正好对应8个位.

而颜色RGB 三个数字正好可以拼成一个由0-7数字组成的8位数字，于是正好可以用八叉树算法进行插入。

比如：

八叉树比较复杂，不过对于我们的这个需求来说，其实比较简单。

我们这个算法是10叉树，每层都保存了0-9 10个数字。层数就是手机号码的长度。

手机号的第一位就是第一层，只需遍历到最后一层即可判断是否重复。

于是让我们来实现这个十叉树。效率都和回复中的Linq做比较。

View Code

  1 using System;
  2 using System.Collections.Generic;
  3 using System.Linq;
  4 using System.Text;
  5
  6 namespace ConsoleApplication1
  7 {
  8     class Program
  9     {
10         static void Main(string[] args)
11         {
12             //示例数组，存放手机号
13             string[] mobileArray = new string[100000];// { "13900001234", "13900001235", "13900001236", "13900001237", "13900001234" };
14
15             for (int i = 0; i < 100000; i++)
16             {
17                 mobileArray[i] = "1390000"
18                     + (i.ToString().Length > 4 ? i.ToString().Substring(0, 4) : (i.ToString() + "0000").Substring(0, 4));
19             }
20
21             ////linq语句来实现【select mobile from tmpTable group by mobile having count(*)>1】的效果
22             var selMobile = from n in mobileArray group n by n into g where g.Count() > 1 select g.Distinct();// select g;
23
24
25
26             System.Diagnostics.Stopwatch sw = new System.Diagnostics.Stopwatch();
27             sw.Reset();
28             sw.Start();
29             int count1 = 0;
30             //通过两层循环输出重复的手机号
31             foreach (var mobile in selMobile)
32             {
33                 foreach (string multiMobile in mobile)
34                 {
35                     count1++;
36                     //Console.WriteLine(multiMobile);
37                 }
38             }
39
40             sw.Stop();
41
42             Console.WriteLine("Linq共有重复号" + count1+"耗时"+ sw.ElapsedTicks );
43
44             TenNodeTree tree = new TenNodeTree();
45             TenNodeTree tree2 = new TenNodeTree();
46
47             sw.Reset();
48             sw.Start();
49             int count2 = 0;
50             //mobileArray = new string[] { "13900001234", "13900001235", "13900001236", "13900001237", "13900001234", "13900001236" };
51             foreach (var item in mobileArray)
52             {
53                 if (!tree.Add(item))
54                 {
55                     if (tree2.Add(item))
56                     {
57                         count2++;
58                     }
59                 }
60
61             }
62
63             sw.Stop();
64
65             Console.WriteLine("十叉树共有重复号" + count1 + "耗时" + sw.ElapsedTicks);
66             Console.ReadLine();
67
68         }
69
70         class TenNodeTree
71         {
72             public TenNode Root = new TenNode();
73
74             public bool Add(string no)
75             {
76                 TenNode cnode = Root;
77                 bool isadd = false ;
78                 for (int i = 0; i < no.Length ; i++)
79                 {
80                     char k = no[i];
81
82                     if (cnode.Child[k] == null)
83                     {
84                         isadd = true;
85                         cnode.Child[k] = new TenNode();
86                     }
87                     cnode = cnode.Child[k];
88                 }
89
90                 return isadd;
91
92             }
93
94         }
95
96         class TenNode
97         {
98             public TenNode[] Child = new TenNode[256];
99         }
100
101
102     }
103 }

不过，运行一下，你会发现效率完全不是 Linq的对手：

Linq共有重复号9000耗时143185
十叉树共有重复号9000耗时411221

但是，你可不要以为这个算法有问题，要知道Linq是经过高度优化的，我们的算法的实现还有优化空间。让我们来优化它!

怎么优化？指针!有了指针，C#的性能可以提高N倍，见指针版代码:

View Code

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace ConsoleApplication1
{
    unsafe class Program
    {
        static void Main(string[] args)
        {
            //示例数组，存放手机号
            string[] mobileArray = new string[100000];// { "13900001234", "13900001235", "13900001236", "13900001237", "13900001234" };

            for (int i = 0; i < 100000; i++)
            {
                mobileArray[i] = "1390000"
                    + (i.ToString().Length > 4 ? i.ToString().Substring(0, 4) : (i.ToString() + "0000").Substring(0, 4));
            }

            ////linq语句来实现【select mobile from tmpTable group by mobile having count(*)>1】的效果
            var selMobile = from n in mobileArray group n by n into g where g.Count() > 1 select g.Distinct();// select g;

            System.Diagnostics.Stopwatch sw = new System.Diagnostics.Stopwatch();
            sw.Reset();
            sw.Start();
            int count1 = 0;
            //通过两层循环输出重复的手机号
            foreach (var mobile in selMobile)
            {
                foreach (string multiMobile in mobile)
                {
                    count1++;
                    //Console.WriteLine(multiMobile);
                }
            }

            sw.Stop();

            Console.WriteLine("Linq共有重复号" + count1+"耗时"+ sw.ElapsedTicks );

            TenNodeTree tree = new TenNodeTree();
            TenNodeTree tree2 = new TenNodeTree();

            sw.Reset();
            sw.Start();
            int count2 = 0;
            //mobileArray = new string[] { "13900001234", "13900001235", "13900001236", "13900001237", "13900001234", "13900001236" };

            foreach (var item in mobileArray)
            {
                fixed (char* no = item)
                {
                    if (!tree.Add(no , 11 ))
                    {
                        if (tree2.Add(no,11))
                        {
                            count2++;
                        }
                    }
                }

            }

            sw.Stop();

            Console.WriteLine("十叉树共有重复号" + count1 + "耗时" + sw.ElapsedTicks);
            Console.ReadLine();

        }

        class TenNodeTree
        {
            public TenNode Root = new TenNode();

            public bool Add(char* no,int len)
            {
                TenNode cnode = Root;
                bool isadd = false ;

                for (int i = 0; i < len; i++)
                {
                    char k = *no;

                    if (cnode.Child[k-48] == null)
                    {
                        isadd = true;
                        cnode.Child[k-48] = new TenNode();
                    }
                    cnode = cnode.Child[k-48];

                    no++;

                }

                return isadd;

            }

        }

        class TenNode
        {
            public TenNode[] Child = new TenNode[10];
        }

    }
}

Linq共有重复号9000耗时139310
十叉树共有重复号9000耗时69545

如何？效率已达到Linq的1倍!

这还不算完，我们还没有使用Release模式呢!

Linq共有重复号9000耗时141970
十叉树共有重复号9000耗时35843

Release后，性能又提升1倍!

大家不妨用其他语言来实现下，比比效率如何？

C#还是很强的，HOHO

==================================

今天又做了测试，发现我家的老笔记本上，是十叉树占优，但是公司的电脑上是HashSet比较快。

不过十叉树应该还没有达到最优化，应该是分配节点时的开销过大导致。暂时想不出更好的优化方法-_-

==================================

五分钟后再次测试，十叉树只需在初始化时预先分配一个节点池，即可完胜HashSet.不过，此法或有胜之不武的嫌疑，哈哈。

也就是说，不算实例化的时间，十叉树胜，算实例化时间，哈希胜,

posted @ 2011-07-21 18:40 烙馅饼喽阅读(6523) 评论(38) 编辑收藏举报

刷新页面返回顶部

ashei

关于怎么在10万个手机号码中选择重复号码的问题。

公告