关于怎么在10万个手机号码中选择重复号码的问题(目前最高效的算法)
晚上看到有算法分享关于怎么在10万个手机号码中选择重复号码的问题。
刚好晚上有空,也写了一个算法。

// Counts how many distinct numbers in mobileArray occur more than once.
// The dictionary is keyed by the number itself rather than by item.GetHashCode():
// two different strings may share a hash code, and keying by the hash would then
// report them as duplicates of each other.
// Value 1 = seen once so far; value 2 = already counted as a duplicate.
Dictionary<string, int> dic = new Dictionary<string, int>();
int count3 = 0;
foreach (var item in mobileArray)
{
    int outInt;
    if (dic.TryGetValue(item, out outInt))
    {
        // Count a number only on its second occurrence; later repeats are ignored.
        if (outInt == 1)
        {
            count3++;
            dic[item] = 2;
        }
    }
    else
    {
        dic[item] = 1;
    }
}
有下面几点需要注意:
- Dictionary内部按Key的哈希值定位元素,查找和插入的平均复杂度为O(1),效率很高
- 在.net中,string的GetHashCode是根据字符串内容计算的,所以内容相同的字符串GetHashCode一定相同(并不要求它们是同一个地址)。注意:不同的字符串也可能产生相同的哈希值(哈希冲突),仅凭哈希值判断重复可能出现误判。
效果:
欢迎各位高手弄出个更快的算法
所有代码

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace 手机号码重复算法
{
unsafe class Program
{
/// <summary>
/// Generates 100,000 sample phone numbers and counts the distinct numbers that occur
/// more than once using three approaches (LINQ group-by, two ten-way trees, and a
/// dictionary), printing each count together with the elapsed milliseconds.
/// Blocks on <c>Console.ReadLine()</c> before returning.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    // Every generated number is "1390000" (7 chars) plus 4 digits.
    const int MobileLength = 11;

    // Sample array holding the phone numbers.
    string[] mobileArray = new string[100000];
    for (int i = 0; i < mobileArray.Length; i++)
    {
        // First four characters of i, right-padded with zeros when i has fewer than four digits.
        string digits = i.ToString();
        mobileArray[i] = "1390000" + (digits + "0000").Substring(0, 4);
    }

    // LINQ equivalent of:
    //   select mobile from tmpTable group by mobile having count(*) > 1
    // The query is deferred, so it actually executes inside the timed loop below.
    var selMobile = from n in mobileArray group n by n into g where g.Count() > 1 select g.Distinct();

    System.Diagnostics.Stopwatch sw = new System.Diagnostics.Stopwatch();
    sw.Reset();
    sw.Start();
    int count1 = 0;
    // Two nested loops walk every duplicated number once.
    foreach (var mobile in selMobile)
    {
        foreach (string multiMobile in mobile)
        {
            count1++;
        }
    }
    sw.Stop();
    Console.WriteLine("Linq共有重复号" + count1 + "耗时" + sw.ElapsedMilliseconds);

    TenNodeTree tree = new TenNodeTree();
    TenNodeTree tree2 = new TenNodeTree();
    sw.Reset();
    sw.Start();
    int count2 = 0;
    foreach (var item in mobileArray)
    {
        fixed (char* no = item)
        {
            // tree.Add returns false when the number is already present in the first tree,
            // i.e. it is a repeat; tree2.Add then returns true only the first time that
            // repeat is recorded, so each duplicated number is counted once.
            if (!tree.Add(no, MobileLength) && tree2.Add(no, MobileLength))
            {
                count2++;
            }
        }
    }
    sw.Stop();
    // Report count2 (the tree result); the original printed count1 here by mistake.
    Console.WriteLine("十叉树共有重复号" + count2 + "耗时" + sw.ElapsedMilliseconds);

    sw.Restart();
    // Keyed by the number itself, not by its hash code: different strings may share a
    // hash code and would otherwise be miscounted as duplicates.
    // Value 1 = seen once so far; value 2 = already counted as a duplicate.
    Dictionary<string, int> dic = new Dictionary<string, int>();
    int count3 = 0;
    foreach (var item in mobileArray)
    {
        int outInt;
        if (dic.TryGetValue(item, out outInt))
        {
            if (outInt == 1)
            {
                count3++;
                dic[item] = 2;
            }
        }
        else
        {
            dic[item] = 1;
        }
    }
    sw.Stop();
    Console.WriteLine("hash计算共有重复号" + count3 + "耗时" + sw.ElapsedMilliseconds);
    Console.ReadLine();
}
/// <summary>
/// A tree in which every node has ten child slots; a sequence of characters is stored
/// as a path from <see cref="Root"/>, one level per character.
/// </summary>
class TenNodeTree
{
    public TenNode Root = new TenNode();

    /// <summary>
    /// Walks <paramref name="len"/> characters starting at <paramref name="no"/>,
    /// creating any nodes that are missing along the path.
    /// </summary>
    /// <param name="no">Pointer to the first character; each character is turned into a
    /// child index by subtracting '0' (48).</param>
    /// <param name="len">Number of characters to read.</param>
    /// <returns><c>true</c> if at least one node was created; <c>false</c> if the whole
    /// path already existed.</returns>
    public bool Add(char* no, int len)
    {
        bool created = false;
        TenNode current = Root;
        char* end = no + len;
        for (char* p = no; p < end; p++)
        {
            int slot = *p - '0';
            TenNode next = current.Child[slot];
            if (next == null)
            {
                next = new TenNode();
                current.Child[slot] = next;
                created = true;
            }
            current = next;
        }
        return created;
    }
}
/// <summary>
/// One node of the ten-way tree: an array of ten child references, all initially null.
/// </summary>
class TenNode
{
    // Number of child slots allocated for every node.
    private const int SlotCount = 10;

    public TenNode[] Child = new TenNode[SlotCount];
}
}
}
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace 手机号码重复算法
{
unsafe class Program
{
/// <summary>
/// Generates 100,000 sample phone numbers and counts the distinct numbers that occur
/// more than once using three approaches (LINQ group-by, two ten-way trees, and a
/// dictionary), printing each count together with the elapsed milliseconds.
/// Blocks on <c>Console.ReadLine()</c> before returning.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    // Every generated number is "1390000" (7 chars) plus 4 digits.
    const int MobileLength = 11;

    // Sample array holding the phone numbers.
    string[] mobileArray = new string[100000];
    for (int i = 0; i < mobileArray.Length; i++)
    {
        // First four characters of i, right-padded with zeros when i has fewer than four digits.
        string digits = i.ToString();
        mobileArray[i] = "1390000" + (digits + "0000").Substring(0, 4);
    }

    // LINQ equivalent of:
    //   select mobile from tmpTable group by mobile having count(*) > 1
    // The query is deferred, so it actually executes inside the timed loop below.
    var selMobile = from n in mobileArray group n by n into g where g.Count() > 1 select g.Distinct();

    System.Diagnostics.Stopwatch sw = new System.Diagnostics.Stopwatch();
    sw.Reset();
    sw.Start();
    int count1 = 0;
    // Two nested loops walk every duplicated number once.
    foreach (var mobile in selMobile)
    {
        foreach (string multiMobile in mobile)
        {
            count1++;
        }
    }
    sw.Stop();
    Console.WriteLine("Linq共有重复号" + count1 + "耗时" + sw.ElapsedMilliseconds);

    TenNodeTree tree = new TenNodeTree();
    TenNodeTree tree2 = new TenNodeTree();
    sw.Reset();
    sw.Start();
    int count2 = 0;
    foreach (var item in mobileArray)
    {
        fixed (char* no = item)
        {
            // tree.Add returns false when the number is already present in the first tree,
            // i.e. it is a repeat; tree2.Add then returns true only the first time that
            // repeat is recorded, so each duplicated number is counted once.
            if (!tree.Add(no, MobileLength) && tree2.Add(no, MobileLength))
            {
                count2++;
            }
        }
    }
    sw.Stop();
    // Report count2 (the tree result); the original printed count1 here by mistake.
    Console.WriteLine("十叉树共有重复号" + count2 + "耗时" + sw.ElapsedMilliseconds);

    sw.Restart();
    // Keyed by the number itself, not by its hash code: different strings may share a
    // hash code and would otherwise be miscounted as duplicates.
    // Value 1 = seen once so far; value 2 = already counted as a duplicate.
    Dictionary<string, int> dic = new Dictionary<string, int>();
    int count3 = 0;
    foreach (var item in mobileArray)
    {
        int outInt;
        if (dic.TryGetValue(item, out outInt))
        {
            if (outInt == 1)
            {
                count3++;
                dic[item] = 2;
            }
        }
        else
        {
            dic[item] = 1;
        }
    }
    sw.Stop();
    Console.WriteLine("hash计算共有重复号" + count3 + "耗时" + sw.ElapsedMilliseconds);
    Console.ReadLine();
}
/// <summary>
/// A tree in which every node has ten child slots; a sequence of characters is stored
/// as a path from <see cref="Root"/>, one level per character.
/// </summary>
class TenNodeTree
{
    public TenNode Root = new TenNode();

    /// <summary>
    /// Walks <paramref name="len"/> characters starting at <paramref name="no"/>,
    /// creating any nodes that are missing along the path.
    /// </summary>
    /// <param name="no">Pointer to the first character; each character is turned into a
    /// child index by subtracting '0' (48).</param>
    /// <param name="len">Number of characters to read.</param>
    /// <returns><c>true</c> if at least one node was created; <c>false</c> if the whole
    /// path already existed.</returns>
    public bool Add(char* no, int len)
    {
        bool created = false;
        TenNode current = Root;
        char* end = no + len;
        for (char* p = no; p < end; p++)
        {
            int slot = *p - '0';
            TenNode next = current.Child[slot];
            if (next == null)
            {
                next = new TenNode();
                current.Child[slot] = next;
                created = true;
            }
            current = next;
        }
        return created;
    }
}
/// <summary>
/// One node of the ten-way tree: an array of ten child references, all initially null.
/// </summary>
class TenNode
{
    // Number of child slots allocated for every node.
    private const int SlotCount = 10;

    public TenNode[] Child = new TenNode[SlotCount];
}
}
}