移除重复字符的几个算法简单比较
/// <summary>
/// Side-by-side comparison of several ways to remove duplicated characters
/// from a string. Every variant keeps the first occurrence of each character
/// and drops all later repeats.
/// </summary>
class Program
{
    /// <summary>
    /// Reads the sample text from disk and runs variants 1, 2 and 6 over it.
    /// The results are not printed or stored; each call simply overwrites
    /// the previous result.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        string s = File.ReadAllText(@"e:\test.txt");
        Program p = new Program();
        string r = p.RemoveDuplicatedChar_1(s);
        r = p.RemoveDuplicatedChar_2(s);
        r = p.RemoveDuplicatedChar_6(s);
    }

    /// <summary>
    /// Variant 1: since this is C#, reach for the class library first.
    /// <c>HashSet&lt;char&gt;</c> (available from .NET 3.5) tracks the characters
    /// already seen.
    /// </summary>
    /// <param name="s">Input text; may be null or empty.</param>
    /// <returns>
    /// <paramref name="s"/> itself when it is null or shorter than two characters;
    /// otherwise a new string holding each distinct character once, in order of
    /// first appearance.
    /// </returns>
    public string RemoveDuplicatedChar_1(string s)
    {
        if (s == null || s.Length < 2)
        {
            return s;
        }

        // The output order is driven by the input scan rather than by enumerating
        // the set: HashSet<T> does not promise any enumeration order.
        HashSet<char> seen = new HashSet<char>();
        char[] buffer = new char[s.Length];
        int count = 0;
        foreach (char c in s)
        {
            // Add returns false when the character is already in the set.
            if (seen.Add(c))
            {
                buffer[count] = c;
                count++;
            }
        }

        return new string(buffer, 0, count);
    }

    /// <summary>
    /// Variant 2: LINQ's <c>Distinct</c> does the same job in one expression.
    /// </summary>
    /// <param name="s">Input text; may be null or empty.</param>
    /// <returns>
    /// <paramref name="s"/> itself when it is null or shorter than two characters;
    /// otherwise a new string built from <c>s.Distinct()</c>.
    /// </returns>
    public string RemoveDuplicatedChar_2(string s)
    {
        if (s == null || s.Length < 2)
        {
            return s;
        }

        return new string(s.Distinct().ToArray());
    }

    // Distinct is implemented with a hash table whose buckets chain slots through
    // a "next" index. The decompiled implementation is kept below for reference.
    /****
    [__DynamicallyInvokable]
    public static IEnumerable<TSource> Distinct<TSource>(this IEnumerable<TSource> source)
    {
        if (source == null)
        {
            throw Error.ArgumentNull("source");
        }
        return DistinctIterator<TSource>(source, null);
    }

    private static IEnumerable<TSource> DistinctIterator<TSource>(IEnumerable<TSource> source, IEqualityComparer<TSource> comparer)
    {
        Set<TSource> iteratorVariable0 = new Set<TSource>(comparer);
        foreach (TSource iteratorVariable1 in source)
        {
            if (iteratorVariable0.Add(iteratorVariable1))
            {
                yield return iteratorVariable1;
            }
        }
    }

    public bool Add(TElement value)
    {
        return !this.Find(value, true);
    }

    private bool Find(TElement value, bool add)
    {
        int hashCode = this.InternalGetHashCode(value);
        for (int i = this.buckets[hashCode % this.buckets.Length] - 1; i >= 0; i = this.slots[i].next)
        {
            if ((this.slots[i].hashCode == hashCode) && this.comparer.Equals(this.slots[i].value, value))
            {
                return true;
            }
        }
        if (add)
        {
            int freeList;
            if (this.freeList >= 0)
            {
                freeList = this.freeList;
                this.freeList = this.slots[freeList].next;
            }
            else
            {
                if (this.count == this.slots.Length)
                {
                    this.Resize();
                }
                freeList = this.count;
                this.count++;
            }
            int index = hashCode % this.buckets.Length;
            this.slots[freeList].hashCode = hashCode;
            this.slots[freeList].value = value;
            this.slots[freeList].next = this.buckets[index] - 1;
            this.buckets[index] = freeList + 1;
        }
        return false;
    }
    ****/

    /// <summary>
    /// Variant 3: this is a puzzle, so assumptions are allowed. Assume every
    /// character lies in 'a'-'z'; the 26 letters then map onto the bits of a
    /// single 32-bit integer.
    /// </summary>
    /// <param name="s">Input text; may be null or empty.</param>
    /// <returns>
    /// <paramref name="s"/> itself when it is null or shorter than two characters;
    /// otherwise a new string holding each distinct letter once, in order of
    /// first appearance.
    /// </returns>
    /// <exception cref="ArgumentException">
    /// A character outside 'a'-'z' is encountered. Inputs shorter than two
    /// characters are returned before any validation takes place.
    /// </exception>
    public string RemoveDuplicatedChar_3(string s)
    {
        if (s == null || s.Length < 2)
        {
            return s;
        }

        char[] chars = s.ToCharArray();
        int seenMask = 0;
        int writeIndex = 0;
        for (int readIndex = 0; readIndex < chars.Length; readIndex++)
        {
            char c = chars[readIndex];
            if (c < 'a' || c > 'z')
            {
                throw new ArgumentException("char should be in range(a-z)");
            }

            // After the range check (c - 'a') is 0..25, so it is a valid shift
            // count for a 32-bit mask without any further reduction.
            int bit = 1 << (c - 'a');
            if ((seenMask & bit) == 0)
            {
                // Compact in place: writeIndex never runs ahead of readIndex.
                chars[writeIndex] = c;
                writeIndex++;
                seenMask |= bit;
            }
        }

        return new string(chars, 0, writeIndex);
    }

    /// <summary>
    /// Variant 4: character codes 0-255 map onto eight 32-bit integers used as
    /// a 256-bit set.
    /// </summary>
    /// <param name="s">Input text; may be null or empty.</param>
    /// <returns>
    /// <paramref name="s"/> itself when it is null or shorter than two characters;
    /// otherwise a new string holding each distinct character once, in order of
    /// first appearance.
    /// </returns>
    /// <exception cref="ArgumentException">
    /// A character with a code above 255 is encountered. The message says "ASCII",
    /// but the accepted range is 0-255.
    /// </exception>
    public string RemoveDuplicatedChar_4(string s)
    {
        const int BitsPerWord = 32;
        const int MaxCharCode = 255;

        if (s == null || s.Length < 2)
        {
            return s;
        }

        char[] chars = s.ToCharArray();
        int[] seenWords = new int[(MaxCharCode + 1) / BitsPerWord];
        int writeIndex = 0;

        for (int readIndex = 0; readIndex < chars.Length; readIndex++)
        {
            char c = chars[readIndex];
            if (c > MaxCharCode)
            {
                throw new ArgumentException("char should be in ASCII");
            }

            // Split the code into a word index and a bit position inside that word.
            int word = c / BitsPerWord;
            int bit = 1 << (c % BitsPerWord);
            if ((seenWords[word] & bit) == 0)
            {
                chars[writeIndex] = c;
                writeIndex++;
                seenWords[word] |= bit;
            }
        }

        return new string(chars, 0, writeIndex);
    }

    /// <summary>
    /// Variant 5: same idea as variant 4, with one <c>bool</c> per character
    /// code instead of packed bits.
    /// </summary>
    /// <param name="s">Input text; may be null or empty.</param>
    /// <returns>
    /// <paramref name="s"/> itself when it is null or shorter than two characters;
    /// otherwise a new string holding each distinct character once, in order of
    /// first appearance.
    /// </returns>
    /// <exception cref="ArgumentException">
    /// A character with a code above 255 is encountered. The message says "ASCII",
    /// but the accepted range is 0-255.
    /// </exception>
    public string RemoveDuplicatedChar_5(string s)
    {
        const int TableSize = 256;

        if (s == null || s.Length < 2)
        {
            return s;
        }

        char[] chars = s.ToCharArray();
        bool[] seen = new bool[TableSize];
        int writeIndex = 0;

        for (int readIndex = 0; readIndex < chars.Length; readIndex++)
        {
            char c = chars[readIndex];
            if (c >= TableSize)
            {
                throw new ArgumentException("char should be in ASCII");
            }

            if (!seen[c])
            {
                chars[writeIndex] = c;
                writeIndex++;
                seen[c] = true;
            }
        }

        return new string(chars, 0, writeIndex);
    }

    /// <summary>
    /// Variant 6: O(n^2) scan that uses no lookup table or library collection,
    /// only the working copy of the characters. It places no restriction on the
    /// character range, but on large inputs it is expected to be far slower than
    /// variants 1 and 2.
    /// </summary>
    /// <param name="s">Input text; may be null or empty.</param>
    /// <returns>
    /// <paramref name="s"/> itself when it is null or shorter than two characters;
    /// otherwise a new string holding each distinct character once, in order of
    /// first appearance.
    /// </returns>
    public string RemoveDuplicatedChar_6(string s)
    {
        if (s == null || s.Length < 2)
        {
            return s;
        }

        char[] chars = s.ToCharArray();

        // chars[0..keptCount) always holds the distinct characters found so far;
        // the first character is trivially unique.
        int keptCount = 1;

        for (int readIndex = 1; readIndex < chars.Length; readIndex++)
        {
            char candidate = chars[readIndex];

            bool alreadyKept = false;
            for (int k = 0; k < keptCount; k++)
            {
                if (chars[k] == candidate)
                {
                    alreadyKept = true;
                    break;
                }
            }

            // No duplicate found in the kept prefix: append the character to it.
            if (!alreadyKept)
            {
                chars[keptCount] = candidate;
                keptCount++;
            }
        }

        return new string(chars, 0, keptCount);
    }
}
在处理百万级别的文本时,1,2,6的运行时间比较:
/****
对于纯ASCII的大字符串,5个方法的运行时间比较如下图。
可以看出,方法4和5在时间效率上超过了方法1和2。
所以在处理混合型字符串时,是否应该考虑综合应用这些算法呢?
答案是肯定的:当字符编码都不超过255时可以使用方法4或5,否则退回到方法1或2。
****/