// Source: http://www.tuicool.com/articles/qmMba2
1 using System;
2 using System.IO;
3 using System.Collections.Generic;
4
namespace Skyiv.Utils
{
    /// <summary>
    /// Command-line tool that tallies how often each character occurs in the
    /// given text files and prints frequency reports bucketed into groups A, B and C.
    /// </summary>
    /// <remarks>
    /// Group A is the character set returned for <c>RandomChinese.Source.From2500</c>,
    /// group B is the <c>From3500</c> set minus group A, and group C is every other
    /// counted character. Row T of the summary is the total over all three groups.
    /// Counting is done per UTF-16 <c>char</c>, so a character outside the Basic
    /// Multilingual Plane is tallied as two separate surrogate code units.
    /// NOTE(review): <c>RandomChinese</c> is defined elsewhere; presumably the two
    /// sources are the 2,500 / 3,500 common-character lists — confirm.
    /// </remarks>
    sealed class ChineseCounter
    {
        // Characters removed from the tally before any report is produced
        // (punctuation, enumerators, digits, Latin letters, pinyin vowels, ...).
        // NOTE(review): a few segments repeat the plain ASCII digits and letters
        // verbatim; they may originally have been full-width forms that were
        // flattened by text normalization — confirm against the upstream source.
        private static readonly string Skipped =
            " 、,。.·?!:;…-─~—_|丨Ⅰ∶※★●℃°“”‘’《》[]〔〕()<>〈〉【】〖〗□" +
            "㈠㈡㈢㈣㈤㈥㈦㈧㈨㈩①②③④⑤⑥⑦⑧⑨⑩⑴⑵⑶⑷⑸⑹⑺⑻⑼⑽⑾⑿⒀⒁⒂⒃⒄⒅⒆⒇" +
            "ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ⒈⒉⒊⒋⒌⒍⒎⒏⒐⒑⒒⒓⒔⒕⒖⒗⒘⒚⒛" +
            "1234567890=+-×÷/%ⅢⅡ≈⊥′āáǎàéěèōóǒü" +
            " 1234567890=+-*/%{}[]()<>?!@#$^&_:;',.`~|\"\\" +
            "ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
            "abcdefghijklmnopqrstuvwxyz" +
            "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

        // One label per summary row; the position in this string is the row index.
        private static readonly string Group = "ABCT";

        // Row indices into the summary table (must line up with Group).
        private const int GroupA = 0;
        private const int GroupB = 1;
        private const int GroupOther = 2;
        private const int GroupTotal = 3;

        // Column indices into the summary table.
        private const int DistinctColumn = 0;    // number of different characters
        private const int OccurrenceColumn = 1;  // number of occurrences

        private static readonly HashSet<char> A, B;

        // Occurrence count for every character read so far.
        private readonly Dictionary<char, int> charCount = new Dictionary<char, int>();

        /// <summary>
        /// Loads the two reference character sets and makes B disjoint from A.
        /// </summary>
        static ChineseCounter()
        {
            A = new HashSet<char>(new RandomChinese(RandomChinese.Source.From2500).GetSource());
            B = new HashSet<char>(new RandomChinese(RandomChinese.Source.From3500).GetSource());
            B.ExceptWith(A);
        }

        /// <summary>
        /// Adds every character of every line of <paramref name="fileName"/> to the tally.
        /// </summary>
        /// <param name="fileName">Path of the text file to read.</param>
        private void Read(string fileName)
        {
            foreach (var line in File.ReadLines(fileName))
            {
                foreach (var c in line)
                {
                    int count;
                    // A missing key leaves count at 0, so first sightings start at 1.
                    charCount.TryGetValue(c, out count);
                    charCount[c] = count + 1;
                }
            }
        }

        /// <summary>
        /// Builds the summary table: one row per entry of <c>Group</c>, with the
        /// number of distinct characters and the number of occurrences per row.
        /// </summary>
        /// <returns>A <c>Group.Length</c> x 2 table; the last row holds the totals.</returns>
        private int[,] GetGroups()
        {
            var groups = new int[Group.Length, 2];
            foreach (var kvp in charCount)
            {
                var row = GetGroupIndex(kvp.Key);
                groups[row, DistinctColumn]++;
                groups[GroupTotal, DistinctColumn]++;
                groups[row, OccurrenceColumn] += kvp.Value;
                groups[GroupTotal, OccurrenceColumn] += kvp.Value;
            }
            return groups;
        }

        /// <summary>
        /// Returns every counted character paired with its count, most frequent first.
        /// </summary>
        /// <returns>
        /// Tuples of (count, character), ordered by count descending and then by
        /// character ascending so the order is fully deterministic.
        /// </returns>
        private Tuple<int, char>[] GetItems()
        {
            var items = new Tuple<int, char>[charCount.Count];
            var next = 0;
            foreach (var kvp in charCount)
            {
                items[next++] = Tuple.Create(kvp.Value, kvp.Key);
            }

            Array.Sort(items, (x, y) =>
            {
                var byCount = y.Item1.CompareTo(x.Item1);
                return byCount != 0 ? byCount : x.Item2.CompareTo(y.Item2);
            });
            return items;
        }

        /// <summary>
        /// Divides two counts, yielding 0 instead of NaN when the denominator is 0
        /// (an empty group or an empty input).
        /// </summary>
        private static double Ratio(int numerator, int denominator)
        {
            return denominator == 0 ? 0.0 : numerator / (double)denominator;
        }

        /// <summary>
        /// Prints the summary table: per row the occurrences and their share of the
        /// total, the distinct characters and their share of the total, and the
        /// average occurrences per distinct character.
        /// </summary>
        /// <param name="groups">Table produced by <see cref="GetGroups"/>.</param>
        private void Report(int[,] groups)
        {
            var totalOccurrences = groups[GroupTotal, OccurrenceColumn];
            var totalDistinct = groups[GroupTotal, DistinctColumn];
            for (var i = 0; i < groups.GetLength(0); i++)
            {
                var occurrences = groups[i, OccurrenceColumn];
                var distinct = groups[i, DistinctColumn];
                Console.WriteLine("{5}: {0,10:N0} {1,7:P} | {2,5:N0} {3,7:P} | {4,7:F2}",
                    occurrences, Ratio(occurrences, totalOccurrences),
                    distinct, Ratio(distinct, totalDistinct),
                    Ratio(occurrences, distinct), Group[i]);
            }
            Console.WriteLine();
        }

        /// <summary>
        /// Prints the characters of <paramref name="set"/> that never occurred in
        /// the input, preceded by the group label and how many there are.
        /// </summary>
        /// <param name="set">Reference character set to check.</param>
        /// <param name="idx">Index into <c>Group</c> of the label to print.</param>
        private void Report(HashSet<char> set, int idx)
        {
            // Work on a copy so the shared static set is not modified.
            var missing = new HashSet<char>(set);
            missing.ExceptWith(charCount.Keys);
            Console.Write("({0}:{1}) ", Group[idx], missing.Count);
            foreach (var c in missing)
            {
                Console.Write(c);
            }
            Console.WriteLine();
        }

        /// <summary>
        /// Prints one line per character: group label, count, the character, its
        /// share of <paramref name="total"/>, the cumulative share, and its rank.
        /// </summary>
        /// <param name="items">Items produced by <see cref="GetItems"/>.</param>
        /// <param name="total">Total number of occurrences, used as the denominator.</param>
        private void Report(Tuple<int, char>[] items, double total)
        {
            Console.WriteLine();
            var cumulative = 0;
            for (var i = 0; i < items.Length; i++)
            {
                var count = items[i].Item1;
                var c = items[i].Item2;
                cumulative += count;
                Console.WriteLine("{0}: {1,7:N0} [{2}] {3,6:P} {4,7:P} {5,5:N0}",
                    Group[GetGroupIndex(c)], count, c,
                    count / total, cumulative / total, i + 1);
            }
            Console.WriteLine("End");
        }

        /// <summary>
        /// Maps a character to its summary row: A, then B, otherwise the catch-all group.
        /// </summary>
        private int GetGroupIndex(char c)
        {
            if (A.Contains(c))
            {
                return GroupA;
            }
            return B.Contains(c) ? GroupB : GroupOther;
        }

        /// <summary>
        /// Reads all files, drops the skipped characters, and prints every report.
        /// </summary>
        /// <param name="fileNames">Paths of the text files to analyse.</param>
        private void Run(string[] fileNames)
        {
            foreach (var fileName in fileNames)
            {
                Read(fileName);
            }
            foreach (var c in Skipped)
            {
                charCount.Remove(c);
            }

            var groups = GetGroups();
            Report(groups);
            Report(A, GroupA);
            Report(B, GroupB);
            Report(GetItems(), groups[GroupTotal, OccurrenceColumn]);
        }

        /// <summary>
        /// Entry point; every command-line argument is treated as an input file path.
        /// </summary>
        private static void Main(string[] args)
        {
            new ChineseCounter().Run(args);
        }
    }
}