ChineseCounter.cs 统计中文文本中常用字占比

http://www.tuicool.com/articles/qmMba2  
using System;
using System.IO;
using System.Collections.Generic;

namespace Skyiv.Utils
{
  /// <summary>
  /// Analyses Chinese text files: counts how often every character occurs and
  /// reports what share of the text is made up of "common" characters (group A)
  /// and "less common" characters (group B); everything else falls in group C.
  /// </summary>
  /// <remarks>
  /// Text is processed one UTF-16 <see cref="char"/> at a time, so a character
  /// outside the Basic Multilingual Plane is seen as two separate surrogate
  /// code units and counted as such.
  /// </remarks>
  sealed class ChineseCounter
  {
    // Index of the "total" row in the table produced by GetGroups().
    const int TotalIndex = 3;

    // Punctuation, symbols, digits and Latin letters that must not be counted.
    // NOTE(review): '○' is deliberately absent (it was commented out in the
    // original list), presumably because it is used as a numeral in Chinese
    // text - confirm before adding it.
    static readonly string SkippedChars =
      "" +
      "　、，。．·？！：；…－─～—＿｜丨Ⅰ∶※★●℃°“”‘’《》［］〔〕（）＜＞〈〉【】〖〗□" +
      "㈠㈡㈢㈣㈤㈥㈦㈧㈨㈩①②③④⑤⑥⑦⑧⑨⑩⑴⑵⑶⑷⑸⑹⑺⑻⑼⑽⑾⑿⒀⒁⒂⒃⒄⒅⒆⒇" +
      // Fix: the original list jumped from ⒘ straight to ⒚; ⒙ is now included.
      "ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ⒈⒉⒊⒋⒌⒍⒎⒏⒐⒑⒒⒓⒔⒕⒖⒗⒘⒙⒚⒛" +
      "１２３４５６７８９０＝＋－×÷／％ⅢⅡ≈⊥′āáǎàéěèōóǒü" +
      " 1234567890=+-*/%{}[]()<>?!@#$^&_:;',.`~|\"\\" +
      "ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺ" +
      "ａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ" +
      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

    // Set form of SkippedChars for O(1) membership tests (duplicates in the
    // string collapse automatically).
    static readonly HashSet<char> Skipped = new HashSet<char>(SkippedChars);

    // One-letter labels for the report rows. A: common characters,
    // B: less common characters, C: everything else, T: total.
    static readonly string Group = "ABCT";

    // A: common characters (2500 list), B: less common characters (1000 list).
    static readonly HashSet<char> A, B;

    // Number of occurrences of every counted character.
    readonly Dictionary<char, int> charCount = new Dictionary<char, int>();

    /// <summary>
    /// Loads the two reference character sets.
    /// </summary>
    /// <remarks>
    /// RandomChinese is declared elsewhere in the project; its GetSource() is
    /// assumed to enumerate the characters of the selected list - TODO confirm.
    /// </remarks>
    static ChineseCounter()
    {
      A = new HashSet<char>(new RandomChinese(RandomChinese.Source.From2500).GetSource());
      B = new HashSet<char>(new RandomChinese(RandomChinese.Source.From3500).GetSource());
      // Less common (1000) = modern common list (3500) - common (2500).
      B.ExceptWith(A);
    }

    /// <summary>
    /// Tells whether <paramref name="c"/> is excluded from the statistics:
    /// anything in the skip list plus all whitespace and control characters
    /// (e.g. tabs), which can never be characters of interest.
    /// </summary>
    static bool IsSkipped(char c)
    {
      return Skipped.Contains(c) || char.IsWhiteSpace(c) || char.IsControl(c);
    }

    /// <summary>
    /// Returns numerator / denominator, or 0 when the denominator is 0, so
    /// that empty groups are reported as 0 instead of NaN.
    /// </summary>
    static double Ratio(int numerator, int denominator)
    {
      return denominator == 0 ? 0.0 : numerator / (double)denominator;
    }

    /// <summary>
    /// Reads a text file and adds the occurrences of every non-skipped
    /// character to <c>charCount</c>.
    /// </summary>
    /// <param name="fileName">Path of the text file to read.</param>
    void Read(string fileName)
    {
      foreach (var line in File.ReadLines(fileName))
      {
        foreach (var c in line)
        {
          if (IsSkipped(c)) continue;
          int count;
          charCount.TryGetValue(c, out count); // count stays 0 for a new character
          charCount[c] = count + 1;
        }
      }
    }

    /// <summary>
    /// Aggregates the per-character counts by group.
    /// </summary>
    /// <returns>
    /// A 4x2 table. Rows: 0 = common, 1 = less common, 2 = other, 3 = total.
    /// Columns: 0 = number of distinct characters, 1 = number of occurrences.
    /// </returns>
    int[,] GetGroups()
    {
      var groups = new int[4, 2];
      foreach (var kvp in charCount)
      {
        var k = GetGroupIndex(kvp.Key);
        groups[k, 0]++;
        groups[TotalIndex, 0]++;
        groups[k, 1] += kvp.Value;
        groups[TotalIndex, 1] += kvp.Value;
      }
      return groups;
    }

    /// <summary>
    /// Returns all counted characters ordered by descending occurrence count
    /// (ties ordered by character).
    /// </summary>
    /// <returns>
    /// Tuples whose Item1 is the NEGATED count and Item2 the character; the
    /// negation lets the default ascending tuple sort yield descending counts.
    /// </returns>
    Tuple<int, char>[] GetItems()
    {
      var items = new Tuple<int, char>[charCount.Count];
      var i = 0;
      foreach (var kvp in charCount)
      {
        items[i++] = Tuple.Create(-kvp.Value, kvp.Key);
      }
      Array.Sort(items);
      return items;
    }

    /// <summary>
    /// Prints one line per group: occurrences and their share of the total,
    /// distinct characters and their share of the total, and the average
    /// number of occurrences per distinct character.
    /// </summary>
    /// <param name="groups">Table produced by <see cref="GetGroups"/>.</param>
    void Report(int[,] groups)
    {
      var totalCount = groups[TotalIndex, 1];
      var totalDistinct = groups[TotalIndex, 0];
      for (var i = 0; i < groups.GetLength(0); i++)
      {
        var count = groups[i, 1];
        var distinct = groups[i, 0];
        Console.WriteLine("{5}: {0,10:N0} {1,7:P} | {2,5:N0} {3,7:P} | {4,7:F2}",
          count, Ratio(count, totalCount),
          distinct, Ratio(distinct, totalDistinct),
          Ratio(count, distinct), Group[i]);
      }
      Console.WriteLine();
    }

    /// <summary>
    /// Prints the characters of <paramref name="set"/> that never occurred in
    /// the analysed text, preceded by the group label and their number.
    /// </summary>
    /// <param name="set">Reference character set (A or B).</param>
    /// <param name="idx">Index of the group label in <c>Group</c>.</param>
    void Report(HashSet<char> set, int idx)
    {
      var missing = new HashSet<char>(set); // copy: the shared set must stay intact
      missing.ExceptWith(charCount.Keys);
      Console.Write("({0}:{1}) ", Group[idx], missing.Count);
      foreach (var c in missing)
      {
        Console.Write(c);
      }
      Console.WriteLine();
    }

    /// <summary>
    /// Prints every counted character with its group, count, share of the
    /// total, cumulative share and rank, most frequent first.
    /// </summary>
    /// <param name="items">Sorted tuples produced by <see cref="GetItems"/>.</param>
    /// <param name="total">Total number of counted occurrences.</param>
    void Report(Tuple<int, char>[] items, double total)
    {
      Console.WriteLine();
      var sum = 0; // running number of occurrences, for the cumulative share
      for (var i = 0; i < items.Length; i++)
      {
        var count = -items[i].Item1; // undo the negation applied for sorting
        var c = items[i].Item2;
        sum += count;
        // total cannot be 0 here: a non-empty items array implies total > 0.
        Console.WriteLine("{0}: {1,7:N0} [{2}] {3,6:P} {4,7:P} {5,5:N0}",
          Group[GetGroupIndex(c)], count, c, count / total, sum / total, i + 1);
      }
      Console.WriteLine("End");
    }

    /// <summary>
    /// Maps a character to its group: 0 = common, 1 = less common, 2 = other.
    /// </summary>
    int GetGroupIndex(char c)
    {
      if (A.Contains(c)) return 0;
      if (B.Contains(c)) return 1;
      return 2;
    }

    /// <summary>
    /// Reads all given files and prints the complete analysis.
    /// </summary>
    /// <param name="fileNames">Paths of the text files to analyse.</param>
    void Run(string[] fileNames)
    {
      foreach (var fileName in fileNames)
      {
        Read(fileName);
      }
      var groups = GetGroups();
      Report(groups); // share of common / less common characters
      Report(A, 0);   // common characters that never occurred
      Report(B, 1);   // less common characters that never occurred
      Report(GetItems(), groups[TotalIndex, 1]); // groups[TotalIndex, 1]: total occurrences
    }

    /// <summary>
    /// Program entry point; every argument is the path of a text file.
    /// </summary>
    static void Main(string[] args)
    {
      if (args == null || args.Length == 0)
      {
        // Without input there is nothing meaningful to report.
        Console.Error.WriteLine("Usage: ChineseCounter <file> [<file> ...]");
        Environment.ExitCode = 1;
        return;
      }
      new ChineseCounter().Run(args);
    }
  }
}
posted on 2013-07-14 17:36 武胜-阿伟 阅读(565) 评论(0) 编辑 收藏 举报