C#多线程词频统计修改

上一篇真是让大家见笑了,我今天才真正明白了“多线程”的意义。

今天拿到了真正的测试数据,大约380MB左右,用我的双线程大概能跑到16s左右。

但是惊闻有同学跑到了6s!

遂不甘心啊,果断继续修改之。

然后发现了多线程真正的奥义————建多个dictionary分别统计,最后再merge。

实际上索引是整个程序最耗时的地方,由于单词种类多达200k,所以如此庞大的一个dictionary每次的索引和更新是非常费时间的。

所以在新的多线程程序中,建了一个线程数组,每个线程负责各自的一个dictionary,

等每个线程都跑完之后,再进行Merge。

这样整个程序的速度就上去了。

380MB大约7s时间。

代码见下:

  1     class Program
  2     {
  3         const int ThreadNum = 7;
  4         static ConcurrentDictionary<string, int>[] result = new ConcurrentDictionary<string, int>[ThreadNum];
  5         static ConcurrentDictionary<string, int> ResultMerge;
  6         static int[] tablet = new int[128];
  7         static BlockingCollection<string> queue;
  8         static Thread[] WorkerTh = new Thread[ThreadNum];
  9         static Thread FileIOth;
 10         static Semaphore sem = new Semaphore(0, ThreadNum);
 11         static SemaphoreSlim semslim = new SemaphoreSlim(0, ThreadNum);
 12         static void Main(string[] args)
 13         {
 14             if (args.Length != 3)
 15             {
 16                 Console.WriteLine("Command Line format: WFC rootdir N filePattern");
 17                 Console.WriteLine("WFC: executable file name");
 18                 Console.WriteLine("rootdir: the root directory of input files");
 19                 Console.WriteLine("N: the count of output words");
 20                 Console.WriteLine("filepattern: the name pattern of files to be scanned. For example, *.txt");
 21                 return;
 22             }
 23             DateTime dt = DateTime.Now;
 24             string rootdir = args[0];
 25             int N = Convert.ToInt32(args[1]);
 26             string filePattern = args[2];
 27             if (!Directory.Exists(rootdir)) 
 28             {
 29                 Console.WriteLine("The path "+ rootdir+" doesn't exist.");
 30                 return;
 31             }
 32             string[] files = Directory.GetFiles(rootdir, filePattern, SearchOption.AllDirectories);
 33             if(files.Length==0)
 34             {
 35                 Console.WriteLine("Can not find any "+filePattern+ " pattern's file.");
 36                 return;
 37             }
 38 
 39             for (int i = 'a'; i <= 'z'; i++)
 40             {
 41                 tablet[i] = 1;
 42             }
 43             for (int i = 'A'; i <= 'Z'; i++)
 44             {
 45                 tablet[i] = 1;
 46             }
 47 
 48             //ManagementClass m = new ManagementClass(new ManagementPath( "Win32_Processor"));
 49             //ManagementObjectCollection moc = m.GetInstances();
 50             //int CPUNum = 0;
 51 
 52             //string NumOfCore="";
 53             //foreach (ManagementObject mo in moc)
 54             //{
 55             //    PropertyDataCollection properties = mo.Properties;
 56             //    NumOfCore += properties["NumberOfCores"].Value;
 57             //}
 58             //CPUNum = Convert.ToInt32(NumOfCore);
 59 
 60 
 61 
 62             queue = new BlockingCollection<string>(100);
 63             FileIOth = new Thread(delegate() { Read(files); });
 64             FileIOth.Start();
 65 
 66             ResultMerge = new ConcurrentDictionary<string, int>(1, 8 * N);
 67 
 68             for (int i = 0; i < result.Length; i++)
 69             {
 70                 result[i] = new ConcurrentDictionary<string, int>(1, 20000);
 71             }
 72 
 73 
 74             int index = -1;
 75             for (int i = 0; i < result.Length - 1; i++)
 76             {
 77                 WorkerTh[i] = new Thread(delegate()
 78                 {
 79                     index++;
 80                     Process(index);
 81                     semslim.Release(1);
 82                 });
 83                 WorkerTh[i].Start();
 84             }
 85 
 86             WorkerTh[result.Length - 1] = new Thread(delegate()
 87             {
 88                
 89                 Process(result.Length - 1);
 90                 semslim.Release(1);
 91                 while (true) 
 92                 {
 93                     if (semslim.CurrentCount == ThreadNum)
 94                         break;
 95                 }
 96                 int count = 0;
 97                 for (int i = 0; i < result.Length; i++)
 98                 {
 99                     count = 0;
100                     foreach (var item in result[i].OrderByDescending(k=>k.Value)) 
101                     {
102                         ResultMerge.AddOrUpdate(item.Key, item.Value, (k, v) => v + item.Value);
103                         count++;
104                         if (count > 5 * N)
105                             break;
106                     }
107                 }
108                 count = 0;
109                 var FinaloutputResult = from KVP in ResultMerge
110                                         orderby KVP.Value descending
111                                         select new StringBuilder(KVP.Key).Append(" ").Append(KVP.Value);
112                 foreach (var str in FinaloutputResult)
113                 {
114                     Console.WriteLine(str);
115                     count++;
116                     if (count > N - 1) break;
117                 }
118                 DateTime ot = DateTime.Now;
119                 Console.WriteLine("Time: " + ((ot.Minute * 60 + ot.Second) * 1000 + ot.Millisecond - (dt.Minute * 60 + dt.Second) * 1000 - dt.Millisecond) + "ms");
120                 //Console.ReadKey();
121 
122             });
123             WorkerTh[result.Length - 1].Start();
124 
125         }
126 
127         public static void Read(string[] files)
128         {
129             foreach (string file in files)
130             {
131                 queue.TryAdd(ReadFile(file), -1);
132             }
133             queue.TryAdd("END", -1);
134         }
135 
136         public static string ReadFile(string file)
137         {
138             string readLine;
139             FileStream fs = new FileStream(file, FileMode.Open);
140             StreamReader sr = new StreamReader(fs);
141             readLine = sr.ReadToEnd();
142             sr.Close();
143             fs.Close();
144             return readLine;
145         }
146 
147         public static void Process(int index)
148         {
149             string readLine;
150             while (true)
151             {
152                 queue.TryTake(out readLine, -1);
153                 if (readLine == "END")
154                 {
155                     queue.TryAdd("END", -1);
156                     break;
157                 }
158                 Compute(readLine, index);
159             }
160         }
161 
162         public static void Compute(string readLine, int index)
163         {
164             StringBuilder sb = new StringBuilder(100);
165             string strKey = "";
166             int state = 0;
167             for (int i = 0; i < readLine.Length; i++)
168             {
169                 switch (state)
170                 {
171                     case 0:
172                         if ((state = (readLine[i] > 'z') ? 0 : tablet[readLine[i]]) != 0)
173                         {
174                             sb.Clear();
175                             sb.Append(readLine[i]);
176                         }
177                         break;
178                     default:
179                         if ((state = (readLine[i] > 'z') ? 0 : tablet[readLine[i]]) == 0)
180                         {
181                             if (sb.Length >= 1)
182                             {
183                                 strKey = ToLower(sb).ToString();
184                                 //if (!result[index].TryGetValue(strKey, out value))
185                                 //{
186                                 //    result[index].TryAdd(strKey, 1);
187                                 //}
188                                 //else
189                                 //{
190                                 //    result[index][strKey]++;
191                                 //}
192                                 result[index].AddOrUpdate(strKey, 1, (k, v) => v + 1);
193                             }
194                         }
195                         else
196                             sb.Append(readLine[i]);
197                         break;
198                 }
199             }
200         }
201 
202         public static StringBuilder ToLower(StringBuilder str)
203         {
204             for (int i = 0; i < str.Length; i++)
205             {
206                 if (str[i] <= 'Z')
207                 {
208                     str[i] = (char)((int)str[i] + 32);
209                 }
210             }
211             return str;
212         }
View Code

我的电脑是i7-2600,4核8线程。本来还想根据cpu的个数来动态地调整线程池大小,后来发现读取cpu参数那行代码就耗时1s,遂果断放弃。

在多次测试后发现7,8个线程的表现均良好。

另外wencong大神用的Linq自动并行化神马神马的方法可以跑进6s。

还有惊闻guojia居然只用了1.2s。。。。。。。。。。

posted @ 2013-10-14 22:07  RheetZheng  阅读(498)  评论(0)  编辑  收藏  举报