C#多线程词频统计修改
上一篇真是让大家见笑了,我今天才真正明白了“多线程”的意义。
今天拿到了真正的测试数据,大约380MB左右,用我的双线程大概能跑到16s左右。
但是惊闻有同学跑到了6s!
遂不甘心啊,果断继续修改之。
然后发现了多线程真正的奥义————建多个dictionary分别统计,最后再merge。
实际上索引是整个程序最耗时的地方,由于单词种类多达200k,所以如此庞大的一个dictionary每次的索引和更新是非常费时间的。
所以在新的多线程程序中,建了一个线程数组,每个线程负责各自的一个dictionary,
等每个线程都跑完之后,再进行Merge。
这样整个程序的速度就上去了。
380MB大约7s时间。
代码见下:
/// <summary>
/// Multi-threaded word-frequency counter.
/// One reader thread loads every matching file into a bounded queue; <see cref="ThreadNum"/>
/// worker threads drain the queue, each counting words into its own dictionary so the hot
/// path never contends on a shared table; the main thread then merges the per-thread
/// dictionaries and prints the N most frequent words.
/// A "word" is a maximal run of ASCII letters, counted case-insensitively.
/// </summary>
class Program
{
    /// <summary>
    /// Number of worker threads, and therefore of per-thread dictionaries.
    /// Kept as a constant: the WMI (Win32_Processor) core-count query that was once tried
    /// here was measured to cost about a second on its own.
    /// </summary>
    private const int ThreadNum = 7;

    /// <summary>Maximum number of whole-file strings buffered between reader and workers.</summary>
    private const int QueueCapacity = 100;

    /// <summary>Initial capacity hint for each word-count dictionary.</summary>
    private const int InitialBucketCapacity = 20000;

    /// <summary>Per-worker word counts; worker i only ever writes to result[i].</summary>
    private static readonly ConcurrentDictionary<string, int>[] result = CreateBuckets();

    /// <summary>Sum of all per-worker dictionaries; populated once every worker has finished.</summary>
    private static ConcurrentDictionary<string, int> ResultMerge;

    /// <summary>Lookup table indexed by char code: 1 for ASCII letters, 0 for everything else.</summary>
    private static readonly int[] tablet = BuildLetterTable();

    /// <summary>File contents waiting to be processed; completed by <see cref="Read"/>.</summary>
    private static readonly BlockingCollection<string> queue = new BlockingCollection<string>(QueueCapacity);

    private static Thread[] WorkerTh = new Thread[ThreadNum];
    private static Thread FileIOth;

    /// <summary>
    /// Entry point. Expects exactly three arguments: root directory, number of words to
    /// print, and a file name pattern. Prints the top-N words as "word count" lines followed
    /// by the elapsed time.
    /// </summary>
    static void Main(string[] args)
    {
        if (args.Length != 3)
        {
            Console.WriteLine("Command Line format: WFC rootdir N filePattern");
            Console.WriteLine("WFC: executable file name");
            Console.WriteLine("rootdir: the root directory of input files");
            Console.WriteLine("N: the count of output words");
            Console.WriteLine("filepattern: the name pattern of files to be scanned. For example, *.txt");
            return;
        }

        // Stopwatch is monotonic; subtracting DateTime minute/second fields breaks when the
        // run crosses an hour boundary.
        System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();

        string rootdir = args[0];
        string filePattern = args[2];

        int N;
        if (!int.TryParse(args[1], out N) || N < 0)
        {
            Console.WriteLine("N must be a non-negative integer, but was \"" + args[1] + "\".");
            return;
        }

        if (!Directory.Exists(rootdir))
        {
            Console.WriteLine("The path " + rootdir + " doesn't exist.");
            return;
        }

        string[] files = Directory.GetFiles(rootdir, filePattern, SearchOption.AllDirectories);
        if (files.Length == 0)
        {
            Console.WriteLine("Can not find any " + filePattern + " pattern's file.");
            return;
        }

        FileIOth = new Thread(delegate() { Read(files); });
        FileIOth.Start();

        for (int i = 0; i < WorkerTh.Length; i++)
        {
            // Copy the loop variable: every closure must own a distinct dictionary index.
            // Sharing one captured counter and incrementing it inside the threads is a race.
            int bucket = i;
            WorkerTh[i] = new Thread(delegate() { Process(bucket); });
            WorkerTh[i].Start();
        }

        // Block (without spinning) until all input has been read and counted.
        FileIOth.Join();
        foreach (Thread worker in WorkerTh)
        {
            worker.Join();
        }

        ResultMerge = MergeBuckets();
        PrintTop(N);

        Console.WriteLine("Time: " + timer.ElapsedMilliseconds + "ms");
    }

    /// <summary>
    /// Reads every file and hands its content to the workers, then marks the queue as
    /// complete so the workers terminate. Files that cannot be read are reported on
    /// standard error and skipped. Intended to be called once per process: after it
    /// returns, the queue accepts no further items.
    /// </summary>
    /// <param name="files">Paths of the files to load.</param>
    public static void Read(string[] files)
    {
        try
        {
            foreach (string file in files)
            {
                string text;
                try
                {
                    text = ReadFile(file);
                }
                catch (IOException ex)
                {
                    Console.Error.WriteLine("Skipping " + file + ": " + ex.Message);
                    continue;
                }
                catch (UnauthorizedAccessException ex)
                {
                    Console.Error.WriteLine("Skipping " + file + ": " + ex.Message);
                    continue;
                }

                // Blocks while the queue is full, which bounds memory use.
                queue.Add(text);
            }
        }
        finally
        {
            // Signal end-of-input out of band. An in-band "END" marker would stop all
            // processing as soon as a file whose content is exactly "END" showed up.
            queue.CompleteAdding();
        }
    }

    /// <summary>Returns the entire content of a file as a string.</summary>
    /// <param name="file">Path of the file to read.</param>
    /// <returns>The decoded text of the file.</returns>
    public static string ReadFile(string file)
    {
        // FileAccess.Read: the two-argument FileStream constructor requests read/write
        // access, which fails on read-only files. The using blocks close both streams
        // even if ReadToEnd throws.
        using (FileStream fs = new FileStream(file, FileMode.Open, FileAccess.Read, FileShare.Read))
        using (StreamReader sr = new StreamReader(fs))
        {
            return sr.ReadToEnd();
        }
    }

    /// <summary>
    /// Worker loop: takes file contents from the queue until the reader has finished and
    /// the queue is empty, counting the words of each into dictionary <paramref name="index"/>.
    /// </summary>
    /// <param name="index">Index of the dictionary this worker writes to.</param>
    public static void Process(int index)
    {
        foreach (string text in queue.GetConsumingEnumerable())
        {
            Compute(text, index);
        }
    }

    /// <summary>
    /// Counts every word of <paramref name="readLine"/> into dictionary <paramref name="index"/>.
    /// Words are maximal runs of ASCII letters and are lower-cased before counting.
    /// </summary>
    /// <param name="readLine">Text to scan; null or empty text is ignored.</param>
    /// <param name="index">Index of the dictionary to update.</param>
    public static void Compute(string readLine, int index)
    {
        if (string.IsNullOrEmpty(readLine))
        {
            return;
        }

        ConcurrentDictionary<string, int> counts = result[index];
        StringBuilder word = new StringBuilder(100);

        for (int i = 0; i < readLine.Length; i++)
        {
            char c = readLine[i];
            if (IsAsciiLetter(c))
            {
                word.Append(c);
            }
            else if (word.Length > 0)
            {
                CountWord(counts, word);
            }
        }

        // A word that runs up to the end of the text has no trailing separator to
        // trigger the count above, so flush it here.
        if (word.Length > 0)
        {
            CountWord(counts, word);
        }
    }

    /// <summary>Lower-cases the ASCII capital letters of a builder in place.</summary>
    /// <param name="str">Builder to modify.</param>
    /// <returns>The same builder instance, for chaining.</returns>
    public static StringBuilder ToLower(StringBuilder str)
    {
        for (int i = 0; i < str.Length; i++)
        {
            char c = str[i];
            // Check both bounds: testing only "<= 'Z'" would also shift digits, spaces
            // and punctuation by 32.
            if (c >= 'A' && c <= 'Z')
            {
                str[i] = (char)(c + ('a' - 'A'));
            }
        }
        return str;
    }

    /// <summary>Creates one empty word-count dictionary per worker thread.</summary>
    private static ConcurrentDictionary<string, int>[] CreateBuckets()
    {
        ConcurrentDictionary<string, int>[] buckets = new ConcurrentDictionary<string, int>[ThreadNum];
        for (int i = 0; i < buckets.Length; i++)
        {
            buckets[i] = new ConcurrentDictionary<string, int>(1, InitialBucketCapacity);
        }
        return buckets;
    }

    /// <summary>Builds the 128-entry table that marks 'a'-'z' and 'A'-'Z' with 1.</summary>
    private static int[] BuildLetterTable()
    {
        int[] table = new int[128];
        for (int c = 'a'; c <= 'z'; c++)
        {
            table[c] = 1;
        }
        for (int c = 'A'; c <= 'Z'; c++)
        {
            table[c] = 1;
        }
        return table;
    }

    /// <summary>Returns true when <paramref name="c"/> is an ASCII letter.</summary>
    private static bool IsAsciiLetter(char c)
    {
        return c < tablet.Length && tablet[c] != 0;
    }

    /// <summary>Adds one occurrence of the (lower-cased) word to the counts and empties the builder.</summary>
    private static void CountWord(ConcurrentDictionary<string, int> counts, StringBuilder word)
    {
        counts.AddOrUpdate(ToLower(word).ToString(), 1, (k, v) => v + 1);
        word.Clear();
    }

    /// <summary>
    /// Sums all per-worker dictionaries into one. Every entry is merged: keeping only each
    /// dictionary's most frequent entries would drop part of a word's total whenever the
    /// word is common in one dictionary but rare in another.
    /// Must only be called after all workers have finished.
    /// </summary>
    private static ConcurrentDictionary<string, int> MergeBuckets()
    {
        ConcurrentDictionary<string, int> merged = new ConcurrentDictionary<string, int>(1, InitialBucketCapacity);
        foreach (ConcurrentDictionary<string, int> bucket in result)
        {
            foreach (var entry in bucket)
            {
                // Single-threaded at this point, so read-then-write is safe.
                int total;
                merged.TryGetValue(entry.Key, out total); // leaves total at 0 when the key is new
                merged[entry.Key] = total + entry.Value;
            }
        }
        return merged;
    }

    /// <summary>
    /// Prints the <paramref name="N"/> most frequent words as "word count", one per line.
    /// Ties are ordered by word (ordinal) so the output is reproducible.
    /// </summary>
    private static void PrintTop(int N)
    {
        var top = ResultMerge
            .OrderByDescending(kv => kv.Value)
            .ThenBy(kv => kv.Key, StringComparer.Ordinal)
            .Take(N);

        foreach (var kv in top)
        {
            Console.WriteLine(kv.Key + " " + kv.Value);
        }
    }
}
我的电脑是i7-2600, 4核8线程。本来还想根据cpu的个数来动态地调整线程池大小,后来发现读取cpu参数那行代码就耗时1s……遂果断放弃。
在多次测试后发现7,8个线程的表现均良好。
另外wencong大神用的Linq自动并行化神马神马的方法可以跑进6s。
还有惊闻guojia居然只用了1.2s。。。。。。。。。。