C#多线程词频统计
此乃软件工程课的个人项目,小弟入门水平请轻拍。。
多谢anran大神指点@wanganran http://www.cnblogs.com/wanganran/archive/2012/09/25/2703179.html
以及参考了一个C#的多线程入门的文章 http://www.cnblogs.com/icuit/archive/2012/12/05/2800941.html
各位看官想要最终代码可直接跳转到最后。。
10.08:初步考虑用多线程(双线程,一个IO,一个compute),先快速写出单线程最最最简单的版本
1 string rootdir = args[0]; 2 int N = Convert.ToInt32(args[1]); 3 string filePattern = args[2]; 4 5 string[] files = Directory.GetFiles(rootdir, filePattern); 6 7 int[] tablet = new int[128]; 8 for (int i = 'a'; i <= 'z'; i++) 9 { 10 tablet[i] = 1; 11 } 12 for (int i = 'A'; i <= 'Z'; i++) 13 { 14 tablet[i] = 1; 15 } 16 StringBuilder sb = new StringBuilder(100); 17 Dictionary<string, int> result = new Dictionary<string, int>(50000); 18 19 string strKey = ""; 20 string readLine = ""; 21 foreach (string file in files) 22 { 23 FileStream fs = new FileStream(file,FileMode.Open); 24 StreamReader sr = new StreamReader(fs); 25 readLine = sr.ReadToEnd(); 26 sr.Close(); 27 fs.Close(); 28 29 int state = 0; 30 31 for (int i = 0; i < readLine.Length; i++) 32 { 33 switch (state) 34 { 35 case 0: 36 if ((state = ((readLine[i] & 0x7f) != readLine[i]) ? 0 : tablet[readLine[i]]) != 0) 37 { 38 sb.Clear(); 39 sb.Append(readLine[i]); 40 } 41 break; 42 default: 43 if ((state = ((readLine[i] & 0x7f) != readLine[i]) ? 0 : tablet[readLine[i]]) == 0) 44 { 45 if (sb.Length >= 1) 46 { 47 strKey = sb.ToString(); 48 strKey = strKey.ToLower(); 49 int value; 50 if (result.TryGetValue(strKey,out value)) 51 { 52 result[strKey]++; 53 } 54 else 55 { 56 result.Add(strKey,1); 57 } 58 } 59 } 60 else 61 sb.Append(readLine[i]); 62 break; 63 } 64 } 65 } 66 var outputResult = from KVP in result 67 orderby KVP.Value descending 68 select new StringBuilder(KVP.Key).Append(" ").Append(KVP.Value); 69 int count = 0; 70 foreach (var str in outputResult) 71 { 72 Console.WriteLine(str); 73 count++; 74 if (count > N - 1) break; 75 }
考虑了简单的优化包括:
1, StringBuilder.Append
2, Dictionary 索引复杂度O(1)
3, Dictionary.trygetvalue代替Dictionary.containsKey();
4,估算单词个数提前开辟内存
性能分析:
考虑优化String.ToLower(),两种思路:
1, 自己写tolower函数
2, 先不tolower,最后再merge
自己写ToLower()函数(因为前面已经根据字母表对tolower的输入进行了限制,故此处可省略很多判断步骤):
public static StringBuilder ToLower(StringBuilder str) { for (int i = 0; i < str.Length; i++) { if (str[i] <= 'Z') { str[i] =(char) ((int)str[i]+32); } } return str; }
只需原来1/6的时间
10.09. 第一项任务是把tolower merge到最后一步完成。
即统计词频时区分大小写简历dictionary,最后再创建新的dictionary对每一个key tolower,再merge。
2小时后:遇到了一个问题
Dictionary<StringBuilder,int> 中的stringbuilder是地址形式存在的,每次Add必须新创建而不能单纯改变其值,否则dictionary.tryGetValue总是true
但是新创建sb类就会浪费时间,先通过string创建StringBuilder再sb.toString()反而多了一次消耗。
总的回顾发现区分大小写保存dictionary再merge的思想,增加了创建第一个dictionary时每次的索引时间,且每个单词多增加了一次string到StringBuilder的内存开辟时间,增加了一次遍历每个数组并索引的操作(建立新的dictionary),而收益是从每个单词执行一次tolower到每种单词(区分大小写)tolower一次。
遂决定暂时放弃这个思路。
下一步:决定设时间点统计一下io操作和计算操作所用时间比(此为用cmd跑程序,要比在ide中跑时间快,后面有些图是在ide中跑的程序)
测试数据集为原数据集复制粘贴若干次后,大小约100M的若干txt文件。
可以看到IO时间只占总时间的约1/8~1/10
所以目前程序开双线程意义不大,可以考虑先考虑继续优化单线程或开多线程处理。
2小时后:
感觉单线程的瓶颈在于索引,这点取决于dictionary数据结构,暂时先不考虑优化。
尝试动手写多线程。
决定先把单线程代码整理一下。
1 static StringBuilder sb = new StringBuilder(100); 2 static Dictionary<string, int> result = new Dictionary<string, int>(50000); 3 static int[] tablet = new int[128]; 4 static void Main(string[] args) 5 { 6 DateTime dt = DateTime.Now; 7 string rootdir = args[0]; 8 int N = Convert.ToInt32(args[1]); 9 string filePattern = args[2]; 10 11 string[] files = Directory.GetFiles(rootdir, filePattern); 12 13 for (int i = 'a'; i <= 'z'; i++) 14 { 15 tablet[i] = 1; 16 } 17 for (int i = 'A'; i <= 'Z'; i++) 18 { 19 tablet[i] = 1; 20 } 21 22 string readLine = ""; 23 24 DateTime COMt = new DateTime(); 25 DateTime IOt = new DateTime(); 26 DateTime OTHt = new DateTime(); 27 long COMtime = 0; 28 long IOtime = 0; 29 long OTHtime = 0; 30 31 foreach (string file in files) 32 { 33 IOt = DateTime.Now; 34 35 readLine = ReadFile(file); 36 37 IOtime += DateTime.Now.Second * 1000 + DateTime.Now.Millisecond - IOt.Second * 1000 - IOt.Millisecond; 38 39 COMt = DateTime.Now; 40 41 Compute(readLine); 42 43 COMtime += DateTime.Now.Second * 1000 + DateTime.Now.Millisecond - COMt.Second * 1000 - COMt.Millisecond; 44 } 45 46 OTHt = DateTime.Now; 47 48 var outputResult = from KVP in result 49 orderby KVP.Value descending 50 select new StringBuilder(KVP.Key).Append(" ").Append(KVP.Value); 51 int count = 0; 52 foreach (var str in outputResult) 53 { 54 Console.WriteLine(str); 55 count++; 56 if (count > N - 1) break; 57 } 58 OTHtime = DateTime.Now.Second * 1000 + DateTime.Now.Millisecond - OTHt.Second * 1000 - OTHt.Millisecond; 59 60 DateTime ot = DateTime.Now; 61 Console.WriteLine("IO time:" + IOtime + "ms. Compute time:" + COMtime + "ms. Other time: " + OTHtime + "ms."); 62 Console.WriteLine("Time: " + ((ot.Minute * 60 + ot.Second) * 1000 + ot.Millisecond - (dt.Minute * 60 + dt.Second) * 1000 - dt.Millisecond) + "ms"); 63 Console.ReadKey(); 64 } 65 66 public static string ReadFile(string file) 67 { 68 string readLine; 69 FileStream fs = new FileStream(file, FileMode.Open); 70 StreamReader sr = new StreamReader(fs); 71 readLine = sr.ReadToEnd(); 72 sr.Close(); 73 fs.Close(); 74 return readLine; 75 } 76 77 public static void Compute(string readLine) 78 { 79 string strKey = ""; 80 int value; 81 int state = 0; 82 83 for (int i = 0; i < readLine.Length; i++) 84 { 85 switch (state) 86 { 87 case 0: 88 if ((state = ( readLine[i] > ’z’ ) ? 0 : tablet[readLine[i]]) != 0) 89 { 90 sb.Clear(); 91 sb.Append(readLine[i]); 92 } 93 break; 94 default: 95 if ((state = ( readLine[i] > ’z’ ) ? 0 : tablet[readLine[i]]) == 0) 96 { 97 if (sb.Length >= 1) 98 { 99 //strKey = sb.ToString(); 100 //strKey = strKey.ToLower(); 101 strKey = ToLower(sb).ToString(); 102 if (result.TryGetValue(strKey, out value)) 103 { 104 result[strKey]++; 105 } 106 else 107 { 108 result.Add(strKey, 1); 109 } 110 } 111 } 112 else 113 sb.Append(readLine[i]); 114 break; 115 } 116 } 117 } 118 119 public static StringBuilder ToLower(StringBuilder str) 120 { 121 for (int i = 0; i < str.Length; i++) 122 { 123 if (str[i] <= 'Z') 124 { 125 str[i] = (char)((int)str[i] + 32); 126 } 127 } 128 return str; 129 } 130
多线程,参考wang anran大神使用BlockingCollection<T>
先写两个线程(IO与Compute),共享一个static dictionary<string,int>以及一个static BlockingCollection<string>,预测可以比单线程提高1/10左右用时(因为创建线程也耗时间)
晚9点:
双线程的已经写完,果然如预测提升了约1/10左右的时间:
但是发现了一个bug,单个字母的统计结果与另一位同学的程序的结果不符。
现在准备来检查一下原因。
终于找到了,原因在于对于字母的分割的歧义上。
按照qiufeng老师给出的规定:Treat all non-alphabet characters as split characters.
所有不在字母表中的字符(包括数字)均视为分隔符。例如:a3b应视为a和b两个单词进行统计。
明天决定进行多个线程协同统计的编写。难点在于如何判别多个线程都已处理完毕。
10.10 准备写一个线程IO,两个线程compute
先把双线程的代码贴出来备份(奇怪的是今天的双线程跑不出昨天的速度了。。忘记改了哪里。。)
1 class Program 2 { 3 static StringBuilder sb = new StringBuilder(100); 4 static Dictionary<string, int> result = new Dictionary<string, int>(50000); 5 static int[] tablet = new int[128]; 6 static BlockingCollection<string> queue=new BlockingCollection<string>(100); 7 static void Main(string[] args) 8 { 9 DateTime dt = DateTime.Now; 10 string rootdir = args[0]; 11 int N = Convert.ToInt32(args[1]); 12 string filePattern = args[2]; 13 string[] files = Directory.GetFiles(rootdir, filePattern); 14 15 for (int i = 'a'; i <= 'z'; i++) 16 { 17 tablet[i] = 1; 18 } 19 for (int i = 'A'; i <= 'Z'; i++) 20 { 21 tablet[i] = 1; 22 } 23 24 Thread FileIOth = new Thread(delegate() 25 { 26 Read(files); 27 }); 28 FileIOth.Start(); 29 30 31 Thread WorkerTh = new Thread(delegate() 32 { 33 Process(); 34 var outputResult = from KVP in result 35 orderby KVP.Value descending 36 select new StringBuilder(KVP.Key).Append(" ").Append(KVP.Value); 37 int count = 0; 38 foreach (var str in outputResult) 39 { 40 Console.WriteLine(str); 41 count++; 42 if (count > N - 1) break; 43 } 44 DateTime ot = DateTime.Now; 45 Console.WriteLine("Time: " + ((ot.Minute * 60 + ot.Second) * 1000 + ot.Millisecond - (dt.Minute * 60 + dt.Second) * 1000 - dt.Millisecond) + "ms"); 46 Console.ReadKey(); 47 }); 48 WorkerTh.Start(); 49 50 51 } 52 53 public static void Read(string[] files) 54 { 55 foreach (string file in files) 56 { 57 queue.TryAdd(ReadFile(file), -1); 58 } 59 queue.TryAdd("END", -1); 60 } 61 62 public static string ReadFile(string file) 63 { 64 string readLine; 65 FileStream fs = new FileStream(file, FileMode.Open); 66 StreamReader sr = new StreamReader(fs); 67 readLine = sr.ReadToEnd(); 68 sr.Close(); 69 fs.Close(); 70 return readLine; 71 } 72 73 public static void Process() 74 { 75 string readLine; 76 while (true) 77 { 78 queue.TryTake(out readLine, -1); 79 if (readLine == "END") 80 { 81 queue.TryAdd("END", -1); 82 break; 83 } 84 Compute(readLine); 85 } 86 } 87 88 public static void Compute(string readLine) 89 { 90 string strKey = ""; 91 int value; 92 int state = 0; 93 94 for (int i = 0; i < readLine.Length; i++) 95 { 96 switch (state) 97 { 98 case 0: 99 if ((state = (readLine[i] > 'z') ? 0 : tablet[readLine[i]]) != 0) 100 { 101 sb.Clear(); 102 sb.Append(readLine[i]); 103 } 104 break; 105 default: 106 if ((state = (readLine[i] > 'z') ? 0 : tablet[readLine[i]]) == 0) 107 { 108 if (sb.Length >= 1) 109 { 110 strKey = ToLower(sb).ToString(); 111 if (tablet[strKey[0]] != 1) return; 112 if (result.TryGetValue(strKey, out value)) 113 { 114 result[strKey]++; 115 } 116 else 117 { 118 result.Add(strKey, 1); 119 } 120 } 121 } 122 else 123 sb.Append(readLine[i]); 124 break; 125 } 126 } 127 } 128 129 public static StringBuilder ToLower(StringBuilder str) 130 { 131 for (int i = 0; i < str.Length; i++) 132 { 133 if (str[i] <= 'Z') 134 { 135 str[i] = (char)((int)str[i] + 32); 136 } 137 } 138 return str; 139 } 140 }
多线程:两个compute一个io
多线程反而更慢。。。寻找原因。
请教了anran大神,大神表示lock非常占时间。。建议更换线程安全的dictionary。
果然。。速度提升了20%。。(先去吃饭,回来再写)
测试了4线程5线程…8线程
发现1个主线程1个io线程3个compute线程的速度最快
1 class Program 2 { 3 static ConcurrentDictionary<string, int> result = new ConcurrentDictionary<string, int>(1, 50000); 4 static int[] tablet = new int[128]; 5 static BlockingCollection<string> queue; 6 static Thread WorkerTh0; 7 static Thread WorkerTh1; 8 static Thread WorkerTh2; 9 static void Main(string[] args) 10 { 11 DateTime dt = DateTime.Now; 12 string rootdir = args[0]; 13 int N = Convert.ToInt32(args[1]); 14 string filePattern = args[2]; 15 16 string[] files = Directory.GetFiles(rootdir, filePattern); 17 18 for (int i = 'a'; i <= 'z'; i++) 19 { 20 tablet[i] = 1; 21 } 22 for (int i = 'A'; i <= 'Z'; i++) 23 { 24 tablet[i] = 1; 25 } 26 27 queue = new BlockingCollection<string>(100); 28 Thread FileIOth = new Thread(delegate() { Read(files); }); 29 FileIOth.Start(); 30 31 WorkerTh0 = new Thread(delegate() 32 { 33 Process(); 34 }); 35 WorkerTh0.Start(); 36 WorkerTh1 = new Thread(delegate() 37 { 38 Process(); 39 }); 40 WorkerTh1.Start(); 41 42 WorkerTh2 = new Thread(delegate() 43 { 44 Process(); 45 var outputResult = from KVP in result 46 orderby KVP.Value descending 47 select new StringBuilder(KVP.Key).Append(" ").Append(KVP.Value); 48 int count = 0; 49 foreach (var str in outputResult) 50 { 51 Console.WriteLine(str); 52 count++; 53 if (count > N - 1) break; 54 } 55 DateTime ot = DateTime.Now; 56 Console.WriteLine("Time: " + ((ot.Minute * 60 + ot.Second) * 1000 + ot.Millisecond - (dt.Minute * 60 + dt.Second) * 1000 - dt.Millisecond) + "ms"); 57 //Console.ReadKey(); 58 }); 59 WorkerTh2.Start(); 60 61 62 } 63 64 public static void Read(string[] files) 65 { 66 foreach (string file in files) 67 { 68 queue.TryAdd(ReadFile(file), -1); 69 } 70 queue.TryAdd("END", -1); 71 } 72 73 public static string ReadFile(string file) 74 { 75 string readLine; 76 FileStream fs = new FileStream(file, FileMode.Open); 77 StreamReader sr = new StreamReader(fs); 78 readLine = sr.ReadToEnd(); 79 sr.Close(); 80 fs.Close(); 81 return readLine; 82 } 83 84 public static void Process() 85 { 86 string readLine; 87 while (true) 88 { 89 queue.TryTake(out readLine, -1); 90 if (readLine == "END") 91 { 92 queue.TryAdd("END", -1); 93 break; 94 } 95 Compute(readLine); 96 } 97 } 98 99 public static void Compute(string readLine) 100 { 101 StringBuilder sb = new StringBuilder(100); 102 string strKey = ""; 103 int value; 104 int state = 0; 105 106 for (int i = 0; i < readLine.Length; i++) 107 { 108 switch (state) 109 { 110 case 0: 111 if ((state = (readLine[i] > 'z') ? 0 : tablet[readLine[i]]) != 0) 112 { 113 sb.Clear(); 114 sb.Append(readLine[i]); 115 } 116 break; 117 default: 118 if ((state = (readLine[i] > 'z') ? 0 : tablet[readLine[i]]) == 0) 119 { 120 if (sb.Length >= 1) 121 { 122 strKey = ToLower(sb).ToString(); 123 result.AddOrUpdate(strKey, 1, (k, v) => v + 1); 124 } 125 } 126 else 127 sb.Append(readLine[i]); 128 break; 129 } 130 } 131 } 132 133 public static StringBuilder ToLower(StringBuilder str) 134 { 135 for (int i = 0; i < str.Length; i++) 136 { 137 if (str[i] <= 'Z') 138 { 139 str[i] = (char)((int)str[i] + 32); 140 } 141 } 142 return str; 143 } 144 }
决定等真实数据来了再测试一下效果。