1、找出一段文字中出现次数最多的前10个单词以及次数
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace ConsoleApplication1
{
    /// <summary>
    /// Demo: find the ten most frequent words in a piece of text, together with their counts.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Counts the words of a fixed sample sentence and prints the ten most frequent ones.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            string str = "wo men wo men wo ni hao ni hao we er ty ui o pp pp pp aa aa aa";
            Dictionary<string, int> dic = WordCount(str);

            // Dictionary enumeration order is not a documented guarantee, so sort again for display.
            foreach (KeyValuePair<string, int> pair in dic.OrderByDescending(r => r.Value))
            {
                Console.WriteLine("{0}: {1}", pair.Key, pair.Value);
            }
        }

        /// <summary>
        /// Counts how often each whitespace-separated word occurs in <paramref name="mes"/>
        /// and returns the (at most) ten most frequent words.
        /// </summary>
        /// <param name="mes">Text to analyse. A null or empty string yields an empty result.</param>
        /// <returns>
        /// A dictionary mapping each of the top ten words to its number of occurrences.
        /// Words with equal counts are kept in order of first appearance when the cut-off
        /// falls between them, as far as the counting dictionary enumerates in insertion order.
        /// </returns>
        static Dictionary<string, int> WordCount(string mes)
        {
            Dictionary<string, int> counts = new Dictionary<string, int>();
            if (string.IsNullOrEmpty(mes))
            {
                return counts;
            }

            // A null separator array makes Split treat every white-space character as a delimiter,
            // so tabs and line breaks separate words just like spaces do.
            string[] words = mes.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
            foreach (string word in words)
            {
                // Single lookup: 'seen' stays 0 when the word is new.
                int seen;
                counts.TryGetValue(word, out seen);
                counts[word] = seen + 1;
            }

            // Take the top ten from the *sorted sequence*; sorting into a Dictionary first and
            // then calling Take(10) on it would depend on unspecified enumeration order.
            return counts
                .OrderByDescending(r => r.Value)
                .Take(10)
                .ToDictionary(k => k.Key, v => v.Value);
        }
    }
}
2、自己练习了一下从大文件读取,统计单词重复次数
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Threading.Tasks.Dataflow;

namespace readbigfile
{
    /// <summary>
    /// Counts word occurrences in one large text file using a reader task and a counting
    /// task that are connected through a TPL Dataflow <c>BufferBlock</c>.
    /// </summary>
    class Program
    {
        // Number of characters read from the file per chunk (32M characters).
        const int ChunkChars = 32 * 1024 * 1024;

        // Queue between reader and counter. The capacity is bounded so that a fast reader
        // cannot pile the whole file up in memory while the counter is still busy.
        static System.Threading.Tasks.Dataflow.BufferBlock<string> m_buffer =
            new System.Threading.Tasks.Dataflow.BufferBlock<string>(
                new System.Threading.Tasks.Dataflow.DataflowBlockOptions { BoundedCapacity = 4 });

        // Word -> number of occurrences. Only the single Process() consumer writes to it.
        static Dictionary<string, int> dicAll = new Dictionary<string, int>();

        /// <summary>
        /// Starts the reader and the counter, waits for both, then prints the ten most
        /// frequent words with their counts.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            string fileName = @"C:\plan\Data1G.txt";

            // The file is large, so one task reads while another one counts.
            Task reader = Task.Run(() => ReadFile(fileName));
            Task counter = Process();

            // Process() returns a Task, so this really waits until counting has finished;
            // a failure in either task surfaces here as an AggregateException.
            Task.WaitAll(reader, counter);

            // Sort by count and keep the ten most frequent words.
            List<KeyValuePair<string, int>> top = dicAll
                .OrderByDescending(v => v.Value)
                .Take(10)
                .ToList();

            foreach (KeyValuePair<string, int> pair in top)
            {
                Console.WriteLine("{0}: {1}", pair.Key, pair.Value);
            }
        }

        /// <summary>
        /// Reads <paramref name="filename"/> in chunks and sends each chunk to the shared buffer.
        /// Every chunk except the last one ends on a white-space character, so no word is
        /// split between two chunks. The buffer is always marked complete when reading ends,
        /// even if reading fails.
        /// </summary>
        /// <param name="filename">Path of the text file to read.</param>
        public static void ReadFile(string filename)
        {
            try
            {
                using (System.IO.FileStream fs = new System.IO.FileStream(filename, FileMode.Open, System.IO.FileAccess.Read))
                using (StreamReader sr = new StreamReader(fs))
                {
                    char[] block = new char[ChunkChars];
                    string carry = string.Empty; // unfinished word held back from the previous block
                    int read;

                    while ((read = sr.ReadBlock(block, 0, block.Length)) > 0)
                    {
                        // ReadBlock only returns fewer characters than requested at end of file.
                        bool lastBlock = read < block.Length;

                        // For a full block, back up to just after the last white-space character;
                        // whatever follows may be the first half of a word.
                        int cut = read;
                        if (!lastBlock)
                        {
                            while (cut > 0 && !char.IsWhiteSpace(block[cut - 1]))
                            {
                                cut--;
                            }
                        }

                        // Only the first 'read' characters are valid; never use the whole array.
                        string tail = new string(block, cut, read - cut);
                        if (cut == 0)
                        {
                            // No white space in the whole block: it is all part of one long word.
                            carry += tail;
                            continue;
                        }

                        string chunk = carry + new string(block, 0, cut);
                        carry = tail;

                        // SendAsync waits while the bounded buffer is full. Blocking on it is
                        // acceptable because this synchronous method runs on its own task.
                        if (!m_buffer.SendAsync(chunk).Result)
                        {
                            return; // the buffer was completed or faulted; nobody is listening
                        }
                    }

                    if (carry.Length > 0)
                    {
                        bool accepted = m_buffer.SendAsync(carry).Result;
                    }
                }
            }
            finally
            {
                // Without this the consumer would wait forever after a read error.
                m_buffer.Complete();
            }
        }

        /// <summary>
        /// Receives chunks from the shared buffer until it is completed and adds the
        /// white-space-separated words of every chunk to <c>dicAll</c>.
        /// </summary>
        /// <returns>A task that completes when all chunks have been counted.</returns>
        private static async Task Process()
        {
            try
            {
                while (await m_buffer.OutputAvailableAsync())
                {
                    string receive;
                    while (m_buffer.TryReceive(out receive))
                    {
                        if (string.IsNullOrEmpty(receive))
                        {
                            continue; // an empty chunk must not end the whole run
                        }

                        // A null separator array splits on every white-space character.
                        string[] words = receive.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
                        foreach (string word in words)
                        {
                            int seen;
                            dicAll.TryGetValue(word, out seen);
                            dicAll[word] = seen + 1;
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                // Fault the buffer so a reader waiting in SendAsync is released, then rethrow
                // with the original stack trace.
                ((IDataflowBlock)m_buffer).Fault(ex);
                throw;
            }
        }
    }
}
3、数据来源于多个文件:在2的基础上做了优化,处理部分改为使用多个 task 并行处理。测试结果显示处理时间与2相差不大,但还是记录一下。
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Threading.Tasks.Dataflow;

namespace readbigtext2
{
    /// <summary>
    /// Counts word occurrences across all files of a folder: one task reads the files in
    /// chunks, several worker tasks count words, and the partial results are merged at the end.
    /// </summary>
    class Program
    {
        // Number of characters read per chunk (32K characters).
        const int ChunkChars = 32 * 1024;

        // Number of counting workers that consume the buffer concurrently.
        const int WorkerCount = 10;

        // Queue between the reader and the workers. The capacity is bounded so that the reader
        // cannot load all files into memory ahead of the workers.
        static System.Threading.Tasks.Dataflow.BufferBlock<string> m_buffer =
            new System.Threading.Tasks.Dataflow.BufferBlock<string>(
                new System.Threading.Tasks.Dataflow.DataflowBlockOptions { BoundedCapacity = 256 });

        /// <summary>
        /// Runs the reader and the workers, merges the per-worker counts, and prints the elapsed
        /// time followed by the ten most frequent words.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // Original author's notes, measured on about 1 GB of data:
            // 1. A single thread that both reads and processes ran out of memory.
            // 2. Two threads (one reading, one processing) took about 14 s.
            // 3. Ten processing tasks took about as long as a single one.
            System.Diagnostics.Stopwatch st = new System.Diagnostics.Stopwatch();
            st.Start();

            string folderPath = @"C:\Users\xiaochun-zhai\Documents\bigtext";
            Dictionary<string, int> dic = new Dictionary<string, int>();

            Task reader = Task.Run(() => ReadFile(folderPath));

            // Start every worker through Task.Run. Calling Process() directly would let a worker
            // run synchronously on this thread whenever data is already buffered, which
            // serialises the workers instead of running them in parallel.
            List<Task<Dictionary<string, int>>> workers = new List<Task<Dictionary<string, int>>>();
            for (int i = 0; i < WorkerCount; i++)
            {
                workers.Add(Task.Run<Dictionary<string, int>>(() => Process()));
            }

            Task.WaitAll(workers.ToArray());
            reader.Wait(); // surfaces a read failure instead of leaving it unobserved

            // Merge the per-worker dictionaries into one.
            foreach (Task<Dictionary<string, int>> worker in workers)
            {
                foreach (KeyValuePair<string, int> row in worker.Result)
                {
                    int seen;
                    dic.TryGetValue(row.Key, out seen);
                    dic[row.Key] = seen + row.Value;
                }
            }

            // Sort by count and keep the ten most frequent words.
            List<KeyValuePair<string, int>> top = dic
                .OrderByDescending(v => v.Value)
                .Take(10)
                .ToList();

            st.Stop();
            Console.WriteLine(st.ElapsedMilliseconds);
            foreach (KeyValuePair<string, int> pair in top)
            {
                Console.WriteLine("{0}: {1}", pair.Key, pair.Value);
            }
            Console.ReadLine();
        }

        /// <summary>
        /// Reads every file directly inside the folder <paramref name="filepath"/> and sends its
        /// text to the shared buffer in chunks, printing each file's full name once it has been
        /// read. The buffer is always marked complete when reading ends, even if reading fails.
        /// </summary>
        /// <param name="filepath">Path of the folder whose files are read.</param>
        public static void ReadFile(string filepath)
        {
            try
            {
                DirectoryInfo dif = new DirectoryInfo(filepath);
                char[] block = new char[ChunkChars]; // reused for every read

                foreach (FileInfo item in dif.GetFiles())
                {
                    if (!SendFileChunks(item.FullName, block))
                    {
                        return; // the buffer was completed or faulted; nobody is listening
                    }
                    Console.WriteLine(item.FullName);
                }
            }
            finally
            {
                // Without this the workers would wait forever after a read error.
                m_buffer.Complete();
            }
        }

        // Sends the text of one file to the buffer in chunks that end on white space, so that
        // no word is split between two chunks. Returns false if the buffer declined a chunk.
        private static bool SendFileChunks(string path, char[] block)
        {
            using (System.IO.FileStream fs = new System.IO.FileStream(path, FileMode.Open, System.IO.FileAccess.Read))
            using (StreamReader sr = new StreamReader(fs))
            {
                string carry = string.Empty; // unfinished word held back from the previous block
                int read;

                while ((read = sr.ReadBlock(block, 0, block.Length)) > 0)
                {
                    // ReadBlock only returns fewer characters than requested at end of file.
                    bool lastBlock = read < block.Length;

                    int cut = read;
                    if (!lastBlock)
                    {
                        while (cut > 0 && !char.IsWhiteSpace(block[cut - 1]))
                        {
                            cut--;
                        }
                    }

                    // Only the first 'read' characters are valid; the rest of the array is
                    // stale or zero-filled and must not reach the word counter.
                    string tail = new string(block, cut, read - cut);
                    if (cut == 0)
                    {
                        // No white space in the whole block: it is all part of one long word.
                        carry += tail;
                        continue;
                    }

                    string chunk = carry + new string(block, 0, cut);
                    carry = tail;

                    // SendAsync waits while the bounded buffer is full; Post would drop the chunk.
                    if (!m_buffer.SendAsync(chunk).Result)
                    {
                        return false;
                    }
                }

                // A file ends a word even without trailing white space, so flush the remainder
                // here rather than joining it with the next file.
                if (carry.Length > 0 && !m_buffer.SendAsync(carry).Result)
                {
                    return false;
                }
            }

            return true;
        }

        /// <summary>
        /// Receives chunks from the shared buffer until it is completed and counts the
        /// white-space-separated words of the chunks this worker obtained.
        /// </summary>
        /// <returns>The word counts accumulated by this worker.</returns>
        private static async Task<Dictionary<string, int>> Process()
        {
            Dictionary<string, int> dicAll = new Dictionary<string, int>();
            try
            {
                while (await m_buffer.OutputAvailableAsync())
                {
                    // Another worker may have taken the chunk in the meantime; in that case
                    // TryReceive fails and this worker simply waits for the next one.
                    string receive;
                    if (!m_buffer.TryReceive(out receive) || string.IsNullOrEmpty(receive))
                    {
                        continue;
                    }

                    // A null separator array splits on every white-space character.
                    string[] words = receive.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
                    foreach (string word in words)
                    {
                        int seen;
                        dicAll.TryGetValue(word, out seen);
                        dicAll[word] = seen + 1;
                    }
                }
            }
            catch (Exception ex)
            {
                // Fault the buffer so a reader waiting in SendAsync is released, then rethrow
                // with the original stack trace ("throw ex" would reset it).
                ((IDataflowBlock)m_buffer).Fault(ex);
                throw;
            }

            return dicAll;
        }
    }
}
4、如果文件太大,不能一次读入内存,就需要使用外部归并排序了(先分块处理,再把各块的结果归并)。
5、扩展
统计大文件里频率最高的10个单词(C# TPL Dataflow 版)