1、找出一段文字中出现次数最多的前10个单词以及次数

 

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace ConsoleApplication1
{
    /// <summary>
    /// Demo 1: finds the ten most frequent words in an in-memory string.
    /// </summary>
    class Program
    {
        // Maximum number of words returned by WordCount.
        private const int TopCount = 10;

        /// <summary>
        /// Counts the words of a sample sentence and prints the most frequent ones.
        /// </summary>
        static void Main(string[] args)
        {
            string str = "wo men wo men wo ni hao ni hao we er ty ui o pp pp pp aa aa aa";
            Dictionary<string, int> dic = WordCount(str);

            // The result used to be computed and then discarded; print it so the program
            // has a visible outcome. Sort again here because the enumeration order of a
            // Dictionary is not guaranteed.
            foreach (KeyValuePair<string, int> pair in dic.OrderByDescending(p => p.Value))
            {
                Console.WriteLine("{0}: {1}", pair.Key, pair.Value);
            }
        }

        /// <summary>
        /// Counts how often each space-separated word occurs in <paramref name="mes"/>
        /// and keeps the ten most frequent ones.
        /// </summary>
        /// <param name="mes">Text to analyse; words are separated by single spaces.</param>
        /// <returns>
        /// A dictionary mapping each of the (at most) ten most frequent words to its count.
        /// Empty when <paramref name="mes"/> is null or empty.
        /// </returns>
        static Dictionary<string, int> WordCount(string mes)
        {
            Dictionary<string, int> counts = new Dictionary<string, int>();
            if (string.IsNullOrEmpty(mes))
            {
                return counts;
            }

            string[] words = mes.Split(new string[] { " " }, StringSplitOptions.RemoveEmptyEntries);
            foreach (string word in words)
            {
                // TryGetValue leaves 'seen' at 0 for a new word, so one lookup covers both cases.
                int seen;
                counts.TryGetValue(word, out seen);
                counts[word] = seen + 1;
            }

            // Pick the top entries from the ordered sequence itself. Sorting into a
            // Dictionary first and calling Take(10) on that dictionary would depend on
            // Dictionary enumeration order, which is undefined.
            return counts
                .OrderByDescending(p => p.Value)
                .Take(TopCount)
                .ToDictionary(p => p.Key, p => p.Value);
        }
    }
}

2、自己练习了一下从大文件读取,统计单词重复次数

 

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Threading.Tasks.Dataflow;

namespace readbigfile
{
    /// <summary>
    /// Demo 2: counts word frequencies in one large text file. A reader task cuts the
    /// file into chunks and a processing task counts the words of each chunk; the two
    /// are connected by a TPL Dataflow buffer.
    /// </summary>
    class Program
    {
        // Number of characters read from the file per chunk (32M chars).
        const int ChunkSize = 32 * 1024 * 1024;

        // Bounded so that the reader cannot queue the whole file in memory when it is
        // faster than the processor. This back-pressure replaces the fixed one-second
        // sleep that previously followed every chunk.
        static BufferBlock<string> m_buffer =
            new BufferBlock<string>(new DataflowBlockOptions { BoundedCapacity = 2 });

        // Word -> occurrence count. Written only by Process, read by Main after both tasks end.
        static Dictionary<string, int> dicAll = new Dictionary<string, int>();

        /// <summary>
        /// Runs the reader and the processor concurrently, then prints the ten most
        /// frequent words with their counts.
        /// </summary>
        /// <param name="args">Optional: args[0] overrides the default input file path.</param>
        static void Main(string[] args)
        {
            string fileName = args.Length > 0 ? args[0] : @"C:\plan\Data1G.txt";

            // The file is large, so one task reads while another one processes.
            // LongRunning: the reader blocks on file I/O and on the bounded buffer.
            Task t1 = Task.Factory.StartNew(() => ReadFile(fileName), TaskCreationOptions.LongRunning);

            // Process returns a Task that completes only when every chunk has been counted.
            // (With 'async void' wrapped in StartNew the task ended at the first await and
            // the dictionary was sorted while it was still being filled.)
            Task t2 = Process();
            Task.WaitAll(t1, t2);

            // Take the top ten straight from the ordered sequence; Dictionary order is undefined.
            foreach (KeyValuePair<string, int> pair in dicAll.OrderByDescending(v => v.Value).Take(10))
            {
                Console.WriteLine("{0}: {1}", pair.Key, pair.Value);
            }
        }

        /// <summary>
        /// Reads <paramref name="filename"/> in chunks of up to <c>ChunkSize</c> characters
        /// and sends each chunk to the buffer. Chunks are cut at the last space so that a
        /// word is never split across two chunks. The buffer is always marked complete,
        /// even when reading fails, so the processing task can terminate.
        /// </summary>
        /// <param name="filename">Path of the text file to read.</param>
        public static void ReadFile(string filename)
        {
            try
            {
                using (FileStream fs = new FileStream(filename, FileMode.Open, FileAccess.Read))
                using (StreamReader sr = new StreamReader(fs))
                {
                    char[] buffer = new char[ChunkSize];
                    int filled = 0; // chars at the start of 'buffer' carried over from the previous chunk

                    while (true)
                    {
                        // ReadBlock returns fewer chars than requested only at end of file.
                        int read = sr.ReadBlock(buffer, filled, buffer.Length - filled);
                        filled += read;

                        if (read == 0)
                        {
                            // End of file: whatever is left is the final chunk.
                            SendChunk(buffer, filled);
                            break;
                        }

                        int cut = Array.LastIndexOf(buffer, ' ', filled - 1, filled);
                        if (cut < 0)
                        {
                            // No separator at all. If the buffer is full the token is longer
                            // than a chunk; flush it as-is so the loop keeps making progress.
                            if (filled == buffer.Length)
                            {
                                SendChunk(buffer, filled);
                                filled = 0;
                            }
                            continue;
                        }

                        // Send everything before the last space; keep the (possibly partial)
                        // word after it at the front of the buffer for the next round.
                        SendChunk(buffer, cut);
                        filled -= cut + 1;
                        Array.Copy(buffer, cut + 1, buffer, 0, filled);
                    }
                }
            }
            finally
            {
                m_buffer.Complete();
            }
        }

        // Sends the first 'length' chars of 'buffer' as one chunk; empty chunks are skipped.
        static void SendChunk(char[] buffer, int length)
        {
            if (length > 0)
            {
                // Only the chars actually read are used, so no '\0' padding reaches the counter.
                // Wait() blocks this dedicated reader thread while the bounded buffer is full.
                m_buffer.SendAsync(new string(buffer, 0, length)).Wait();
            }
        }

        /// <summary>
        /// Receives chunks until the buffer is completed and adds the occurrences of each
        /// space-separated word to <c>dicAll</c>.
        /// </summary>
        /// <returns>A task that completes when all chunks have been processed.</returns>
        private static async Task Process()
        {
            while (await m_buffer.OutputAvailableAsync())
            {
                string receive;
                if (!m_buffer.TryReceive(out receive) || string.IsNullOrEmpty(receive))
                {
                    // Skip an empty chunk instead of abandoning the remaining ones.
                    continue;
                }

                string[] words = receive.Split(new string[] { " " }, StringSplitOptions.RemoveEmptyEntries);
                foreach (string word in words)
                {
                    int seen;
                    dicAll.TryGetValue(word, out seen);
                    dicAll[word] = seen + 1;
                }
            }
        }
    }
}

3、数据来源于多个文件。在 2 的基础上做了优化:处理阶段改用多个 Task 并行处理。测试结果显示处理时间与 2 相差不大,但还是记录一下。

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Threading.Tasks.Dataflow;


namespace readbigtext2
{
    /// <summary>
    /// Demo 3: counts word frequencies over every file of a directory. One reader task
    /// feeds text chunks into a TPL Dataflow buffer and several worker tasks count the
    /// words of the chunks they receive; the partial counts are merged at the end.
    /// </summary>
    class Program
    {
        // Number of characters read per chunk (32K chars).
        const int ChunkSize = 32 * 1024;

        // Number of concurrent counting tasks.
        const int WorkerCount = 10;

        static BufferBlock<string> m_buffer = new BufferBlock<string>();

        /// <summary>
        /// Starts the reader and the workers, merges the per-worker counts, and prints
        /// the elapsed time followed by the ten most frequent words.
        /// </summary>
        /// <param name="args">Optional: args[0] overrides the default input directory.</param>
        static void Main(string[] args)
        {
            // Author's notes for about 1 GB of data:
            // 1. Single thread (read and process together): runs out of memory.
            // 2. Two threads, one reading and one processing: about 14 s.
            // 3. Ten processing tasks: measured time is about the same as with one task.
            System.Diagnostics.Stopwatch st = new System.Diagnostics.Stopwatch();
            st.Start();

            string folderPath = args.Length > 0 ? args[0] : @"C:\Users\xiaochun-zhai\Documents\bigtext";
            Task reader = Task.Factory.StartNew(() => ReadFile(folderPath));

            List<Task<Dictionary<string, int>>> workers = new List<Task<Dictionary<string, int>>>();
            for (int i = 0; i < WorkerCount; i++)
            {
                workers.Add(Process());
            }

            Task.WaitAll(workers.ToArray());
            // Observe the reader as well so that an I/O failure is reported instead of lost.
            reader.Wait();

            // Merge the per-worker dictionaries.
            Dictionary<string, int> dic = new Dictionary<string, int>();
            foreach (Task<Dictionary<string, int>> worker in workers)
            {
                foreach (KeyValuePair<string, int> row in worker.Result)
                {
                    int seen;
                    dic.TryGetValue(row.Key, out seen);
                    dic[row.Key] = seen + row.Value;
                }
            }

            // Ten most frequent words, taken from the ordered sequence itself because
            // Dictionary enumeration order is undefined.
            List<KeyValuePair<string, int>> top = dic.OrderByDescending(v => v.Value).Take(10).ToList();

            st.Stop();
            Console.WriteLine(st.ElapsedMilliseconds);

            foreach (KeyValuePair<string, int> pair in top)
            {
                Console.WriteLine("{0}: {1}", pair.Key, pair.Value);
            }

            Console.ReadLine();
        }

        /// <summary>
        /// Reads every file directly inside <paramref name="filepath"/> in chunks of up to
        /// <c>ChunkSize</c> characters and posts each chunk to the buffer. Chunks are cut at
        /// the last space so that a word is never split across two chunks. The buffer is
        /// always marked complete, even when reading fails, so the workers can terminate;
        /// exceptions propagate to the caller with their original stack trace.
        /// </summary>
        /// <param name="filepath">Directory whose files are read.</param>
        public static void ReadFile(string filepath)
        {
            try
            {
                DirectoryInfo dif = new DirectoryInfo(filepath);
                char[] buffer = new char[ChunkSize];

                foreach (FileInfo item in dif.GetFiles())
                {
                    using (FileStream fs = new FileStream(item.FullName, FileMode.Open, FileAccess.Read))
                    using (StreamReader sr = new StreamReader(fs))
                    {
                        int filled = 0; // chars at the start of 'buffer' carried over from the previous chunk

                        while (true)
                        {
                            // ReadBlock returns fewer chars than requested only at end of file.
                            int read = sr.ReadBlock(buffer, filled, buffer.Length - filled);
                            filled += read;

                            if (read == 0)
                            {
                                // End of this file: whatever is left is its final chunk.
                                PostChunk(buffer, filled);
                                break;
                            }

                            int cut = Array.LastIndexOf(buffer, ' ', filled - 1, filled);
                            if (cut < 0)
                            {
                                // No separator at all. If the buffer is full the token is longer
                                // than a chunk; flush it as-is so the loop keeps making progress.
                                if (filled == buffer.Length)
                                {
                                    PostChunk(buffer, filled);
                                    filled = 0;
                                }
                                continue;
                            }

                            // Post everything before the last space; keep the (possibly partial)
                            // word after it at the front of the buffer for the next round.
                            PostChunk(buffer, cut);
                            filled -= cut + 1;
                            Array.Copy(buffer, cut + 1, buffer, 0, filled);
                        }
                    }
                    Console.WriteLine(item.FullName);
                }
            }
            finally
            {
                m_buffer.Complete();
            }
        }

        // Posts the first 'length' chars of 'buffer', trimmed, as one chunk; empty chunks are skipped.
        static void PostChunk(char[] buffer, int length)
        {
            if (length <= 0)
            {
                return;
            }

            // Only the chars actually read are used, so no '\0' padding reaches the counters.
            string text = new string(buffer, 0, length).Trim();
            if (text.Length > 0)
            {
                m_buffer.Post(text);
            }
        }

        /// <summary>
        /// Receives chunks until the buffer is completed and counts the space-separated
        /// words of every chunk this worker obtained.
        /// </summary>
        /// <returns>The word counts accumulated by this worker.</returns>
        private static async Task<Dictionary<string, int>> Process()
        {
            Dictionary<string, int> counts = new Dictionary<string, int>();

            while (await m_buffer.OutputAvailableAsync())
            {
                string receive;
                // Another worker may have taken the chunk between the availability signal
                // and this call, in which case TryReceive fails and we simply wait again.
                if (!m_buffer.TryReceive(out receive) || string.IsNullOrEmpty(receive))
                {
                    continue;
                }

                string[] words = receive.Split(new string[] { " " }, StringSplitOptions.RemoveEmptyEntries);
                foreach (string word in words)
                {
                    int seen;
                    counts.TryGetValue(word, out seen);
                    counts[word] = seen + 1;
                }
            }

            return counts;
        }
    }
}

4、如果文件比较大,不能一次读入内存,那就需要使用外部排序(归并排序):先把文件拆分成若干能放入内存的小块分别统计、排序,再把各块的结果归并起来。

5、扩展

统计大文件里,频率最高的10个单词,(C# TPL DataFlow版)

 

posted on 2015-12-09 23:25  AmyAlisa  阅读(1971)  评论(0)  编辑  收藏  举报