关注领域建模

追随UML,学会思考

PLINQ实现Map/Reduce模式 学习

多线程编程实战 10.5 

很多年前就听说过 Map/Reduce 模式,但一直没有机会看到具体的代码,只知道它是采用并行技术做关键字统计、排序之类的工作。理解了它之后,才发现计算机处理起来真的非常快!

人类还没有看完文章,相关的统计信息估计就出来了。

另外,关于中文的处理还是需要一定的技巧的。

 

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;

using Newtonsoft.Json;

using static System.Console;

namespace Chapter10.Recipe4
{
    /// <summary>
    /// Console demo of the Map/Reduce pattern with PLINQ: downloads a handful of
    /// Project Gutenberg books, counts word frequencies in each one in parallel,
    /// and prints the ten most used non-stop-words plus the number of unique words.
    /// </summary>
    class Program
    {
        // Characters on which a line of text is split into words.
        static readonly char[] delimiters = { ' ', ',', ';', ':', '\"', '.' };

        // One HttpClient shared by every download; creating a client per request
        // wastes sockets and can exhaust them under load.
        static readonly HttpClient httpClient = CreateHttpClient();

        /// <summary>
        /// Entry point: downloads the stop-word list, builds the statistics of
        /// every book concurrently, writes all reports to the console and waits
        /// for the user to press Enter.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            var booksList = new Dictionary<string, string>()
            {
                ["Moby Dick; Or, The Whale by Herman Melville"]
                = "https://www.gutenberg.org/cache/epub/2701/pg2701.txt",

                ["The Adventures of Tom Sawyer by Mark Twain"]
                = "https://www.gutenberg.org/cache/epub/74/pg74.txt",

                ["Treasure Island by Robert Louis Stevenson"]
                = "https://www.gutenberg.org/cache/epub/120/pg120.txt",

                ["The Picture of Dorian Gray by Oscar Wilde"]
                = "https://www.gutenberg.org/cache/epub/174/pg174.txt"
            };

            HashSet<string> stopwords = DownloadStopWordsAsync().GetAwaiter().GetResult();

            // Start one asynchronous pipeline per book. Each pipeline produces its
            // own string, so no mutable state is shared between them (the previous
            // version appended to a single StringBuilder from several threads,
            // which is not thread-safe and could interleave or corrupt the output).
            Task<string>[] reports = booksList
                .Select(book => BuildReportAsync(book.Key, book.Value, stopwords))
                .ToArray();

            // Block exactly once, at the top level of this console program.
            string[] results = Task.WhenAll(reports).GetAwaiter().GetResult();

            var output = new StringBuilder();
            foreach (string result in results)
            {
                output.Append(result);
                output.AppendLine();
            }

            Write(output.ToString());
            ReadLine();
        }

        /// <summary>
        /// Creates the shared <see cref="HttpClient"/> after making sure TLS 1.2 is
        /// enabled for outgoing requests.
        /// </summary>
        /// <returns>The client used for all downloads.</returns>
        static HttpClient CreateHttpClient()
        {
            // Add TLS 1.2 to whatever the runtime already enables instead of
            // overwriting the setting. SSL 3.0 is deliberately not enabled: it is
            // insecure and is rejected by newer runtimes.
            ServicePointManager.SecurityProtocol |= SecurityProtocolType.Tls12;
            return new HttpClient();
        }

        /// <summary>
        /// Downloads one book and produces its statistics report.
        /// </summary>
        /// <param name="title">Book title shown in the report header.</param>
        /// <param name="url">Address of the plain-text book.</param>
        /// <param name="stopwords">Words to exclude from the top-ten list.</param>
        /// <returns>The formatted report for the book.</returns>
        static async Task<string> BuildReportAsync(
            string title, string url, HashSet<string> stopwords)
        {
            string bookContent = await DownloadBookAsync(url).ConfigureAwait(false);
            return await ProcessBookAsync(bookContent, title, stopwords)
                .ConfigureAwait(false);
        }

        /// <summary>
        /// Computes the word statistics of a book on the thread pool.
        /// </summary>
        /// <param name="bookContent">Full text of the book.</param>
        /// <param name="title">Book title shown in the report header.</param>
        /// <param name="stopwords">Words to exclude from the top-ten list.</param>
        /// <returns>A task producing the formatted report.</returns>
        static Task<string> ProcessBookAsync(
            string bookContent, string title, HashSet<string> stopwords)
        {
            // The work is purely CPU-bound, so it is offloaded with Task.Run rather
            // than being an "async" method that never awaits.
            return Task.Run(() => BuildBookStats(bookContent, title, stopwords));
        }

        /// <summary>
        /// Splits the book into words, counts every distinct lower-cased word with
        /// a PLINQ Map/Reduce query and formats the report.
        /// </summary>
        /// <param name="bookContent">Full text of the book.</param>
        /// <param name="title">Book title shown in the report header.</param>
        /// <param name="stopwords">Words to exclude from the top-ten list.</param>
        /// <returns>The formatted report.</returns>
        static string BuildBookStats(
            string bookContent, string title, HashSet<string> stopwords)
        {
            using (var reader = new StringReader(bookContent))
            {
                var query = reader.EnumLines()
                    .AsParallel()
                    // Drop empty tokens up front so that "" is not counted as a word.
                    .SelectMany(line => line.Split(
                        delimiters, StringSplitOptions.RemoveEmptyEntries))
                    .MapReduce(
                        // Invariant lower-casing keeps the keys independent of the
                        // current culture (e.g. the Turkish dotless i).
                        word => new[] { word.ToLowerInvariant() },
                        key => key,
                        g => new[] { new { Word = g.Key, Count = g.Count() } }
                    )
                    .ToList();

                var words = query
                    .Where(element =>
                        !string.IsNullOrWhiteSpace(element.Word)
                        && !stopwords.Contains(element.Word))
                    .OrderByDescending(element => element.Count);

                var sb = new StringBuilder();

                sb.AppendLine($"'{title}' book stats");
                sb.AppendLine("Top ten words used in this book: ");
                foreach (var w in words.Take(10))
                {
                    sb.AppendLine($"Word: '{w.Word}', times used: '{w.Count}'");
                }

                // Counts every distinct token, stop-words included.
                sb.AppendLine($"Unique Words used: {query.Count}");

                return sb.ToString();
            }
        }

        /// <summary>
        /// Downloads the text of a book.
        /// </summary>
        /// <param name="bookUrl">Address of the plain-text book.</param>
        /// <returns>The response body as a string.</returns>
        static async Task<string> DownloadBookAsync(string bookUrl)
        {
            return await httpClient.GetStringAsync(bookUrl).ConfigureAwait(false);
        }

        /// <summary>
        /// Downloads the multi-language stop-word list and returns its English
        /// ("en") entries. This is best effort: when the list cannot be downloaded
        /// or parsed, a warning is written to the error stream and an empty set is
        /// returned, so no words are filtered out.
        /// </summary>
        /// <returns>The English stop-words, or an empty set on failure.</returns>
        static async Task<HashSet<string>> DownloadStopWordsAsync()
        {
            string url =
                "https://raw.githubusercontent.com/6/stopwords/master/stopwords-all.json";

            try
            {
                var content = await httpClient.GetStringAsync(url).ConfigureAwait(false);
                var words =
                    JsonConvert.DeserializeObject
                        <Dictionary<string, string[]>>(content);

                string[] english;
                if (words != null && words.TryGetValue("en", out english) && english != null)
                {
                    return new HashSet<string>(english);
                }

                Console.Error.WriteLine(
                    "Warning: stop-word list has no 'en' entry; continuing without stop-words.");
            }
            catch (Exception ex) when (
                ex is HttpRequestException
                || ex is TaskCanceledException
                || ex is JsonException)
            {
                // Only expected download/parse failures are tolerated; anything
                // else is a programming error and is allowed to propagate.
                Console.Error.WriteLine(
                    $"Warning: could not load stop-words ({ex.Message}); continuing without them.");
            }

            return new HashSet<string>();
        }
    }

    /// <summary>
    /// Helper extension methods used by the word-count demo.
    /// </summary>
    static class Extensions
    {
        /// <summary>
        /// Runs a Map/Reduce pipeline on a parallel query: every source item is
        /// expanded by <paramref name="map"/>, the mapped items are grouped by
        /// <paramref name="keySelector"/>, and each group is turned into results
        /// by <paramref name="reduce"/>.
        /// </summary>
        /// <typeparam name="TSource">Type of the source items.</typeparam>
        /// <typeparam name="TMapped">Type of the items produced by the map step.</typeparam>
        /// <typeparam name="TKey">Type of the grouping key.</typeparam>
        /// <typeparam name="TResult">Type of the items produced by the reduce step.</typeparam>
        /// <param name="source">The parallel query to process.</param>
        /// <param name="map">Expands one source item into zero or more mapped items.</param>
        /// <param name="keySelector">Extracts the grouping key of a mapped item.</param>
        /// <param name="reduce">Turns one group into zero or more results.</param>
        /// <returns>A parallel query over the flattened reduce results.</returns>
        public static ParallelQuery<TResult> MapReduce<TSource, TMapped, TKey, TResult>(
            this ParallelQuery<TSource> source,
            Func<TSource, IEnumerable<TMapped>> map,
            Func<TMapped, TKey> keySelector,
            Func<IGrouping<TKey, TMapped>, IEnumerable<TResult>> reduce)
        {
            // Map step: flatten the per-item sequences into one stream.
            ParallelQuery<TMapped> mapped = source.SelectMany(map);

            // Shuffle step: bucket the mapped items by key.
            ParallelQuery<IGrouping<TKey, TMapped>> grouped = mapped.GroupBy(keySelector);

            // Reduce step: collapse every bucket and flatten the results.
            return grouped.SelectMany(reduce);
        }

        /// <summary>
        /// Lazily yields the remaining lines of <paramref name="reader"/>, one at
        /// a time, until the reader reports the end of its text.
        /// </summary>
        /// <param name="reader">The reader to consume.</param>
        /// <returns>A deferred sequence of the lines read.</returns>
        public static IEnumerable<string> EnumLines(this StringReader reader)
        {
            string current;
            while ((current = reader.ReadLine()) != null)
            {
                yield return current;
            }
        }
    }
}

 

posted on 2022-08-05 09:08  Beewolf  阅读(30)  评论(0)  编辑  收藏  举报

导航