PLINQ实现Map/Reduce模式 学习
多线程编程实战 10.5
很多年前就听说过 Map/Reduce 模式,但一直没有机会看到具体的实现代码,只知道它是利用并行技术对关键字进行统计、排序之类的处理。理解了这个模式之后,才发现计算机的处理速度真的非常快!
人还没有读完文章,相关的统计信息估计就已经计算出来了。
另外,中文文本的处理还需要一定的技巧:中文词语之间没有空格,不能像下面的代码那样直接按分隔符拆分单词,需要先进行分词。
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;
using Newtonsoft.Json;
using static System.Console;

namespace Chapter10.Recipe4
{
    /// <summary>
    /// Downloads a few public-domain books, counts word frequencies in each one
    /// with a PLINQ-based Map/Reduce pipeline, and prints a short report per book.
    /// </summary>
    class Program
    {
        // Characters treated as word separators when a line is split into words.
        static readonly char[] delimiters = { ' ', ',', ';', ':', '\"', '.' };

        // A single HttpClient is shared by every download instead of creating
        // (and disposing) a new client, and its connections, per request.
        static readonly HttpClient httpClient = new HttpClient();

        /// <summary>
        /// Entry point: downloads the stop-word list, analyzes all books
        /// concurrently, prints the reports and waits for a key press.
        /// </summary>
        static void Main(string[] args)
        {
            var booksList = new Dictionary<string, string>()
            {
                ["Moby Dick; Or, The Whale by Herman Melville"]
                    = "https://www.gutenberg.org/cache/epub/2701/pg2701.txt",

                ["The Adventures of Tom Sawyer by Mark Twain"]
                    = "https://www.gutenberg.org/cache/epub/74/pg74.txt",

                ["Treasure Island by Robert Louis Stevenson"]
                    = "https://www.gutenberg.org/cache/epub/120/pg120.txt",

                ["The Picture of Dorian Gray by Oscar Wilde"]
                    = "https://www.gutenberg.org/cache/epub/174/pg174.txt"
            };

            HashSet<string> stopwords = DownloadStopWordsAsync().GetAwaiter().GetResult();

            // Every book is downloaded and analyzed concurrently. Each task returns
            // its own report string, so no mutable state is shared between them
            // (StringBuilder is not safe for concurrent appends), and
            // Task.WhenAll hands the reports back in the order the books were
            // enumerated, which keeps the output order stable.
            string[] reports = Task.WhenAll(
                    booksList.Select(book => AnalyzeBookAsync(book.Key, book.Value, stopwords)))
                .GetAwaiter().GetResult();

            var output = new StringBuilder();
            foreach (string report in reports)
            {
                output.Append(report);
                output.AppendLine();
            }

            Write(output.ToString());
            ReadLine();
        }

        /// <summary>
        /// Downloads one book and builds its statistics report.
        /// </summary>
        /// <param name="title">Title printed in the report header.</param>
        /// <param name="bookUrl">URL of the plain-text book.</param>
        /// <param name="stopwords">Words excluded from the top-ten list.</param>
        /// <returns>The formatted report for the book.</returns>
        static async Task<string> AnalyzeBookAsync(
            string title, string bookUrl, HashSet<string> stopwords)
        {
            string bookContent = await DownloadBookAsync(bookUrl);
            return await ProcessBookAsync(bookContent, title, stopwords);
        }

        /// <summary>
        /// Counts the words of a book with the Map/Reduce extension and formats a
        /// report containing the ten most frequent non-stop-words and the number
        /// of unique words.
        /// </summary>
        /// <param name="bookContent">Full text of the book.</param>
        /// <param name="title">Title printed in the report header.</param>
        /// <param name="stopwords">Words excluded from the top-ten list.</param>
        /// <returns>
        /// An already-completed task holding the report; the work is CPU-bound
        /// and is performed synchronously before the method returns.
        /// </returns>
        static Task<string> ProcessBookAsync(
            string bookContent, string title, HashSet<string> stopwords)
        {
            using (var reader = new StringReader(bookContent))
            {
                var wordCounts = reader.EnumLines()
                    .AsParallel()
                    // Adjacent delimiters (", ") would otherwise yield empty
                    // tokens that get counted as a "word".
                    .SelectMany(line => line.Split(delimiters, StringSplitOptions.RemoveEmptyEntries))
                    .Where(word => !string.IsNullOrWhiteSpace(word))
                    .MapReduce(
                        // Invariant lower-casing: the grouping must not depend on
                        // the culture of the machine running the program.
                        word => new[] { word.ToLowerInvariant() },
                        key => key,
                        g => new[] { new { Word = g.Key, Count = g.Count() } })
                    .ToList();

                var topWords = wordCounts
                    .Where(element => !stopwords.Contains(element.Word))
                    .OrderByDescending(element => element.Count)
                    // The parallel grouping returns groups in no particular order;
                    // the tie-break makes equal counts print in a stable order.
                    .ThenBy(element => element.Word, StringComparer.Ordinal)
                    .Take(10);

                var sb = new StringBuilder();
                sb.AppendLine($"'{title}' book stats");
                sb.AppendLine("Top ten words used in this book: ");
                foreach (var w in topWords)
                {
                    sb.AppendLine($"Word: '{w.Word}', times used: '{w.Count}'");
                }
                sb.AppendLine($"Unique Words used: {wordCounts.Count}");

                return Task.FromResult(sb.ToString());
            }
        }

        /// <summary>
        /// Selects the security protocols used for HTTPS requests made through
        /// <see cref="ServicePointManager"/>.
        /// </summary>
        static void ConfigureSecurityProtocol()
        {
            // The target framework of the project is unknown, so several TLS
            // versions are enabled explicitly. SSL 3.0 is deliberately left out:
            // it is insecure, and newer .NET runtimes reject it when this
            // property is set.
            ServicePointManager.SecurityProtocol =
                SecurityProtocolType.Tls
                | SecurityProtocolType.Tls11
                | SecurityProtocolType.Tls12;
        }

        /// <summary>
        /// Downloads the text of a book.
        /// </summary>
        /// <param name="bookUrl">URL of the plain-text book.</param>
        /// <returns>The response body as a string.</returns>
        static async Task<string> DownloadBookAsync(string bookUrl)
        {
            ConfigureSecurityProtocol();
            return await httpClient.GetStringAsync(bookUrl);
        }

        /// <summary>
        /// Downloads the stop-word list and returns its English ("en") entries.
        /// </summary>
        /// <returns>
        /// The English stop words, or an empty set when the list cannot be
        /// downloaded or parsed. The failure is reported on standard error and
        /// the analysis continues without stop-word filtering.
        /// </returns>
        static async Task<HashSet<string>> DownloadStopWordsAsync()
        {
            string url =
                "https://raw.githubusercontent.com/6/stopwords/master/stopwords-all.json";

            ConfigureSecurityProtocol();

            try
            {
                string content = await httpClient.GetStringAsync(url);
                var words = JsonConvert.DeserializeObject
                    <Dictionary<string, string[]>>(content);

                string[] english;
                if (words != null && words.TryGetValue("en", out english) && english != null)
                {
                    return new HashSet<string>(english);
                }

                Console.Error.WriteLine(
                    "Stop-word list has no 'en' entry; continuing without stop words.");
            }
            catch (Exception ex)
            {
                // Best effort by design: a missing stop-word list must not abort
                // the run, but the failure should not be invisible either.
                Console.Error.WriteLine(
                    $"Could not load stop words ({ex.GetType().Name}: {ex.Message}); continuing without them.");
            }

            return new HashSet<string>();
        }
    }

    /// <summary>
    /// Extension methods used by the word-count pipeline.
    /// </summary>
    static class Extensions
    {
        /// <summary>
        /// Runs a Map/Reduce pipeline on a parallel query: every source element is
        /// mapped to zero or more values, the values are grouped by key, and each
        /// group is reduced to zero or more results.
        /// </summary>
        /// <param name="source">The parallel query to process.</param>
        /// <param name="map">Maps one source element to a sequence of values.</param>
        /// <param name="keySelector">Extracts the grouping key from a mapped value.</param>
        /// <param name="reduce">Turns one group into a sequence of results.</param>
        /// <returns>A parallel query producing the reduced results.</returns>
        public static ParallelQuery<TResult> MapReduce<TSource, TMapped, TKey, TResult>(
            this ParallelQuery<TSource> source,
            Func<TSource, IEnumerable<TMapped>> map,
            Func<TMapped, TKey> keySelector,
            Func<IGrouping<TKey, TMapped>, IEnumerable<TResult>> reduce)
        {
            return source.SelectMany(map)
                .GroupBy(keySelector)
                .SelectMany(reduce);
        }

        /// <summary>
        /// Lazily yields the lines of a <see cref="StringReader"/> until
        /// <c>ReadLine</c> returns null.
        /// </summary>
        /// <param name="reader">The reader to consume.</param>
        /// <returns>The lines of the reader, in order.</returns>
        public static IEnumerable<string> EnumLines(this StringReader reader)
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                yield return line;
            }
        }
    }
}