TPL Dataflow (TDF) sample

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using GearUp.Crawler.Entities;
using HtmlAgilityPack;
using System.Threading.Tasks;
using System.Threading.Tasks.Dataflow;
using System.Text.RegularExpressions;
using System.Collections.Concurrent;
using System.Threading;

namespace GearUp.Crawler
{
    /// <summary>
    /// Crawls pages starting from a target URL with a TPL Dataflow pipeline:
    /// a downloader feeds a broadcaster, which hands every page both to a link
    /// parser (whose links are fed back into the downloader) and to a writer
    /// that parses the page and saves the result through the repository.
    /// </summary>
    public class Crawler
    {
        private readonly ILoreBookItemRepository repository;
        private readonly ILorebookItemParser parser;
        private readonly LinkManager linkManager;

        // Domain of the URL passed to StartCrawl; only links in this domain are followed.
        private string linkDomain;

        // URLs already claimed for download. Static, so it is shared by every
        // Crawler instance in the process.
        private static readonly ConcurrentDictionary<string, bool> urls = new ConcurrentDictionary<string, bool>();

        // Serializes console output: the dataflow blocks call WriteToConsole from
        // several threads, and colour + text + reset must not interleave.
        private static readonly object consoleLock = new object();

        // Download timeout in seconds.
        private const int DownloadTimeout = 10;

        /// <summary>
        /// Creates a crawler that saves items through <paramref name="repository"/>,
        /// parses them with <paramref name="parser"/> and resolves links with
        /// <paramref name="linkManager"/>.
        /// </summary>
        public Crawler(ILoreBookItemRepository repository, ILorebookItemParser parser, LinkManager linkManager)
        {
            this.repository = repository;
            this.parser = parser;
            this.linkManager = linkManager;
        }

        /// <summary>
        /// Builds the dataflow pipeline, posts <paramref name="targetUrl"/> into it,
        /// blocks the calling thread until the user presses Esc, then stops the
        /// pipeline and waits for it to wind down.
        /// </summary>
        /// <param name="targetUrl">First URL to download; its domain limits which links are followed.</param>
        /// <remarks>
        /// The method is <c>async void</c> (kept for caller compatibility), so the
        /// caller cannot observe the final shutdown wait; failures during shutdown
        /// are therefore caught and logged here rather than allowed to escape.
        /// </remarks>
        public async void StartCrawl(string targetUrl)
        {
            using (var cts = new CancellationTokenSource())
            {
                var ct = cts.Token;

                linkDomain = LinkManager.LinkDomain(targetUrl);

                // The downloader takes the token too: without it the block never
                // completes (nobody calls Complete on it) and shutdown would hang.
                var downloaderOptions = new ExecutionDataflowBlockOptions
                {
                    MaxMessagesPerTask = 3,
                    MaxDegreeOfParallelism = 4,
                    BoundedCapacity = 10,
                    CancellationToken = ct
                };

                var downloader = new TransformBlock<string, PageAndUrl>(async (url) => await DownloadUrl(url), downloaderOptions);

                var pipelineOptions = new ExecutionDataflowBlockOptions
                {
                    MaxMessagesPerTask = 2,
                    CancellationToken = ct
                };

                var linkParser = new TransformManyBlock<PageAndUrl, string>(page => ExtactLinksFromPage(page), pipelineOptions);

                // The writer deliberately has no token: on stop it is completed
                // instead, so pages it already received are still saved.
                var writer = new ActionBlock<PageAndUrl>(async page => await SaveEntry(page), new ExecutionDataflowBlockOptions { MaxDegreeOfParallelism = 4 });

                var contentBroadcaster = new BroadcastBlock<PageAndUrl>(p => p, new ExecutionDataflowBlockOptions() { CancellationToken = ct });

                // Flow setup (note the cycle: discovered links go back to the downloader)
                downloader.LinkTo(contentBroadcaster);
                contentBroadcaster.LinkTo(linkParser);
                contentBroadcaster.LinkTo(writer);
                linkParser.LinkTo(downloader);

                // Kick off the TPL dataflow here
                downloader.Post(targetUrl);
                WriteToConsole("Crawling...", ConsoleColor.Green);
                PromptUser("Press <Esc> to Stop:", ConsoleColor.White, ConsoleKey.Escape);
                cts.Cancel();
                WriteToConsole("Stopping...", ConsoleColor.Green);

                // No more pages will arrive from the cancelled broadcaster; let the
                // writer drain what it has and then finish.
                writer.Complete();

                try
                {
                    await Task.WhenAll(downloader.Completion, contentBroadcaster.Completion, linkParser.Completion, writer.Completion);
                }
                catch (OperationCanceledException)
                {
                    // Expected: the cancelled blocks end in the Canceled state.
                }
                catch (Exception ex)
                {
                    // A block faulted. An exception escaping an async void method
                    // would take the process down, so report it instead.
                    WriteToConsole("Unexpected error: {0}", ConsoleColor.Red, ex.Message);
                }
            }
        }

        /// <summary>
        /// Collects the links found in the article body of <paramref name="page"/>
        /// that belong to the domain being crawled.
        /// </summary>
        /// <param name="page">Downloaded page; may be null (a skipped or failed download).</param>
        /// <returns>Fully qualified same-domain links; empty when <paramref name="page"/> is null.</returns>
        public IEnumerable<string> ExtactLinksFromPage(PageAndUrl page)
        {
            if (page == null) return Enumerable.Empty<string>();

            var discoveredLinks = new List<string>();
            var document = new LorebookDocument(page.Html);
            foreach (var link in document.LinksInArticleBodyDiv())
            {
                var fullUrl = linkManager.FullyQualifyLink(page.Url, link);
                if (linkDomain.Equals(LinkManager.LinkDomain(fullUrl)))
                    discoveredLinks.Add(fullUrl);
            }
            WriteToConsole("   {0} --> {1} links", ConsoleColor.Gray, page.Url, discoveredLinks.Count);
            return discoveredLinks;
        }

        /// <summary>
        /// Takes the official lorebook entry from <paramref name="document"/> and
        /// hands it to the parser together with <paramref name="url"/>.
        /// </summary>
        /// <returns>Whatever the parser returns for the entry (SaveEntry treats null as "nothing to save").</returns>
        public LorebookItem ExtractLoreBookItem(LorebookDocument document, string url)
        {
            WriteToConsole("Parsing: {0}", ConsoleColor.Cyan, url);
            var itemDetails = document.OfficialLorebookEntry();
            return parser.ParseHtmlNode(itemDetails, url);
        }

        /// <summary>
        /// Downloads <paramref name="url"/> as a string unless that URL has already
        /// been claimed, giving up after <see cref="DownloadTimeout"/> seconds.
        /// </summary>
        /// <param name="url">Address to download.</param>
        /// <returns>
        /// The page and its URL, or null when the URL was already claimed, the
        /// download timed out, or an error occurred (errors are written to the console).
        /// </returns>
        public async Task<PageAndUrl> DownloadUrl(string url)
        {
            try
            {
                // TryAdd is a single atomic claim. A separate ContainsKey check
                // followed by TryAdd lets two parallel workers both pass the check
                // and download the same URL.
                if (!urls.TryAdd(url, true)) return null;

                using (var client = new WebClient())
                {
                    WriteToConsole("Fetching: {0}", ConsoleColor.DarkGreen, url);
                    var download = client.DownloadStringTaskAsync(url);
                    var timeout = Task.Delay(DownloadTimeout * 1000);
                    var finished = await Task.WhenAny(download, timeout);
                    if (finished == timeout)
                    {
                        client.CancelAsync();
                        WriteToConsole("Cancel: [{0}]", ConsoleColor.Gray, url);
                        return null;
                    }

                    // The task has already finished; awaiting it (rather than reading
                    // .Result) rethrows a failure as the original exception instead
                    // of an AggregateException, so the WebException handler sees it.
                    string result = await download;

                    WriteToConsole("Downloaded: {0}", ConsoleColor.White, url);

                    return new PageAndUrl() { Url = url, Html = result };
                }
            }
            catch (WebException ex)
            {
                WriteToConsole("Error: [{0}]\r\n\t{1}", ConsoleColor.Red, url, ex.Message);
            }
            catch (Exception ex)
            {
                WriteToConsole("Unexpected error: {0}", ConsoleColor.Red, ex.Message);
            }

            return null;
        }

        /// <summary>
        /// Parses <paramref name="page"/> into a lorebook item and saves it through
        /// the repository under the page URL. Does nothing for a null page or when
        /// no item could be extracted.
        /// </summary>
        public async Task SaveEntry(PageAndUrl page)
        {
            if (page == null) return;
            var document = new LorebookDocument(page.Html);
            var item = ExtractLoreBookItem(document, page.Url);
            if (item != null) await repository.Save(page.Url, item);
        }

        /// <summary>
        /// Writes one formatted line to the console in the given colour and then
        /// resets the console colours.
        /// </summary>
        private static void WriteToConsole(string format, ConsoleColor color, params object[] texts)
        {
            // Set colour, write and reset as one unit so parallel callers cannot
            // print their text in each other's colour.
            lock (consoleLock)
            {
                Console.ForegroundColor = color;
                Console.WriteLine(format, texts);
                Console.ResetColor();
            }
        }

        /// <summary>
        /// Prints <paramref name="message"/> and blocks until the user responds:
        /// with a full line when <paramref name="key"/> is null, otherwise until
        /// that specific key is pressed (keystrokes are not echoed).
        /// </summary>
        private void PromptUser(string message, ConsoleColor color, ConsoleKey? key = null)
        {
            WriteToConsole(message, color);
            if (key == null)
            {
                Console.ReadLine();
                return;
            }

            ConsoleKeyInfo entry;
            do
            {
                entry = Console.ReadKey(true);
            } while (key != entry.Key);
        }

    }
}

 

posted on 2015-04-11 20:25  武胜-阿伟  阅读(354)  评论(0)  编辑  收藏  举报