/// <summary>
/// Database helper: buffers crawled blog records in a <see cref="DataTable"/> and
/// bulk-inserts them into the SQL Server table "BlogArticle".
/// </summary>
public class MssqlHelper
{
    #region Fields

    /// <summary>
    /// Connection string handed to <c>SqlBulkCopy</c>.
    /// NOTE(review): the credentials are hard-coded in source; consider loading them from configuration.
    /// </summary>
    private static string conn = "Data Source=.;Initial Catalog=Cnblogs;User ID=sa;Password=123";

    #endregion

    #region Append a record to the DataTable

    /// <summary>
    /// Creates a new row from the given values and appends it to <paramref name="dt"/>.
    /// </summary>
    /// <param name="title">Value stored in the "BlogTitle" column.</param>
    /// <param name="url">Value stored in the "BlogUrl" column.</param>
    /// <param name="author">Value stored in the "BlogAuthor" column.</param>
    /// <param name="time">Value stored in the "BlogTime" column.</param>
    /// <param name="motto">Value stored in the "BlogMotto" column.</param>
    /// <param name="depth">Value stored in the "BlogDepth" column.</param>
    /// <param name="dt">Target table; it must already contain the six columns named above.</param>
    public static void GetData(string title, string url, string author, string time, string motto, string depth, DataTable dt)
    {
        DataRow dr = dt.NewRow();
        dr["BlogTitle"] = title;
        dr["BlogUrl"] = url;
        dr["BlogAuthor"] = author;
        dr["BlogTime"] = time;
        dr["BlogMotto"] = motto;
        dr["BlogDepth"] = depth;

        // Append the populated row to the table.
        dt.Rows.Add(dr);
    }

    #endregion

    #region Bulk insert into the database

    /// <summary>
    /// Bulk-inserts every row of <paramref name="dt"/> into the "BlogArticle" table and then empties
    /// the table. The insert is best-effort: a failure is written to the trace listeners (instead of
    /// being swallowed silently) and the buffered rows are still discarded, as before.
    /// </summary>
    /// <param name="dt">Buffered rows; cleared whether or not the insert succeeds.</param>
    public static void InsertDb(DataTable dt)
    {
        try
        {
            using (System.Data.SqlClient.SqlBulkCopy copy = new System.Data.SqlClient.SqlBulkCopy(conn))
            {
                // Destination table.
                copy.DestinationTableName = "BlogArticle";

                // Map each in-memory column to the destination column of the same name.
                copy.ColumnMappings.Add("BlogTitle", "BlogTitle");
                copy.ColumnMappings.Add("BlogUrl", "BlogUrl");
                copy.ColumnMappings.Add("BlogAuthor", "BlogAuthor");
                copy.ColumnMappings.Add("BlogTime", "BlogTime");
                copy.ColumnMappings.Add("BlogMotto", "BlogMotto");
                copy.ColumnMappings.Add("BlogDepth", "BlogDepth");

                // Write all buffered rows in one batch.
                copy.WriteToServer(dt);
            }
        }
        catch (Exception ex)
        {
            // Keep the crawler running, but leave evidence that data was lost.
            System.Diagnostics.Trace.TraceError(
                "MssqlHelper.InsertDb failed; {0} buffered row(s) discarded: {1}",
                dt.Rows.Count,
                ex);
        }
        finally
        {
            // The buffer is emptied on success and on failure alike.
            dt.Rows.Clear();
        }
    }

    #endregion
}
/// <summary>
/// Log helper: appends text lines to a log file.
/// </summary>
public class LogHelper
{
    #region Write log

    /// <summary>
    /// Appends <paramref name="text"/> as one line to the file F:\log.txt.
    /// </summary>
    /// <param name="text">The text to append.</param>
    public static void WriteLog(string text)
    {
        // An earlier variant (previously kept as commented-out code) wrote to
        // AppDomain.CurrentDomain.BaseDirectory + "\\log.txt" instead of drive F:.
        // The using block releases the file handle even when WriteLine throws.
        using (StreamWriter sw = new StreamWriter("F:" + "\\log.txt", true))
        {
            sw.WriteLine(text);
        }
    }

    #endregion
}

namespace Feng.SimpleCrawler
{
    using System;

    /// <summary>
    /// Callback signature for the crawler's "about to add a URL" notification.
    /// </summary>
    /// <param name="args">Details of the candidate URL.</param>
    /// <returns>
    /// A <see cref="bool"/> decided by the handler; the crawler in this file queues the URL
    /// only when the handler returns <c>true</c>.
    /// </returns>
    public delegate bool AddUrlEventHandler(AddUrlEventArgs args);

    /// <summary>
    /// Data describing a URL that is a candidate for the crawl queue.
    /// </summary>
    public class AddUrlEventArgs : EventArgs
    {
        #region Public Properties

        /// <summary>
        /// Gets or sets the URL itself.
        /// </summary>
        public string Url { get; set; }

        /// <summary>
        /// Gets or sets the title associated with the URL.
        /// </summary>
        public string Title { get; set; }

        /// <summary>
        /// Gets or sets the crawl depth assigned to the URL.
        /// </summary>
        public int Depth { get; set; }

        #endregion
    }
}

namespace Feng.SimpleCrawler
{
    using System;
    using System.Collections;

    /// <summary>
    /// A Bloom filter: a fixed-size bit array plus a family of hash functions derived from a primary
    /// hash (<see cref="object.GetHashCode"/>) and a secondary hash. <see cref="Contains"/> may report
    /// false positives but never false negatives for items that were added.
    /// </summary>
    /// <typeparam name="T">
    /// The item type. A built-in secondary hash exists for <see cref="string"/> and <see cref="int"/>;
    /// any other type requires a caller-supplied <see cref="HashFunction"/>.
    /// </typeparam>
    public class BloomFilter<T>
    {
        #region Fields

        /// <summary>
        /// The secondary hash function.
        /// </summary>
        private readonly HashFunction getHashSecondary;

        /// <summary>
        /// The bit array backing the filter.
        /// </summary>
        private readonly BitArray hashBits;

        /// <summary>
        /// The number of derived hash functions (k) applied per item.
        /// </summary>
        private readonly int hashFunctionCount;

        #endregion

        #region Constructors and Destructors

        /// <summary>
        /// Initializes a new instance of the <see cref="BloomFilter{T}"/> class using the default
        /// error rate for the capacity and the built-in secondary hash.
        /// </summary>
        /// <param name="capacity">The expected number of items.</param>
        public BloomFilter(int capacity)
            : this(capacity, null)
        {
        }

        /// <summary>
        /// Initializes a new instance of the <see cref="BloomFilter{T}"/> class.
        /// </summary>
        /// <remarks>
        /// Kept for backward compatibility. The error rate must lie strictly between 0 and 1, which
        /// no integer does, so this overload always fails validation; use the <see cref="float"/>
        /// overload instead.
        /// </remarks>
        /// <param name="capacity">The expected number of items.</param>
        /// <param name="errorRate">The error rate.</param>
        public BloomFilter(int capacity, int errorRate)
            : this(capacity, errorRate, null)
        {
        }

        /// <summary>
        /// Initializes a new instance of the <see cref="BloomFilter{T}"/> class with an explicit
        /// error rate and the built-in secondary hash.
        /// </summary>
        /// <param name="capacity">The expected number of items.</param>
        /// <param name="errorRate">The accepted false-positive rate, strictly between 0 and 1.</param>
        public BloomFilter(int capacity, float errorRate)
            : this(capacity, errorRate, null)
        {
        }

        /// <summary>
        /// Initializes a new instance of the <see cref="BloomFilter{T}"/> class using the default
        /// error rate for the capacity.
        /// </summary>
        /// <param name="capacity">The expected number of items.</param>
        /// <param name="hashFunction">The secondary hash function, or null to use the built-in one.</param>
        public BloomFilter(int capacity, HashFunction hashFunction)
            : this(capacity, BestErrorRate(capacity), hashFunction)
        {
        }

        /// <summary>
        /// Initializes a new instance of the <see cref="BloomFilter{T}"/> class, deriving the bit-array
        /// size and hash count from the capacity and error rate.
        /// </summary>
        /// <param name="capacity">The expected number of items.</param>
        /// <param name="errorRate">The accepted false-positive rate, strictly between 0 and 1.</param>
        /// <param name="hashFunction">The secondary hash function, or null to use the built-in one.</param>
        public BloomFilter(int capacity, float errorRate, HashFunction hashFunction)
            : this(capacity, errorRate, hashFunction, BestM(capacity, errorRate), BestK(capacity, errorRate))
        {
        }

        /// <summary>
        /// Initializes a new instance of the <see cref="BloomFilter{T}"/> class with explicit sizing.
        /// </summary>
        /// <param name="capacity">The expected number of items; must be at least 1.</param>
        /// <param name="errorRate">The accepted false-positive rate, strictly between 0 and 1.</param>
        /// <param name="hashFunction">The secondary hash function, or null to use the built-in one.</param>
        /// <param name="m">The number of bits in the filter; must be at least 1.</param>
        /// <param name="k">The number of hash functions applied per item.</param>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="capacity"/>, <paramref name="errorRate"/> or <paramref name="m"/> is out of range.
        /// </exception>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="hashFunction"/> is null and <typeparamref name="T"/> is neither string nor int.
        /// </exception>
        public BloomFilter(int capacity, float errorRate, HashFunction hashFunction, int m, int k)
        {
            if (capacity < 1)
            {
                throw new ArgumentOutOfRangeException("capacity", capacity, "capacity must be > 0");
            }

            if (errorRate >= 1 || errorRate <= 0)
            {
                throw new ArgumentOutOfRangeException(
                    "errorRate",
                    errorRate,
                    string.Format("errorRate must be between 0 and 1, exclusive. Was {0}", errorRate));
            }

            if (m < 1)
            {
                // The single-string constructor would treat this text as the parameter name,
                // so name the parameter explicitly and pass the text as the message.
                throw new ArgumentOutOfRangeException(
                    "m",
                    m,
                    string.Format(
                        "The provided capacity and errorRate values would result in an array of length > int.MaxValue. Please reduce either of these values. Capacity: {0}, Error rate: {1}",
                        capacity,
                        errorRate));
            }

            if (hashFunction == null)
            {
                if (typeof(T) == typeof(string))
                {
                    this.getHashSecondary = HashString;
                }
                else if (typeof(T) == typeof(int))
                {
                    this.getHashSecondary = HashInt32;
                }
                else
                {
                    throw new ArgumentNullException(
                        "hashFunction",
                        "Please provide a hash function for your type T, when T is not a string or int.");
                }
            }
            else
            {
                this.getHashSecondary = hashFunction;
            }

            this.hashFunctionCount = k;
            this.hashBits = new BitArray(m);
        }

        #endregion

        #region Delegates

        /// <summary>
        /// A function that maps an item to a 32-bit hash value.
        /// </summary>
        /// <param name="input">The item to hash.</param>
        /// <returns>The hash as an <see cref="int"/>.</returns>
        public delegate int HashFunction(T input);

        #endregion

        #region Public Properties

        /// <summary>
        /// Gets the ratio of set bits to total bits in the filter.
        /// </summary>
        public double Truthiness
        {
            get
            {
                return (double)this.TrueBits() / this.hashBits.Count;
            }
        }

        #endregion

        #region Public Methods and Operators

        /// <summary>
        /// Adds an item to the filter by setting one bit per derived hash function.
        /// </summary>
        /// <param name="item">The item to add.</param>
        public void Add(T item)
        {
            int primaryHash = item.GetHashCode();
            int secondaryHash = this.getHashSecondary(item);
            for (int i = 0; i < this.hashFunctionCount; i++)
            {
                int hash = this.ComputeHash(primaryHash, secondaryHash, i);
                this.hashBits[hash] = true;
            }
        }

        /// <summary>
        /// Checks whether every bit derived from the item is set.
        /// </summary>
        /// <param name="item">The item to look up.</param>
        /// <returns>
        /// <c>false</c> when at least one derived bit is clear; otherwise <c>true</c>.
        /// </returns>
        public bool Contains(T item)
        {
            int primaryHash = item.GetHashCode();
            int secondaryHash = this.getHashSecondary(item);
            for (int i = 0; i < this.hashFunctionCount; i++)
            {
                int hash = this.ComputeHash(primaryHash, secondaryHash, i);
                if (this.hashBits[hash] == false)
                {
                    return false;
                }
            }

            return true;
        }

        #endregion

        #region Methods

        /// <summary>
        /// Picks a default error rate for a capacity: 1/capacity, or a fallback formula when that
        /// value rounds to zero as a float.
        /// </summary>
        /// <param name="capacity">The expected number of items.</param>
        /// <returns>The error rate as a <see cref="float"/>.</returns>
        private static float BestErrorRate(int capacity)
        {
            var c = (float)(1.0 / capacity);
            if (Math.Abs(c) > 0)
            {
                return c;
            }

            double y = int.MaxValue / (double)capacity;
            return (float)Math.Pow(0.6185, y);
        }

        /// <summary>
        /// Computes the number of hash functions: round(ln 2 * m / capacity), but never less than 1.
        /// </summary>
        /// <param name="capacity">The expected number of items.</param>
        /// <param name="errorRate">The accepted false-positive rate.</param>
        /// <returns>The hash function count as an <see cref="int"/>.</returns>
        private static int BestK(int capacity, float errorRate)
        {
            int k = (int)Math.Round(Math.Log(2.0) * BestM(capacity, errorRate) / capacity);

            // With zero hash functions Add sets nothing and Contains is always true.
            return Math.Max(1, k);
        }

        /// <summary>
        /// Computes the bit-array size: ceil(capacity * log_b(errorRate)) with b = 1 / 2^(ln 2).
        /// </summary>
        /// <param name="capacity">The expected number of items.</param>
        /// <param name="errorRate">The accepted false-positive rate.</param>
        /// <returns>The number of bits as an <see cref="int"/>.</returns>
        private static int BestM(int capacity, float errorRate)
        {
            return (int)Math.Ceiling(capacity * Math.Log(errorRate, 1.0 / Math.Pow(2, Math.Log(2.0))));
        }

        /// <summary>
        /// Built-in secondary hash for <see cref="int"/> items (an integer bit-mixing function).
        /// </summary>
        /// <param name="input">The item; this method is only wired up when T is int.</param>
        /// <returns>The hash as an <see cref="int"/>.</returns>
        private static int HashInt32(T input)
        {
            unchecked
            {
                // "input as uint?" is null for a boxed int, which made the final cast throw on
                // every call. Unbox as int first, then reinterpret the bits as uint.
                uint x = (uint)(int)(object)input;
                x = ~x + (x << 15);
                x = x ^ (x >> 12);
                x = x + (x << 2);
                x = x ^ (x >> 4);
                x = x * 2057;
                x = x ^ (x >> 16);
                return (int)x;
            }
        }

        /// <summary>
        /// Built-in secondary hash for <see cref="string"/> items; returns 0 for a non-string or null input.
        /// </summary>
        /// <param name="input">The item to hash.</param>
        /// <returns>The hash as an <see cref="int"/>.</returns>
        private static int HashString(T input)
        {
            var str = input as string;
            int hash = 0;
            if (str != null)
            {
                for (int i = 0; i < str.Length; i++)
                {
                    hash += str[i];
                    hash += hash << 10;
                    hash ^= hash >> 6;
                }

                hash += hash << 3;
                hash ^= hash >> 11;
                hash += hash << 15;
            }

            return hash;
        }

        /// <summary>
        /// Derives the i-th bit index as |(primary + i * secondary) mod bitCount|.
        /// </summary>
        /// <param name="primaryHash">The primary hash.</param>
        /// <param name="secondaryHash">The secondary hash.</param>
        /// <param name="i">The index of the derived hash function.</param>
        /// <returns>A bit index in [0, bitCount).</returns>
        private int ComputeHash(int primaryHash, int secondaryHash, int i)
        {
            // The remainder's magnitude is below the bit count, so Math.Abs cannot overflow.
            int resultingHash = (primaryHash + (i * secondaryHash)) % this.hashBits.Count;
            return Math.Abs(resultingHash);
        }

        /// <summary>
        /// Counts the bits that are currently set.
        /// </summary>
        /// <returns>The number of set bits.</returns>
        private int TrueBits()
        {
            int output = 0;
            foreach (bool bit in this.hashBits)
            {
                if (bit)
                {
                    output++;
                }
            }

            return output;
        }

        #endregion
    }
}

namespace Feng.SimpleCrawler
{
    using System;

    /// <summary>
    /// Callback signature for the crawler's error notification.
    /// </summary>
    /// <param name="args">The failing URL together with the exception that occurred.</param>
    public delegate void CrawlErrorEventHandler(CrawlErrorEventArgs args);

    /// <summary>
    /// Data describing a failure that happened while processing a URL.
    /// </summary>
    public class CrawlErrorEventArgs : EventArgs
    {
        #region Public Properties

        /// <summary>
        /// Gets or sets the URL that was being processed.
        /// </summary>
        public string Url { get; set; }

        /// <summary>
        /// Gets or sets the exception that was caught.
        /// </summary>
        public Exception Exception { get; set; }

        #endregion
    }
}

namespace Feng.SimpleCrawler
{
    using System;
    using System.Collections.Generic;
    using System.IO;
    using System.IO.Compression;
    using System.Linq;
    using System.Net;
    using System.Text;
    using System.Text.RegularExpressions;
    using System.Threading;

    /// <summary>
    /// Coordinates the crawl: seeds the shared <see cref="UrlQueue"/>, starts the worker threads,
    /// downloads each queued page, extracts its links and raises events for new URLs, received
    /// pages and errors.
    /// </summary>
    public class CrawlMaster
    {
        #region Constants

        /// <summary>
        /// Pattern a seed address must match to be queued.
        /// </summary>
        private const string WebUrlRegularExpressions = @"^(http|https)://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";

        #endregion

        #region Fields

        /// <summary>
        /// Cookie container shared by all requests.
        /// </summary>
        private readonly CookieContainer cookieContainer;

        /// <summary>
        /// Random source for the automatic speed limit; guarded by a lock because
        /// <see cref="Random"/> instances are not thread-safe.
        /// </summary>
        private readonly Random random;

        /// <summary>
        /// Per-thread idle flags: <c>true</c> means the worker found the queue empty.
        /// </summary>
        private readonly bool[] threadStatus;

        /// <summary>
        /// The worker threads.
        /// </summary>
        private readonly Thread[] threads;

        #endregion

        #region Constructors and Destructors

        /// <summary>
        /// Initializes a new instance of the <see cref="CrawlMaster"/> class.
        /// </summary>
        /// <param name="settings">The crawl settings; <c>ThreadCount</c> sizes the worker arrays.</param>
        public CrawlMaster(CrawlSettings settings)
        {
            this.cookieContainer = new CookieContainer();
            this.random = new Random();
            this.Settings = settings;
            this.threads = new Thread[settings.ThreadCount];
            this.threadStatus = new bool[settings.ThreadCount];
        }

        #endregion

        #region Public Events

        /// <summary>
        /// Raised for each candidate URL; the URL is queued only when the handler returns <c>true</c>.
        /// </summary>
        public event AddUrlEventHandler AddUrlEvent;

        /// <summary>
        /// Raised when processing a URL throws.
        /// </summary>
        public event CrawlErrorEventHandler CrawlErrorEvent;

        /// <summary>
        /// Raised after a page has been downloaded and its links parsed.
        /// </summary>
        public event DataReceivedEventHandler DataReceivedEvent;

        #endregion

        #region Public Properties

        /// <summary>
        /// Gets the settings.
        /// </summary>
        public CrawlSettings Settings { get; private set; }

        #endregion

        #region Public Methods and Operators

        /// <summary>
        /// Queues the seed addresses, creates the worker threads and starts them.
        /// </summary>
        public void Crawl()
        {
            this.Initialize();
            for (int i = 0; i < this.threads.Length; i++)
            {
                // Mark the worker busy before it starts so the write cannot overwrite
                // an idle flag the worker has already set itself.
                this.threadStatus[i] = false;
                this.threads[i].Start(i);
            }
        }

        /// <summary>
        /// Aborts every worker thread that has been created.
        /// </summary>
        public void Stop()
        {
            foreach (Thread thread in this.threads)
            {
                // Slots are still null when Stop is called before Crawl.
                if (thread != null)
                {
                    thread.Abort();
                }
            }
        }

        #endregion

        #region Methods

        /// <summary>
        /// Returns the part of a host name after its first label, or the host itself when it has
        /// only one label.
        /// </summary>
        /// <param name="host">The host name.</param>
        /// <returns>The host without its first label.</returns>
        private static string GetParentDomain(string host)
        {
            int firstDot = host.IndexOf('.');
            return firstDot < 0 ? host : host.Substring(firstDot + 1);
        }

        /// <summary>
        /// Looks up an encoding by name without throwing.
        /// </summary>
        /// <param name="name">The encoding name; may be null or empty.</param>
        /// <returns>The encoding, or null when the name is empty or not recognised.</returns>
        private static Encoding TryGetEncoding(string name)
        {
            if (string.IsNullOrEmpty(name))
            {
                return null;
            }

            try
            {
                return Encoding.GetEncoding(name);
            }
            catch (ArgumentException)
            {
                return null;
            }
        }

        /// <summary>
        /// Applies user agent, cookies, redirect policy, headers and timeout to a request.
        /// </summary>
        /// <param name="request">The request to configure.</param>
        private void ConfigRequest(HttpWebRequest request)
        {
            request.UserAgent = this.Settings.UserAgent;
            request.CookieContainer = this.cookieContainer;
            request.AllowAutoRedirect = true;
            request.MediaType = "text/html";
            request.Headers["Accept-Language"] = "zh-CN,zh;q=0.8";

            if (this.Settings.Timeout > 0)
            {
                request.Timeout = this.Settings.Timeout;
            }
        }

        /// <summary>
        /// Worker loop: takes URLs from the queue, downloads them and dispatches the results until
        /// the queue is empty and every worker is idle.
        /// </summary>
        /// <param name="threadIndex">The boxed index of this worker in the thread arrays.</param>
        private void CrawlProcess(object threadIndex)
        {
            var currentThreadIndex = (int)threadIndex;

            while (true)
            {
                // Empty queue: mark this worker idle; exit when all workers are idle, otherwise wait.
                if (UrlQueue.Instance.Count == 0)
                {
                    this.threadStatus[currentThreadIndex] = true;
                    if (!this.threadStatus.Any(t => t == false))
                    {
                        break;
                    }

                    Thread.Sleep(2000);
                    continue;
                }

                this.threadStatus[currentThreadIndex] = false;

                // DeQueue returns null when another worker drained the queue in the meantime.
                UrlInfo urlInfo = UrlQueue.Instance.DeQueue();
                if (urlInfo == null)
                {
                    continue;
                }

                HttpWebRequest request = null;
                HttpWebResponse response = null;

                try
                {
                    // Automatic speed limit: random pause of 1 to 5 seconds.
                    if (this.Settings.AutoSpeedLimit)
                    {
                        int span;
                        lock (this.random)
                        {
                            span = this.random.Next(1000, 5000);
                        }

                        Thread.Sleep(span);
                    }

                    // Create the request and verify it is an HTTP request before configuring it.
                    request = WebRequest.Create(urlInfo.UrlString) as HttpWebRequest;
                    if (request == null)
                    {
                        throw new NotSupportedException(
                            string.Format("Only HTTP(S) URLs can be crawled: {0}", urlInfo.UrlString));
                    }

                    this.ConfigRequest(request);
                    response = request.GetResponse() as HttpWebResponse;
                    if (response == null)
                    {
                        continue;
                    }

                    this.PersistenceCookie(response);

                    // Decompress the body when the server gzip-encoded it.
                    Stream stream = response.GetResponseStream();
                    if (stream != null
                        && string.Equals(response.ContentEncoding, "gzip", StringComparison.OrdinalIgnoreCase))
                    {
                        stream = new GZipStream(stream, CompressionMode.Decompress);
                    }

                    using (stream)
                    {
                        string html = this.ParseContent(stream, response.CharacterSet);
                        this.ParseLinks(urlInfo, html);

                        // Copy the delegate so an unsubscribe between check and call cannot null it.
                        DataReceivedEventHandler dataReceived = this.DataReceivedEvent;
                        if (dataReceived != null)
                        {
                            dataReceived(
                                new DataReceivedEventArgs
                                    {
                                        Url = urlInfo.UrlString,
                                        Depth = urlInfo.Depth,
                                        Html = html
                                    });
                        }
                    }
                }
                catch (Exception exception)
                {
                    CrawlErrorEventHandler crawlError = this.CrawlErrorEvent;
                    if (crawlError != null)
                    {
                        crawlError(new CrawlErrorEventArgs { Url = urlInfo.UrlString, Exception = exception });
                    }
                }
                finally
                {
                    if (request != null)
                    {
                        request.Abort();
                    }

                    if (response != null)
                    {
                        response.Close();
                    }
                }
            }
        }

        /// <summary>
        /// Queues every seed address that matches the seed pattern (at depth 1), creates the worker
        /// threads and raises the default connection limit.
        /// </summary>
        private void Initialize()
        {
            if (this.Settings.SeedsAddress != null && this.Settings.SeedsAddress.Count > 0)
            {
                foreach (string seed in this.Settings.SeedsAddress)
                {
                    // Regex.IsMatch rejects a null input, so skip empty seeds up front.
                    if (!string.IsNullOrEmpty(seed)
                        && Regex.IsMatch(seed, WebUrlRegularExpressions, RegexOptions.IgnoreCase))
                    {
                        UrlQueue.Instance.EnQueue(new UrlInfo(seed) { Depth = 1 });
                    }
                }
            }

            for (int i = 0; i < this.Settings.ThreadCount; i++)
            {
                var threadStart = new ParameterizedThreadStart(this.CrawlProcess);
                this.threads[i] = new Thread(threadStart);
            }

            ServicePointManager.DefaultConnectionLimit = 256;
        }

        /// <summary>
        /// Tells whether a link ends with one of the configured escape suffixes.
        /// </summary>
        /// <param name="href">The raw href value.</param>
        /// <returns><c>true</c> when the link must be skipped.</returns>
        private bool IsEscapedLink(string href)
        {
            List<string> suffixes = this.Settings.EscapeLinks;
            if (suffixes == null || suffixes.Count == 0)
            {
                return false;
            }

            // An empty suffix would match every href, and a null one would throw; ignore both.
            return suffixes.Any(
                suffix => !string.IsNullOrEmpty(suffix)
                          && href.EndsWith(suffix, StringComparison.OrdinalIgnoreCase));
        }

        /// <summary>
        /// Tells whether a URL passes the regular-expression filters.
        /// </summary>
        /// <param name="url">The absolute URL.</param>
        /// <returns>
        /// <c>true</c> when no filter is configured or at least one filter matches; otherwise <c>false</c>.
        /// </returns>
        private bool IsMatchRegular(string url)
        {
            List<string> patterns = this.Settings.RegularFilterExpressions;
            if (patterns == null || patterns.Count == 0)
            {
                return true;
            }

            return patterns.Any(pattern => Regex.IsMatch(url, pattern, RegexOptions.IgnoreCase));
        }

        /// <summary>
        /// Reads the whole response body and decodes it, preferring a charset declared in a meta tag
        /// over the one from the response headers.
        /// </summary>
        /// <param name="stream">The (already decompressed) response stream; may be null.</param>
        /// <param name="characterSet">The character set from the response headers.</param>
        /// <returns>The decoded page, or an empty string when there is no stream.</returns>
        private string ParseContent(Stream stream, string characterSet)
        {
            if (stream == null)
            {
                return string.Empty;
            }

            byte[] buffer;
            using (var memoryStream = new MemoryStream())
            {
                stream.CopyTo(memoryStream);
                buffer = memoryStream.ToArray();
            }

            // First pass as ASCII, only to locate a charset declaration in the markup.
            string html = Encoding.ASCII.GetString(buffer);
            string localCharacterSet = characterSet;

            Match match = Regex.Match(html, "<meta([^<]*)charset=([^<]*)\"", RegexOptions.IgnoreCase);
            if (match.Success)
            {
                // Keep the characters up to the first space, dropping quotes.
                var stringBuilder = new StringBuilder();
                foreach (char item in match.Groups[2].Value)
                {
                    if (item == ' ')
                    {
                        break;
                    }

                    if (item != '\"')
                    {
                        stringBuilder.Append(item);
                    }
                }

                localCharacterSet = stringBuilder.ToString();
            }

            // An unknown charset name falls back to the header charset and then to ASCII
            // instead of failing the whole page.
            Encoding encode = TryGetEncoding(localCharacterSet)
                              ?? TryGetEncoding(characterSet)
                              ?? Encoding.ASCII;

            return encode.GetString(buffer);
        }

        /// <summary>
        /// Extracts the anchors of a page, filters them according to the settings and queues the
        /// accepted URLs one level deeper than the page they were found on.
        /// </summary>
        /// <param name="urlInfo">The page the HTML belongs to.</param>
        /// <param name="html">The decoded page.</param>
        private void ParseLinks(UrlInfo urlInfo, string html)
        {
            if (this.Settings.Depth > 0 && urlInfo.Depth >= this.Settings.Depth)
            {
                return;
            }

            // href -> anchor text (tags stripped); a later duplicate href overwrites an earlier one.
            var urlDictionary = new Dictionary<string, string>();
            Match match = Regex.Match(html, "(?i)<a .*?href=\"([^\"]+)\"[^>]*>(.*?)</a>");
            while (match.Success)
            {
                string urlKey = match.Groups[1].Value;
                string urlValue = Regex.Replace(match.Groups[2].Value, "(?i)<.*?>", string.Empty);

                urlDictionary[urlKey] = urlValue;
                match = match.NextMatch();
            }

            var baseUri = new Uri(urlInfo.UrlString);

            foreach (var item in urlDictionary)
            {
                string href = item.Key;
                string text = item.Value;

                if (string.IsNullOrEmpty(href) || this.IsEscapedLink(href))
                {
                    continue;
                }

                if (this.Settings.HrefKeywords != null && this.Settings.HrefKeywords.Count > 0)
                {
                    if (!this.Settings.HrefKeywords.Any(href.Contains))
                    {
                        continue;
                    }
                }

                // Undo common escapes; "&amp;" is the HTML-encoded ampersand inside attribute values
                // (the previous "&" -> "&" replacement was a no-op).
                string url =
                    href.Replace("%3f", "?")
                        .Replace("%3d", "=")
                        .Replace("%2f", "/")
                        .Replace("&amp;", "&");

                if (string.IsNullOrEmpty(url) || url.StartsWith("#", StringComparison.Ordinal)
                    || url.StartsWith("mailto:", StringComparison.OrdinalIgnoreCase)
                    || url.StartsWith("javascript:", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                // Resolves absolute and relative links alike; a malformed href is skipped instead of
                // throwing and discarding the remaining links of the page.
                Uri currentUri;
                if (!Uri.TryCreate(baseUri, url, out currentUri))
                {
                    continue;
                }

                url = currentUri.AbsoluteUri;

                if (this.Settings.LockHost)
                {
                    // Hosts that are equal after dropping the first label are treated as one site.
                    if (GetParentDomain(baseUri.Host) != GetParentDomain(currentUri.Host))
                    {
                        continue;
                    }
                }

                if (!this.IsMatchRegular(url))
                {
                    continue;
                }

                var addUrlEventArgs = new AddUrlEventArgs { Title = text, Depth = urlInfo.Depth + 1, Url = url };
                AddUrlEventHandler addUrl = this.AddUrlEvent;
                if (addUrl != null && !addUrl(addUrlEventArgs))
                {
                    continue;
                }

                UrlQueue.Instance.EnQueue(new UrlInfo(url) { Depth = urlInfo.Depth + 1 });
            }
        }

        /// <summary>
        /// Stores the cookies of a response in the shared container when cookie keeping is enabled.
        /// </summary>
        /// <param name="response">The response whose "Set-Cookie" header is read.</param>
        private void PersistenceCookie(HttpWebResponse response)
        {
            if (!this.Settings.KeepCookie)
            {
                return;
            }

            string cookies = response.Headers["Set-Cookie"];
            if (!string.IsNullOrEmpty(cookies))
            {
                var cookieUri =
                    new Uri(
                        string.Format(
                            "{0}://{1}:{2}/",
                            response.ResponseUri.Scheme,
                            response.ResponseUri.Host,
                            response.ResponseUri.Port));

                this.cookieContainer.SetCookies(cookieUri, cookies);
            }
        }

        #endregion
    }
}

namespace Feng.SimpleCrawler
{
    using System;
    using System.Collections.Generic;

    /// <summary>
    /// Options that control a crawl: limits, request configuration and link filters.
    /// </summary>
    [Serializable]
    public class CrawlSettings
    {
        #region Fields

        /// <summary>
        /// Backing field for <see cref="Depth"/>.
        /// </summary>
        private byte depth;

        /// <summary>
        /// Backing field for <see cref="LockHost"/>.
        /// </summary>
        private bool lockHost;

        /// <summary>
        /// Backing field for <see cref="ThreadCount"/>.
        /// </summary>
        private byte threadCount;

        /// <summary>
        /// Backing field for <see cref="Timeout"/>.
        /// </summary>
        private int timeout;

        /// <summary>
        /// Backing field for <see cref="UserAgent"/>.
        /// </summary>
        private string userAgent;

        #endregion

        #region Constructors and Destructors

        /// <summary>
        /// Initializes a new instance of the <see cref="CrawlSettings"/> class with its defaults:
        /// depth 3, host locking on, one thread, a timeout value of 15000, cookies kept, no
        /// automatic speed limit and empty filter lists.
        /// </summary>
        public CrawlSettings()
        {
            // Scalar defaults.
            this.depth = 3;
            this.lockHost = true;
            this.threadCount = 1;
            this.timeout = 15000;
            this.userAgent =
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.97 Safari/537.11";
            this.AutoSpeedLimit = false;
            this.KeepCookie = true;

            // Filter lists start empty; callers fill them through the read-only properties.
            this.SeedsAddress = new List<string>();
            this.RegularFilterExpressions = new List<string>();
            this.HrefKeywords = new List<string>();
            this.EscapeLinks = new List<string>();
        }

        #endregion

        #region Public Properties

        /// <summary>
        /// Gets or sets a value indicating whether requests are paced automatically.
        /// </summary>
        public bool AutoSpeedLimit { get; set; }

        /// <summary>
        /// Gets or sets the crawl depth.
        /// </summary>
        public byte Depth
        {
            get
            {
                return this.depth;
            }

            set
            {
                this.depth = value;
            }
        }

        /// <summary>
        /// Gets the list of link suffixes to skip.
        /// </summary>
        public List<string> EscapeLinks { get; private set; }

        /// <summary>
        /// Gets or sets a value indicating whether cookies are kept between requests.
        /// </summary>
        public bool KeepCookie { get; set; }

        /// <summary>
        /// Gets the list of keywords for hrefs.
        /// </summary>
        public List<string> HrefKeywords { get; private set; }

        /// <summary>
        /// Gets or sets a value indicating whether the crawl is locked to the host.
        /// </summary>
        public bool LockHost
        {
            get
            {
                return this.lockHost;
            }

            set
            {
                this.lockHost = value;
            }
        }

        /// <summary>
        /// Gets the list of regular expressions used to filter URLs.
        /// </summary>
        public List<string> RegularFilterExpressions { get; private set; }

        /// <summary>
        /// Gets the list of seed addresses.
        /// </summary>
        public List<string> SeedsAddress { get; private set; }

        /// <summary>
        /// Gets or sets the number of crawl threads.
        /// </summary>
        public byte ThreadCount
        {
            get
            {
                return this.threadCount;
            }

            set
            {
                this.threadCount = value;
            }
        }

        /// <summary>
        /// Gets or sets the request timeout value.
        /// </summary>
        public int Timeout
        {
            get
            {
                return this.timeout;
            }

            set
            {
                this.timeout = value;
            }
        }

        /// <summary>
        /// Gets or sets the User-Agent string sent with requests.
        /// </summary>
        public string UserAgent
        {
            get
            {
                return this.userAgent;
            }

            set
            {
                this.userAgent = value;
            }
        }

        #endregion
    }
}

namespace Feng.SimpleCrawler
{
    /// <summary>
    /// Crawl state that can be attached to a URL.
    /// </summary>
    public enum CrawlStatus
    {
        /// <summary>
        /// Completed (numeric value 1).
        /// </summary>
        Completed = 1,

        /// <summary>
        /// Never been (numeric value 2).
        /// </summary>
        NeverBeen = 2
    }
}

namespace Feng.SimpleCrawler
{
    using System;

    /// <summary>
    /// Callback signature for the crawler's "page received" notification.
    /// </summary>
    /// <param name="args">The received page and where it came from.</param>
    public delegate void DataReceivedEventHandler(DataReceivedEventArgs args);

    /// <summary>
    /// Data describing a page that has been received.
    /// </summary>
    public class DataReceivedEventArgs : EventArgs
    {
        #region Public Properties

        /// <summary>
        /// Gets or sets the URL of the page.
        /// </summary>
        public string Url { get; set; }

        /// <summary>
        /// Gets or sets the HTML text of the page.
        /// </summary>
        public string Html { get; set; }

        /// <summary>
        /// Gets or sets the crawl depth of the page.
        /// </summary>
        public int Depth { get; set; }

        #endregion
    }
}

namespace Feng.SimpleCrawler
{
    using System.Collections.Generic;
    using System.Threading;

    /// <summary>
    /// A queue whose operations are serialised with a lock, with an optional veto hook that runs
    /// before an item is enqueued.
    /// </summary>
    /// <typeparam name="T">The item type; must be a reference type.</typeparam>
    public abstract class SecurityQueue<T>
        where T : class
    {
        #region Fields

        /// <summary>
        /// The underlying queue; access it only while holding <see cref="SyncObject"/>.
        /// </summary>
        protected readonly Queue<T> InnerQueue = new Queue<T>();

        /// <summary>
        /// The lock object guarding <see cref="InnerQueue"/>.
        /// </summary>
        protected readonly object SyncObject = new object();

        /// <summary>
        /// Event that is set on every call to <see cref="EnQueue"/>.
        /// </summary>
        private readonly AutoResetEvent autoResetEvent;

        #endregion

        #region Constructors and Destructors

        /// <summary>
        /// Initializes a new instance of the <see cref="SecurityQueue{T}"/> class with a
        /// non-signalled <see cref="AutoResetEvent"/>.
        /// </summary>
        protected SecurityQueue()
        {
            this.autoResetEvent = new AutoResetEvent(false);
        }

        #endregion

        #region Delegates

        /// <summary>
        /// Hook invoked before an item is enqueued.
        /// </summary>
        /// <param name="target">The item about to be enqueued.</param>
        /// <returns><c>true</c> to enqueue the item; <c>false</c> to drop it.</returns>
        public delegate bool BeforeEnQueueEventHandler(T target);

        #endregion

        #region Public Events

        /// <summary>
        /// Raised inside the lock before each enqueue; its return value decides whether the item
        /// is stored.
        /// </summary>
        public event BeforeEnQueueEventHandler BeforeEnQueueEvent;

        #endregion

        #region Public Properties

        /// <summary>
        /// Gets the event that is set on every call to <see cref="EnQueue"/>.
        /// </summary>
        public AutoResetEvent AutoResetEvent
        {
            get
            {
                return this.autoResetEvent;
            }
        }

        /// <summary>
        /// Gets the number of queued items.
        /// </summary>
        public int Count
        {
            get
            {
                lock (this.SyncObject)
                {
                    return this.InnerQueue.Count;
                }
            }
        }

        /// <summary>
        /// Gets a value indicating whether the queue holds at least one item.
        /// </summary>
        public bool HasValue
        {
            get
            {
                return this.Count != 0;
            }
        }

        #endregion

        #region Public Methods and Operators

        /// <summary>
        /// Removes and returns the item at the head of the queue.
        /// </summary>
        /// <returns>The head item, or <c>null</c> when the queue is empty.</returns>
        public T DeQueue()
        {
            lock (this.SyncObject)
            {
                if (this.InnerQueue.Count > 0)
                {
                    return this.InnerQueue.Dequeue();
                }

                return default(T);
            }
        }

        /// <summary>
        /// Enqueues an item unless a <see cref="BeforeEnQueueEvent"/> handler rejects it, then sets
        /// <see cref="AutoResetEvent"/> (whether or not the item was stored).
        /// </summary>
        /// <param name="target">The item to enqueue.</param>
        public void EnQueue(T target)
        {
            lock (this.SyncObject)
            {
                // Subscription changes do not take SyncObject, so read the delegate once; checking
                // the event and then invoking it could hit a handler list that has become null.
                BeforeEnQueueEventHandler beforeEnQueue = this.BeforeEnQueueEvent;
                if (beforeEnQueue == null || beforeEnQueue(target))
                {
                    this.InnerQueue.Enqueue(target);
                }

                this.AutoResetEvent.Set();
            }
        }

        #endregion
    }
}

namespace Feng.SimpleCrawler
{
    /// <summary>
    /// A URL together with its crawl depth and status.
    /// </summary>
    public class UrlInfo
    {
        #region Fields

        /// <summary>
        /// The URL text supplied at construction; never reassigned.
        /// </summary>
        private readonly string address;

        #endregion

        #region Constructors and Destructors

        /// <summary>
        /// Initializes a new instance of the <see cref="UrlInfo"/> class.
        /// </summary>
        /// <param name="urlString">The URL text, stored as given.</param>
        public UrlInfo(string urlString)
        {
            this.address = urlString;
        }

        #endregion

        #region Public Properties

        /// <summary>
        /// Gets the URL text passed to the constructor.
        /// </summary>
        public string UrlString
        {
            get
            {
                return this.address;
            }
        }

        /// <summary>
        /// Gets or sets the crawl depth of the URL.
        /// </summary>
        public int Depth { get; set; }

        /// <summary>
        /// Gets or sets the crawl status of the URL.
        /// </summary>
        public CrawlStatus Status { get; set; }

        #endregion
    }
}

namespace Feng.SimpleCrawler
{
    /// <summary>
    /// The process-wide queue of <see cref="UrlInfo"/> items, exposed as a single shared instance.
    /// </summary>
    public class UrlQueue : SecurityQueue<UrlInfo>
    {
        #region Constructors and Destructors

        /// <summary>
        /// Prevents instances from being created outside this class.
        /// </summary>
        private UrlQueue()
        {
        }

        #endregion

        #region Public Properties

        /// <summary>
        /// Gets the single shared <see cref="UrlQueue"/>.
        /// </summary>
        public static UrlQueue Instance
        {
            get
            {
                return Holder.Value;
            }
        }

        #endregion

        /// <summary>
        /// Holds the shared instance in a static field of a nested class.
        /// </summary>
        private static class Holder
        {
            #region Static Fields

            /// <summary>
            /// The one and only <see cref="UrlQueue"/>.
            /// </summary>
            internal static readonly UrlQueue Value = new UrlQueue();

            #endregion
        }
    }
}
using Feng.SimpleCrawler;
using Feng.DbHelper;
using Feng.Log;
using HtmlAgilityPack;

namespace Feng.Demo
{
    /// <summary>
    /// Windows service that runs the crawler, extracts blog metadata from each received page and
    /// stores it through <c>MssqlHelper</c>.
    /// </summary>
    partial class FengCnblogsService : ServiceBase
    {
        #region Constructor

        /// <summary>
        /// Initializes a new instance of the <see cref="FengCnblogsService"/> class.
        /// </summary>
        public FengCnblogsService()
        {
            InitializeComponent();
        }

        #endregion

        #region Fields

        /// <summary>
        /// Settings for the crawler.
        /// </summary>
        private static readonly CrawlSettings Settings = new CrawlSettings();

        /// <summary>
        /// In-memory table that buffers extracted records until they are written to the database.
        /// </summary>
        private static DataTable dt = new DataTable();

        /// <summary>
        /// Filter used to recognise URLs that were already reported.
        /// </summary>
        private static BloomFilter<string> filter;

        /// <summary>
        /// The running crawler; kept so that <see cref="OnStop"/> can stop its threads.
        /// </summary>
        private CrawlMaster master;

        #endregion

        #region Start service

        /// <summary>
        /// Starts the crawl when the service starts.
        /// </summary>
        /// <param name="args">Start arguments (unused).</param>
        protected override void OnStart(string[] args)
        {
            ProcessStart();
        }

        #endregion

        #region Stop service

        /// <summary>
        /// Stops the crawler threads when the service stops.
        /// </summary>
        protected override void OnStop()
        {
            if (this.master != null)
            {
                this.master.Stop();
                this.master = null;
            }
        }

        #endregion

        #region Start processing

        /// <summary>
        /// Prepares the buffer table and the URL filter, configures the crawl settings, wires the
        /// event handlers and starts the crawler.
        /// </summary>
        private void ProcessStart()
        {
            dt.Columns.Add("BlogTitle", typeof(string));
            dt.Columns.Add("BlogUrl", typeof(string));
            dt.Columns.Add("BlogAuthor", typeof(string));
            dt.Columns.Add("BlogTime", typeof(string));
            dt.Columns.Add("BlogMotto", typeof(string));
            dt.Columns.Add("BlogDepth", typeof(string));

            filter = new BloomFilter<string>(200000);
            const string CityName = "";

            #region Seed addresses

            // NOTE(review): every seed below is an empty string in this source; confirm the
            // intended seed URLs.
            Settings.SeedsAddress.Add(string.Format("{0}", CityName));
            Settings.SeedsAddress.Add("");
            Settings.SeedsAddress.Add("");
            Settings.SeedsAddress.Add("");
            Settings.SeedsAddress.Add("");
            Settings.SeedsAddress.Add("");
            Settings.SeedsAddress.Add("");
            Settings.SeedsAddress.Add("");
            Settings.SeedsAddress.Add("");

            #endregion

            #region URL keywords

            // Keywords "a/" through "z/", in alphabetical order.
            for (char letter = 'a'; letter <= 'z'; letter++)
            {
                Settings.HrefKeywords.Add(letter + "/");
            }

            #endregion

            // Number of crawl threads.
            Settings.ThreadCount = 1;

            // Crawl depth.
            Settings.Depth = 55;

            // Links to ignore, identified by suffix; several can be added.
            // NOTE(review): the suffix here is an empty string, which trivially satisfies
            // string.EndsWith; confirm the intended value.
            Settings.EscapeLinks.Add("");

            // Automatic speed limit (random 1 to 5 second pause between requests).
            Settings.AutoSpeedLimit = false;

            // Host locking: hosts that are equal after dropping the first label count as one site.
            Settings.LockHost = false;

            // NOTE(review): "[cnblogs]" is a character class matching a single character;
            // confirm whether the literal text "cnblogs" was intended.
            Settings.RegularFilterExpressions.Add(@"http://([w]{3}.)+[cnblogs]");

            this.master = new CrawlMaster(Settings);
            this.master.AddUrlEvent += MasterAddUrlEvent;
            this.master.DataReceivedEvent += MasterDataReceivedEvent;
            this.master.Crawl();
        }

        #endregion

        #region Print URL

        /// <summary>
        /// Decides whether a discovered URL is queued: a URL the filter has not seen is recorded,
        /// printed and accepted; any other URL is rejected.
        /// </summary>
        /// <param name="args">The candidate URL.</param>
        /// <returns><c>true</c> to add the URL to the queue; <c>false</c> to skip it.</returns>
        private static bool MasterAddUrlEvent(AddUrlEventArgs args)
        {
            if (filter.Contains(args.Url))
            {
                // false means: do not add to the queue.
                return false;
            }

            filter.Add(args.Url);
            Console.WriteLine(args.Url);

            if (dt.Rows.Count > 200)
            {
                MssqlHelper.InsertDb(dt);
                dt.Rows.Clear();
            }

            return true;
        }

        #endregion

        #region Parse HTML

        /// <summary>
        /// Extracts title, post date, author and motto from a received page, buffers and logs the
        /// record, and writes the buffer to the database once it holds more than 100 rows.
        /// </summary>
        /// <param name="args">The received page.</param>
        private static void MasterDataReceivedEvent(SimpleCrawler.DataReceivedEventArgs args)
        {
            // The page is parsed with HtmlAgilityPack; regular expressions or manual string
            // analysis would work here as well.
            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(args.Html);

            HtmlNode titleNode = doc.DocumentNode.SelectSingleNode("//title");
            HtmlNode timeNode = doc.DocumentNode.SelectSingleNode("//*[@id='post-date']");
            HtmlNode authorNode = doc.DocumentNode.SelectSingleNode("//*[@id='topics']/div/div[3]/a[1]");
            HtmlNode mottoNode = doc.DocumentNode.SelectSingleNode("//*[@id='blogTitle']/h2");

            // A page lacking any of these elements used to fail with a NullReferenceException;
            // skip such a page explicitly instead.
            if (titleNode == null || timeNode == null || authorNode == null || mottoNode == null)
            {
                return;
            }

            string title = titleNode.InnerText;
            string time = timeNode.InnerText;
            string author = authorNode.InnerText;
            string motto = mottoNode.InnerText;

            MssqlHelper.GetData(title, args.Url, author, time, motto, args.Depth.ToString(), dt);

            LogHelper.WriteLog(title);
            LogHelper.WriteLog(args.Url);
            LogHelper.WriteLog(author);
            LogHelper.WriteLog(time);
            LogHelper.WriteLog(motto == "" ? "null" : motto);
            LogHelper.WriteLog(args.Depth + "&dt.Rows.Count=" + dt.Rows.Count);

            // Flush to the database once more than 100 rows are buffered; adjust as needed.
            if (dt.Rows.Count > 100)
            {
                MssqlHelper.InsertDb(dt);
                dt.Rows.Clear();
            }
        }

        #endregion
    }
}
