.NET HttpCrawler
using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;

namespace HttpCrawler
{
    /// <summary>
    /// Console crawler: downloads the forum index page at bbs.csdn.net, follows the
    /// first link of every listed row in parallel, and keeps the pages that contain
    /// a <c>name2nick</c> span whose text is "sp1234". The elapsed time is printed.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Runs the crawl once, prints the elapsed milliseconds and waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            Stopwatch sw = new Stopwatch();
            sw.Start();

            // SelectSingleNode returns null when the XPath matches nothing (e.g. the
            // site layout changed); fall back to an empty row set instead of crashing.
            var table = GetHtml("http://bbs.csdn.net/forums/DotNET/")
                .DocumentNode
                .SelectSingleNode("//table[@class='table_list parent_forum ']");
            var rows = table != null
                ? table.Elements("tr").Skip(1) // Skip(1): the first row is skipped (presumably the header row)
                : Enumerable.Empty<HtmlNode>();

            var titles = from row in rows
                         let td = row.Element("td")
                         where td != null
                         let a = td.Descendants("a").FirstOrDefault()
                         where a != null
                         // Attributes["href"] is null for anchors without an href; read it
                         // defensively so the null filter in the next query can do its job.
                         let hrefAttribute = a.Attributes["href"]
                         select new
                         {
                             href = hrefAttribute != null ? hrefAttribute.Value : null,
                             text = a.InnerText
                         };

            var pages = from t in titles.AsParallel().WithDegreeOfParallelism(20)
                        where t.href != null
                        let path = "http://bbs.csdn.net" + t.href
                        // SelectNodes returns null (not an empty collection) when nothing
                        // matches, so it must be null-checked before enumerating.
                        let nicks = GetHtml(path).DocumentNode.SelectNodes("//span[@class='name2nick']")
                        where nicks != null && nicks.Any(nick => nick.InnerText == "sp1234")
                        select new { title = t.text, href = path };

            // ToList forces the deferred (parallel) query to execute inside the timed region.
            var results = pages.ToList();
            sw.Stop();

            // NOTE(review): the label reads "time without concurrency", yet the query above
            // runs with AsParallel — confirm which variant this message is meant to describe.
            Console.WriteLine("不加并发的时间:"+sw.ElapsedMilliseconds);
            Console.ReadKey();
        }

        /// <summary>
        /// Downloads <paramref name="url"/>, decodes the response bytes as UTF-8 and
        /// parses the text into an <see cref="HtmlDocument"/>.
        /// </summary>
        /// <param name="url">Absolute URL of the page to fetch.</param>
        /// <returns>The parsed HTML document.</returns>
        static HtmlDocument GetHtml(string url)
        {
            string content;
            // WebClient is IDisposable; dispose it so parallel callers do not leak one per request.
            using (var client = new WebClient())
            {
                content = Encoding.UTF8.GetString(client.DownloadData(url));
            }

            var doc = new HtmlDocument();
            using (var reader = new StringReader(content))
            {
                doc.Load(reader);
            }
            return doc;
        }
    }
}