DotnetSpider5 爬博客园新闻
只要是写爬虫,就必须爬一下博客园。不知道为什么,反正大家都这样,就跟 Hello World 一样吧。
DotnetSpider 是非常优秀的爬虫框架,无论扩展性、易用性还是可读性都很出色。我已经跳进作者的坑 4 次了:DotnetSpider 现在的版本是 5,我是从版本 2 开始用的,最近打算跳入新坑。
版本5的文档 https://github.com/dotnetcore/DotnetSpider/wiki
爬博客园其实作者已经提供了 Sample,不过比较简单。
我这边为了跳新坑,重新改了一下,并对接了 MySQL。
/// <summary>
/// Spider that starts from two news.cnblogs.com URLs, follows the links found on each
/// downloaded page, extracts the title and body of news detail pages and stores them
/// in a MySQL table named <c>article</c>.
/// </summary>
public class CnblogsSpider : Spider
{
    /// <summary>
    /// Builds the spider host (Serilog logging, queue scheduler with hash-set based
    /// duplicate removal) and runs it until the crawl finishes.
    /// </summary>
    public static async Task RunAsync()
    {
        var builder = Builder.CreateDefaultBuilder<CnblogsSpider>();
        builder.UseSerilog();
        builder.UseQueueDistinctBfsScheduler<HashSetDuplicateRemover>();
        await builder.Build().RunAsync();
    }

    /// <summary>Forwards the framework-provided dependencies to the base spider.</summary>
    public CnblogsSpider(IOptions<SpiderOptions> options, SpiderServices services, ILogger<Spider> logger)
        : base(options, services, logger)
    {
    }

    /// <summary>
    /// Seeds the crawl with one news detail page and the first news list page, then
    /// registers the data flows in processing order: parser first, storage second.
    /// </summary>
    protected override async Task InitializeAsync(CancellationToken stoppingToken)
    {
        await AddRequestsAsync(new Request("https://news.cnblogs.com/n/666228/"));
        await AddRequestsAsync(new Request("https://news.cnblogs.com/n/page/1/"));
        AddDataFlow(new ListNewsParser());
        AddDataFlow(new MysqlNewStorage());
    }

    /// <summary>Returns a fresh GUID as the spider id and the fixed name "cnblogs".</summary>
    protected override (string Id, string Name) GetIdAndName()
    {
        return (Guid.NewGuid().ToString(), "cnblogs");
    }

    /// <summary>
    /// Storage data flow that writes parsed <see cref="Article"/> items to MySQL,
    /// skipping articles whose title is already present.
    /// </summary>
    protected class MysqlNewStorage : StorageBase
    {
        // NOTE(review): sContent is limited to 2000 characters; longer article bodies may be
        // rejected or truncated depending on the server's SQL mode — confirm this is intended.
        private const string CreateTableSql = @"
create table if not exists article
(
    id       int auto_increment primary key,
    title    varchar(500)  not null,
    sContent varchar(2000) null
);";

        private const string CountByTitleSql =
            "SELECT count(id) FROM article WHERE title = @Title";

        private const string InsertSql =
            "INSERT IGNORE INTO article (title, sContent) VALUES (@Title,@SContent);";

        /// <summary>
        /// Creates the <c>article</c> table if it does not exist yet. The database named
        /// in the "Default" connection string must already exist.
        /// </summary>
        public override async Task InitAsync()
        {
            await using var conn = CreateConnection();
            await conn.ExecuteAsync(CreateTableSql);
        }

        /// <summary>
        /// Persists the <see cref="Article"/> attached to <paramref name="context"/>, if any.
        /// Does nothing when the context carries no article or when a row with the same
        /// title already exists.
        /// </summary>
        protected override async Task StoreAsync(DataContext context)
        {
            // The parser stores the article under the type's full name; pages without
            // an article (e.g. list pages) yield nothing to store.
            object stored = context.GetData(typeof(Article).FullName);
            if (!(stored is Article article))
            {
                return;
            }

            await using var conn = CreateConnection();

            // The title is scraped from a web page and therefore untrusted: it must be
            // passed as a parameter, never concatenated into the SQL text.
            var existing = await conn.ExecuteScalarAsync<int>(CountByTitleSql, new { article.Title });
            if (existing > 0)
            {
                return;
            }

            await conn.ExecuteAsync(InsertSql, article);
        }

        // Opens nothing by itself; Dapper opens the connection on first use.
        private static MySqlConnection CreateConnection()
        {
            return new MySqlConnection(AppConfig.Configuration.GetConnectionString("Default"));
        }
    }

    /// <summary>
    /// Parser data flow: accepts only requests whose URL contains news.cnblogs.com,
    /// queues follow-up requests from the whole page, and extracts an
    /// <see cref="Article"/> from pages that contain the <c>news_main</c> block.
    /// </summary>
    protected class ListNewsParser : DataParser
    {
        // Dots are escaped so that only the literal host name matches.
        private const string NewsHostPattern = @"news\.cnblogs\.com";

        public ListNewsParser()
        {
            AddRequiredValidator(request => Regex.IsMatch(request.RequestUri.ToString(), NewsHostPattern));

            // "." selects the whole document as the source of follow-up links. To restrict
            // following to the pager only, use ".//div[@class='pager']" instead.
            AddFollowRequestQuerier(Selectors.XPath("."));
        }

        /// <summary>
        /// Extracts title and content from a news detail page and attaches them to the
        /// context under the full type name of <see cref="Article"/>. Pages without a
        /// <c>news_main</c> block or without a title produce no data.
        /// </summary>
        protected override Task Parse(DataContext context)
        {
            var newsMain = context.Selectable.Select(Selectors.XPath(".//div[@id='news_main']"));
            if (newsMain == null)
            {
                return Task.CompletedTask;
            }

            // Either inner node may be missing, so every step is null-safe.
            var title = newsMain.Select(Selectors.XPath(".//div[@id='news_title']"))?.Value?.Trim();
            var content = newsMain.Select(Selectors.XPath(".//div[@id='news_content']"))?.Value?.Trim();

            // The title column is NOT NULL and is used as the duplicate key, so an
            // article without a title cannot be stored meaningfully.
            if (string.IsNullOrEmpty(title))
            {
                return Task.CompletedTask;
            }

            context.AddData(typeof(Article).FullName, new Article
            {
                Title = title,
                SContent = content
            });

            return Task.CompletedTask;
        }
    }

    /// <summary>A crawled news article; property names match the SQL parameter names.</summary>
    public class Article
    {
        /// <summary>Trimmed article title.</summary>
        public string Title { get; set; }

        /// <summary>Trimmed article body; may be null when the page has no content node.</summary>
        public string SContent { get; set; }
    }
}
源码(https://files.cnblogs.com/files/leoxjy/ConsoleDotnetSpider5Sample.zip)