C#简单爬虫实现
一、环境
.NET 6.0
Visual Studio 2022 控制台应用程序
NuGet 引入:
AngleSharp 1.1.0 用于HTML解析
Downloader 3.0.6 用于下载文件
ShellProgressBar 5.2.0 用于进度条显示
二、效果
三、相关代码
1.Program.cs
using ShellProgressBar;
using Spider;
using System.Collections;

// Entry point of the crawler: fetches one article page, collects the URLs of
// every <img> inside the article body, and downloads them one by one into an
// "Images" folder next to the executable while rendering nested progress bars.

var url = "https://blog.csdn.net/u011127019/article/details/124248757";
var pageUri = new Uri(url);
var data = await HttpHelper.GetHtmlDocument(url);
DownloadHandler downloadHandler = new DownloadHandler();

// Build a single gallery from the images found in the article body.
var imageUrls = new List<string>();
ImageList imageList1 = new ImageList
{
    Name = "图片目录",
    Images = imageUrls
};
foreach (var item in data.QuerySelectorAll("#article_content img"))
{
    // The selector already yields the <img> elements, so read "src" directly.
    var href = item.GetAttribute("src");
    if (string.IsNullOrWhiteSpace(href))
    {
        continue;
    }

    // Resolve relative and protocol-relative sources against the page URL.
    if (!Uri.TryCreate(pageUri, href, out var imageUri))
    {
        continue;
    }

    // Inline "data:" images and other non-HTTP schemes cannot be downloaded.
    if (imageUri.Scheme != Uri.UriSchemeHttp && imageUri.Scheme != Uri.UriSchemeHttps)
    {
        continue;
    }

    imageList1.ImageCount++;
    imageUrls.Add(imageUri.AbsoluteUri);
}

List<ImageList> imageList = new List<ImageList> { imageList1 };
var list = imageList; // Gallery list to download.

ProgressBarOptions barOptions = new()
{
    ProgressCharacter = '─',
    ProgressBarOnBottom = true,
    ForegroundColor = ConsoleColor.Yellow,
    ForegroundColorDone = ConsoleColor.DarkGreen,
    BackgroundColor = ConsoleColor.DarkGray,
    BackgroundCharacter = '\u2593'
};
ProgressBarOptions childBarOptions = new()
{
    ForegroundColor = ConsoleColor.Green,
    BackgroundColor = ConsoleColor.DarkGreen,
    ProgressCharacter = '─'
};

// Path.Combine keeps the path portable; the directory is created up front so
// the first download does not depend on it already existing.
var outputDirectory = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Images");
Directory.CreateDirectory(outputDirectory);

// The file index is shared by all galleries so file names never collide.
int i = 1;

using var bar = new ProgressBar(list.Count, "正在下载所有图片", barOptions);
foreach (var item in list)
{
    bar.Message = $"图集:{item.Name}";
    var images = item.Images ?? new List<string>();

    // One child bar per gallery, advanced once per finished image.
    using (var childBar = bar.Spawn(images.Count, $"图集:{item.Name}", childBarOptions))
    {
        foreach (var imgUrl in images)
        {
            childBar.Message = $"图片:{imgUrl}";
            var filePath = Path.Combine(outputDirectory, i + GetImageExtension(imgUrl));
            await downloadHandler.Download(childBar, imgUrl, filePath);
            childBar.Tick();
            i++;
        }
    }

    // The gallery counts as done only after all of its images were processed.
    bar.Tick();
}

// Returns the file extension (including the leading dot) to use for an image
// URL, or an empty string when none can be determined. The extension is taken
// from the URL path so query strings do not interfere; if the path has none,
// the URL is searched for ".jpg" / ".png" anywhere, as a last resort.
static string GetImageExtension(string imageUrl)
{
    if (Uri.TryCreate(imageUrl, UriKind.Absolute, out var uri))
    {
        var extension = Path.GetExtension(uri.AbsolutePath);
        if (!string.IsNullOrEmpty(extension))
        {
            return extension;
        }
    }

    if (imageUrl.Contains(".jpg"))
    {
        return ".jpg";
    }

    if (imageUrl.Contains(".png"))
    {
        return ".png";
    }

    return string.Empty;
}
2.HttpHelper.cs
using AngleSharp.Html.Dom;
using AngleSharp.Html.Parser;
using Downloader;
using System.Net;
using System.Text;

namespace Spider
{
    /// <summary>
    /// Shared HTTP infrastructure for the crawler: a single <see cref="HttpClient"/>
    /// used to fetch pages, a single file download service, and helpers that
    /// fetch a URL and parse the response into an AngleSharp HTML document.
    /// </summary>
    public static class HttpHelper
    {
        /// <summary>User-Agent header value sent with page requests and file downloads.</summary>
        public const string UserAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36";

        /// <summary>Download service shared by all callers; created once in the static constructor.</summary>
        public static IDownloadService Downloader { get; }

        /// <summary>
        /// Download settings used to create <see cref="Downloader"/>.
        /// Every access builds a new configuration instance.
        /// </summary>
        public static DownloadConfiguration DownloadConf => new()
        {
            BufferBlockSize = 10240, // Hosts usually support at most 8000 bytes; the default is 8000.
            ChunkCount = 8, // Number of parts the file is split into; the default is 1.
            // MaximumBytesPerSecond = 1024 * 50, // Download speed limit; the default is zero (unlimited).
            MaxTryAgainOnFailover = 5, // Maximum number of retries on failure.
            ParallelDownload = true, // Whether parts are downloaded in parallel; the default is false.
            Timeout = 1000, // Timeout of each stream reader in milliseconds; the default is 1000.
            RequestConfiguration =
            {
                Accept = "*/*",
                AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate,
                CookieContainer = new CookieContainer(), // Add your cookies
                Headers = new WebHeaderCollection(), // Add your custom headers
                KeepAlive = true,
                ProtocolVersion = HttpVersion.Version11, // Default value is HTTP 1.1
                UseDefaultCredentials = false,
                UserAgent = UserAgent
            }
        };

        /// <summary>Message handler backing <see cref="Client"/>.</summary>
        public static HttpClientHandler Handler { get; }

        /// <summary>HTTP client shared by all page requests.</summary>
        public static HttpClient Client { get; }

        static HttpHelper()
        {
            // Legacy code pages (e.g. gbk, gb2312) are not available on .NET Core / .NET 5+
            // until this provider is registered; without it GetEncoding(charset) throws.
            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

            Handler = new HttpClientHandler();
            Client = new HttpClient(Handler);
            Client.DefaultRequestHeaders.Add("User-Agent", UserAgent);
            Downloader = new DownloadService(DownloadConf);
        }

        /// <summary>
        /// Fetches <paramref name="url"/> as a string and parses it as HTML.
        /// </summary>
        /// <param name="url">Address of the page to fetch.</param>
        /// <returns>The parsed HTML document.</returns>
        /// <exception cref="HttpRequestException">The request failed or returned a non-success status code.</exception>
        public static async Task<IHtmlDocument> GetHtmlDocument(string url)
        {
            var html = await Client.GetStringAsync(url);
            return new HtmlParser().ParseDocument(html);
        }

        /// <summary>
        /// Fetches <paramref name="url"/>, decodes the raw response bytes with the
        /// encoding named by <paramref name="charset"/>, and parses the text as HTML.
        /// </summary>
        /// <param name="url">Address of the page to fetch.</param>
        /// <param name="charset">Name of the text encoding of the page, e.g. "utf-8" or "gbk".</param>
        /// <returns>The parsed HTML document.</returns>
        /// <exception cref="HttpRequestException">The request failed or returned a non-success status code.</exception>
        /// <exception cref="ArgumentException"><paramref name="charset"/> is not a supported encoding name.</exception>
        public static async Task<IHtmlDocument> GetHtmlDocument(string url, string charset)
        {
            // Dispose the response so its connection and buffers are released promptly.
            using var res = await Client.GetAsync(url);

            // Fail on error statuses, consistent with the GetStringAsync-based overload,
            // instead of silently parsing an error page.
            res.EnsureSuccessStatusCode();

            var resBytes = await res.Content.ReadAsByteArrayAsync();
            var resStr = Encoding.GetEncoding(charset).GetString(resBytes);
            return new HtmlParser().ParseDocument(resStr);
        }
    }
}
3.DownloadHandler.cs
using Downloader;
using ShellProgressBar;
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Diagnostics;
using System.Linq;
using System.Runtime.InteropServices;
using System.Text;
using System.Threading.Tasks;

namespace Spider
{
    /// <summary>
    /// Downloads a single file through the shared <see cref="HttpHelper.Downloader"/>
    /// while showing its progress as a child of an existing progress bar.
    /// </summary>
    public class DownloadHandler
    {
        /// <summary>
        /// Downloads <paramref name="url"/> to <paramref name="filepath"/> and renders a
        /// 0-100 percentage bar spawned under <paramref name="bar"/> for the duration.
        /// </summary>
        /// <param name="bar">Parent progress bar under which the percentage bar is spawned.</param>
        /// <param name="url">Address of the file to download.</param>
        /// <param name="filepath">Destination path of the downloaded file.</param>
        /// <returns>A task that completes when the download call has finished.</returns>
        public async Task Download(IProgressBar bar, string url, string filepath)
        {
            var barOptions = new ProgressBarOptions
            {
                ForegroundColor = ConsoleColor.Yellow,
                BackgroundColor = ConsoleColor.DarkYellow,
                ForegroundColorError = ConsoleColor.Red,
                ForegroundColorDone = ConsoleColor.Green,
                BackgroundCharacter = '\u2593',
                ProgressBarOnBottom = true,
                EnableTaskBarProgress = RuntimeInformation.IsOSPlatform(OSPlatform.Windows),
                DisplayTimeInRealTime = false,
                ShowEstimatedDuration = false
            };

            // "using" disposes the bar exactly once, on both the success and failure paths.
            using var percentageBar = bar.Spawn(100, $"正在下载:{Path.GetFileName(url)}", barOptions);

            var downloader = HttpHelper.Downloader;
            downloader.DownloadStarted += DownloadStarted;
            downloader.DownloadFileCompleted += DownloadFileCompleted;
            downloader.DownloadProgressChanged += DownloadProgressChanged;
            try
            {
                await downloader.DownloadFileTaskAsync(url, filepath);
            }
            finally
            {
                // The download service is a shared static instance: handlers left attached
                // would keep firing for every later download and keep updating this
                // (already disposed) bar, and would accumulate with each call.
                downloader.DownloadStarted -= DownloadStarted;
                downloader.DownloadFileCompleted -= DownloadFileCompleted;
                downloader.DownloadProgressChanged -= DownloadProgressChanged;
            }

            void DownloadStarted(object? sender, DownloadStartedEventArgs e)
            {
                Trace.WriteLine(
                    $"图片, FileName:{Path.GetFileName(e.FileName)}, TotalBytesToReceive:{e.TotalBytesToReceive}");
            }

            void DownloadFileCompleted(object? sender, AsyncCompletedEventArgs e)
            {
                Trace.WriteLine($"下载完成, filepath:{filepath}");
            }

            void DownloadProgressChanged(object? sender, DownloadProgressChangedEventArgs e)
            {
                // The bar has 100 ticks, so the tick count equals the percentage.
                // Setting the tick directly avoids AsProgress<double>(), which treats
                // its argument as a 0..1 fraction and would overshoot with a 0..100 value.
                var percent = (int)Math.Clamp(e.ProgressPercentage, 0d, 100d);
                percentageBar.Tick(percent);
            }
        }
    }
}
4.Images.cs
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace Spider
{
    /// <summary>
    /// A named group of image URLs (a "gallery") to be downloaded together.
    /// </summary>
    public class ImageList
    {
        /// <summary>Display name of the gallery; an empty string until assigned.</summary>
        public string Name { get; set; } = string.Empty;

        /// <summary>
        /// Number of images in the gallery. This is a plain counter maintained by
        /// the caller; it is not derived from <see cref="Images"/>.
        /// </summary>
        public int ImageCount { get; set; }

        /// <summary>URLs of the images in the gallery; <c>null</c> until assigned.</summary>
        public List<string>? Images { get; set; }
    }
}
四、源码下载
链接:https://pan.baidu.com/s/1VnnH05Har9hUhxAsIfKSMw?pwd=paws
提取码:paws
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】博客园社区专享云产品让利特惠,阿里云新客6.5折上折
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 【.NET】调用本地 Deepseek 模型
· CSnakes vs Python.NET:高效嵌入与灵活互通的跨语言方案对比
· Plotly.NET 一个为 .NET 打造的强大开源交互式图表库
· DeepSeek “源神”启动!「GitHub 热点速览」
· 上周热点回顾(2.17-2.23)