C# html数据爬取与过滤
1.首先安装第三方HTML数据过滤包 HtmlAgilityPack
我爬取的网站是一个树洞网站:https://i.jandan.net/treehole,他是一个单体网站,不通过api请求,所以只能根据HTML过滤,他的分页是通过base64加密的
这是获取到的部分数据,这是我们需要的有效数据,他是有固定结构的,我们只要筛选这里面的数据显示出来就好了
以下是所有代码
using HtmlAgilityPack; using System.Text; using System.Text.RegularExpressions; await ReadHtml(); while (true) { Console.WriteLine("请输入页码:"); var pageNumByNowDate = Console.ReadLine(); if (!int.TryParse(pageNumByNowDate, out int pageNum)) { Console.WriteLine("页码格式不正确,请重新输入数字"); continue; } Console.Clear(); byte[] buffer = Encoding.UTF8.GetBytes(DateTime.Now.ToString("yyyyMMdd") + "-" + pageNum); await ReadHtml("/" + Convert.ToBase64String(buffer)); } //读取html async Task ReadHtml(string url = "") { var client = new HttpClient(); var request = new HttpRequestMessage(HttpMethod.Get, "https://i.jandan.net/treehole" + url); request.Headers.Add("Cookie", "PHPSESSID=38e64nulb56bqgl6e27b5sp31l"); var response = await client.SendAsync(request); response.EnsureSuccessStatusCode(); ClearData(await response.Content.ReadAsStringAsync()); } //清洗脏数据 void ClearData(string html) { HtmlDocument doc = new HtmlDocument(); doc.LoadHtml(html); var commentTexts = doc.DocumentNode.SelectNodes("//div[@class='commenttext']"); foreach (var comment in commentTexts) { var commonText = comment.InnerText.Length > 30 ? string.Join("", comment.InnerText.Select((c, i) => (i > 0 && i % 50 == 0) ? $"{c}\r\n" : c.ToString().Trim().Replace("<br/>", "").Replace("<br />", ""))) : comment.InnerText.Trim().Replace("<br/>", "").Replace("<br />", ""); var childrens = comment.ParentNode.SelectNodes(".//span[@class='tucao-unlike-container']"); Console.WriteLine(); Console.WriteLine(); Console.WriteLine(commentTexts.IndexOf(comment) + 1 + "、 " + commonText); Console.WriteLine(); Console.WriteLine(string.Join(" ", childrens.Select(c=>c.InnerText.Trim()))); Console.WriteLine("_____________________________________________________________"); } }
输出效果