HtmlAgilityPack Sample
从 HTML Table 中提取内部数据,并找出其中的重复值。
// Loads a previously saved HTML page from disk, reads an (id, content) pair from each
// row of the table body whose table has id "DDetail2", and prints every content value
// that occurs more than once together with the id of its first occurrence.
HtmlAgilityPack.HtmlWeb hw = new HtmlAgilityPack.HtmlWeb();

// Load a local file (it was captured earlier with a System.Net.Http.HttpClient POST).
HtmlAgilityPack.HtmlDocument doc = hw.Load(dir + "2019-12-03.html");

// Root node of the parsed document.
HtmlAgilityPack.HtmlNode rootNode = doc.DocumentNode;

// Locate the table's tbody.
string xpath = "//*[@id=\"DDetail2\"]/tbody";
HtmlAgilityPack.HtmlNode node = rootNode.SelectSingleNode(xpath);

if (node == null)
{
    // SelectSingleNode yields null when nothing matches; report it instead of
    // failing with a NullReferenceException further down.
    Console.Error.WriteLine($"No node matched XPath: {xpath}");
}
else
{
    // Remove the text nodes so the whitespace between <tr> elements does not
    // show up in ChildNodes. ToArray() snapshots the sequence because the tree
    // is being modified while it is walked.
    foreach (var textNode in node.Descendants("#Text").ToArray())
    {
        textNode.Remove();
    }

    if (node.ChildNodes.Count > 1)
    {
        List<dailyDetail> details = new List<dailyDetail>();

        // The last child is the blank "new row" of the table, so it is skipped.
        int dataRowCount = node.ChildNodes.Count - 1;
        for (int i = 0; i < dataRowCount; i++)
        {
            // Child lookups use XPath relative to the current row.
            HtmlAgilityPack.HtmlNode row = node.ChildNodes[i];
            var idInput = row.SelectSingleNode("td[1]/input[2]");
            var contentInput = row.SelectSingleNode("td[2]/input");

            // A missing <input> or a missing "value" attribute yields null rather
            // than throwing; rows with null content are filtered out below.
            details.Add(new dailyDetail()
            {
                dailyDetailId = idInput?.Attributes["value"]?.Value,
                dailyContent = contentInput?.Attributes["value"]?.Value
            });
        }

        // Find the duplicated content values. GroupBy keeps the source order inside
        // each group, so the group's first element is the first occurrence in the list.
        var duplicates = details
            .Where(d => d.dailyContent != null)
            .GroupBy(d => d.dailyContent)
            .Where(g => g.Skip(1).Any())
            .Select(g => new { dailyContent = g.Key, firstId = g.First().dailyDetailId })
            .ToList();

        foreach (var item in duplicates)
        {
            Console.WriteLine($"重复值:{item.dailyContent}");
            Console.WriteLine($"首个Id:{item.firstId}");
        }
    }
}