HtmlAgilityPack Sample

从 HTML Table 中提取内部数据,并找出其中的重复值。

                // Extracts (id, content) pairs from the rows of the "DDetail2" table in a
                // locally saved HTML page and prints every content value that occurs more
                // than once, together with the id of its first occurrence.
                HtmlAgilityPack.HtmlWeb hw = new HtmlAgilityPack.HtmlWeb();
                // Load the local file (it was captured earlier via a System.Net.Http.HttpClient POST).
                HtmlAgilityPack.HtmlDocument doc = hw.Load(dir + "2019-12-03.html");
                // Take the document root node.
                HtmlAgilityPack.HtmlNode rootNode = doc.DocumentNode;
                // Locate the table's tbody.
                string xpath = "//*[@id=\"DDetail2\"]/tbody";
                HtmlAgilityPack.HtmlNode node = rootNode.SelectSingleNode(xpath);
                // SelectSingleNode yields null when the XPath matches nothing (e.g. the saved
                // markup has no explicit <tbody>), so every use of node below is guarded.
                if (node != null)
                {
                    // Remove the #text nodes so that ChildNodes only indexes the rows
                    // (otherwise the whitespace between <tr> elements is counted as children).
                    foreach (var textNode in node.Descendants("#Text").ToArray())
                    {
                        textNode.Remove();
                    }
                }
                if (node != null && node.ChildNodes.Count > 1)
                {
                    List<dailyDetail> li = new List<dailyDetail>();
                    // node.ChildNodes.Count - 1: skip the last row, which is the "new entry" row.
                    for (int i = 0; i < node.ChildNodes.Count - 1; i++)
                    {
                        var row = node.ChildNodes[i];
                        // Child lookups use XPath relative to the current row.
                        var id = row.SelectSingleNode($"td[1]/input[2]");
                        var text = row.SelectSingleNode($"td[2]/input");
                        // A missing <input> or a missing "value" attribute yields null instead of
                        // throwing; rows with null content are filtered out by the query below.
                        li.Add(new dailyDetail()
                        {
                            dailyDetailId = id?.Attributes["value"]?.Value,
                            dailyContent = text?.Attributes["value"]?.Value
                        });
                    }
                    // Find the duplicated content values. Elements inside each group keep their
                    // source order, so g.First() is the first row carrying that content; taking
                    // it here avoids rescanning the whole list once per duplicate.
                    var query = (from dd in li
                                 where dd.dailyContent != null
                                 group dd by dd.dailyContent into g
                                 where g.Count() > 1
                                 select new
                                 {
                                     dailyContent = g.Key,
                                     firstId = g.First().dailyDetailId
                                 }).ToList();

                    foreach (var item in query)
                    {
                        Console.WriteLine($"重复值:{item.dailyContent}");
                        Console.WriteLine($"首个Id:{item.firstId}");
                    }
                }

  

posted @ 2020-05-13 17:27  devs  阅读(232)  评论(0编辑  收藏  举报