HtmlAgilityPack 网页数据抓取基础应用
// Parse the HTML text into a DOM. `html` is left empty here; supply the page markup to scan.
var doc = new HtmlAgilityPack.HtmlDocument();
string html = "";
doc.LoadHtml(html);

// Declared first (as null) so that the lambda assigned further down can call itself recursively.
Func<HtmlAgilityPack.HtmlNodeCollection, string, Dictionary<string, string>, bool, List<HtmlAgilityPack.HtmlNode>> getNodeByTagNameAndAttr = null;

// Node predicate.
//   c       - node under test
//   tagName - required tag name (compared with ==, i.e. case-sensitive)
//   Attr    - attribute name/value pairs that must all be present on the node;
//             null or empty means "match on tag name only"
// Returns true when the tag name matches and every required pair is found on the node.
Func<HtmlAgilityPack.HtmlNode, string, Dictionary<string, string>, bool> fun_Match = (c, tagName, Attr) =>
{
    if (c.Name != tagName)
    {
        return false;
    }

    if (Attr == null || Attr.Count == 0)
    {
        return true;
    }

    // Each required pair is checked independently, so an element that repeats
    // an attribute name cannot skew the result the way a match-count comparison would.
    return Attr.All(want => c.Attributes.Any(have => have.Name == want.Key && have.Value == want.Value));
};

// Depth-first search over `nodes` and their descendants.
//   nodes   - nodes to scan; null yields an empty list
//   tagName - tag name to look for
//   Attr    - attribute constraints passed through to fun_Match
//   all     - true: collect every match; false: stop at the first match
// Returns the matches in document order (a node precedes its own descendants); never null.
getNodeByTagNameAndAttr = (nodes, tagName, Attr, all) =>
{
    var found = new List<HtmlAgilityPack.HtmlNode>();
    if (nodes == null)
    {
        return found;
    }

    foreach (var c in nodes)
    {
        if (fun_Match(c, tagName, Attr))
        {
            found.Add(c);
            if (!all)
            {
                // First-match mode: no need to look at descendants or siblings.
                break;
            }
        }

        if (c.HasChildNodes)
        {
            // The recursive call always returns a list, so it can be appended directly.
            found.AddRange(getNodeByTagNameAndAttr(c.ChildNodes, tagName, Attr, all));
            if (!all && found.Count > 0)
            {
                // First-match mode: a descendant matched, stop scanning siblings.
                break;
            }
        }
    }

    return found;
};

// Collect the <a> elements beneath doc.DocumentNode.ChildNodes. The dictionary may list attribute
// name/value pairs that must match, and the last argument selects all matches (true) or only the first (false).
List<HtmlAgilityPack.HtmlNode> node = getNodeByTagNameAndAttr(doc.DocumentNode.ChildNodes, "a", new Dictionary<string, string> { }, true);