关于使用HtmlAgilityPack

请直接看代码:

 

        /// <summary>
        
/// 根据输入的地址获取其文档节点对象
        
/// </summary>
        
/// <param name="url">地址</param>
        
/// <returns></returns>
        public static HtmlAgilityPack.HtmlNode GetHtmlNodeFromLink(string url)
        {
            try{
                Uri uri = new Uri(url);

                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);
                WebResponse response = request.GetResponse();

                Stream stream = response.GetResponseStream();
                StreamReader read = new StreamReader(stream, Encoding.GetEncoding("gb2312"));
                string str = read.ReadToEnd();

                HtmlAgilityPack.HtmlDocument html = new HtmlAgilityPack.HtmlDocument();
                html.LoadHtml(str);
                return html.DocumentNode;
            }
            catch{return null;}
        }

        /// <summary>
        
/// 根据输入的URL地址输出指定XPATH下的节点集合
        
/// </summary>
        
/// <param name="url">地址</param>
        
/// <param name="xPath">过滤地址</param>
        
/// <param name="imgs">过滤地址</param>
        
/// <param name="links">过滤地址</param>
        
/// <param name="title">标题</param>
        
/// <returns></returns>
        public static bool GetGalleryInfo(HtmlAgilityPack.HtmlNode htmlNode,string xPath,ref string[] imgs, ref string[] links,ref string[] title)
        {
            try
            {
                HtmlNodeCollection hnc = htmlNode.SelectNodes(xPath);//"//div[@class='slideBannerA homeSlideAD1']"
                if (hnc.Count < 1)
                    return false;
                links = new string[hnc.Count];
                title = new string[hnc.Count];
                imgs = new string[hnc.Count];
                int i = 0;
                string cateDataRegex = @"background-image:url\((?<image>.+)\)";
                Regex re = new Regex(cateDataRegex, RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace);
                foreach (HtmlNode node in hnc)
                {
                    HtmlAttributeCollection hac = node.Attributes;
                    links[i] = hac["href"].Value;
                    imgs[i] = hac["style"] == null ? hac["src2"].Value : re.Match(hac["style"].Value).Groups["image"].Value;
                    title[i++] = string.IsNullOrEmpty(hac["title"].Value) ? hac["alt"].Value : hac["title"].Value;
                }
                return true;
            }
            catch { return false; }
        }
        
        //调用 
        
            string[] strLink;
            string[] strLinAlt;
            string[] strImg;
            string urls = "http://www.newegg.com.cn";
            HtmlAgilityPack.HtmlNode nodes = GetHtmlNodeFromLink(urls);
            GetGalleryInfo(nodes, "//div[@class='slideBannerA homeSlideAD1']/div[1]/div[1]/a"out strImg, out strLink,out strLinAlt);

 

淘宝今日活动:

/// <summary>
        /// Reads the Taobao "today's promotions" gallery: for every node selected by
        /// <paramref name="xPath"/>, takes the link from the node's second child and
        /// the image URL from that child's first child.
        /// </summary>
        /// <param name="htmlNode">Root node of the page to query.</param>
        /// <param name="xPath">XPath expression selecting the gallery items.</param>
        /// <param name="imgs">Receives the <c>src</c> attribute value of each item's image.</param>
        /// <param name="links">Receives the <c>href</c> attribute value of each item's link.</param>
        /// <returns>
        /// <c>true</c> when at least one item matched and every item could be read.
        /// <c>false</c> otherwise: both arrays are empty when <paramref name="htmlNode"/> is
        /// null or nothing matched, and may be only partly filled when an item was malformed.
        /// </returns>
        /// <example>
        /// <code>
        /// string[] strLink;
        /// string[] strImg;
        /// HtmlAgilityPack.HtmlNode nodes = GetHtmlNodeFromLink("http://www.taobao.com");
        /// GetTaobaoGalleryInfo(nodes, "//div[@class='sub-promotion-content']/div[@class='ks-switchable-content zoom']/ul/li", out strImg, out strLink);
        /// </code>
        /// </example>
        public static bool GetTaobaoGalleryInfo(HtmlAgilityPack.HtmlNode htmlNode, string xPath, out string[] imgs, out string[] links)
        {
            // out parameters must be assigned on every return path.
            links = new string[0];
            imgs = new string[0];

            if (htmlNode == null)
            {
                return false;
            }

            // SelectNodes yields null rather than an empty collection when nothing matches.
            HtmlNodeCollection hnc = htmlNode.SelectNodes(xPath);
            if (hnc == null || hnc.Count < 1)
            {
                return false;
            }

            links = new string[hnc.Count];
            imgs = new string[hnc.Count];
            try
            {
                int i = 0;
                foreach (HtmlNode node in hnc)
                {
                    // ChildNodes[1] is used for the link and its first child for the image.
                    HtmlNode anchor = node.ChildNodes[1];
                    links[i] = anchor.Attributes["href"].Value;
                    imgs[i++] = anchor.ChildNodes[0].Attributes["src"].Value;
                }
                return true;
            }
            catch
            {
                // Deliberate best effort: an item with an unexpected shape
                // (missing child or attribute) makes the whole call fail.
                return false;
            }
        }

 

 

 //今日炸弹
            HtmlAgilityPack.HtmlNode nodes = GetHtmlNodeFromLink("http://www.newegg.com.cn/");

            HtmlAgilityPack.HtmlNode node = nodes.SelectSingleNode("//div[@class='colSub']/div[@class='picBanner shellShocker ']/a");//"//div[@class='slideBannerA homeSlideAD1']"
           
            string strImg = node.Attributes["href"].Value;
            string strSrc= node.ChildNodes[0].Attributes["src"].Value;

 

 

            //淘宝类别活动
            HtmlAgilityPack.HtmlNode nodes = GetHtmlNodeFromLink("http://www.taobao.com");
            HtmlAgilityPack.HtmlNodeCollection node = nodes.SelectNodes("//span[@class='category-pop']/a");//"//div[@class='slideBannerA homeSlideAD1']"

            string[] strLink = new string[node.Count];
            string[] strText = new string[node.Count];

            try
            {
                int i = 0;
                foreach (HtmlNode htmlNode in node)
                {
                    strLink[i] = htmlNode.Attributes["href"].Value;
                    strText[i++] = htmlNode.InnerText;
                }
            }
            catch { }

 

 

// Taobao apparel channel (fushi.taobao.com): new-product recommendations.
            HtmlAgilityPack.HtmlNode nodes = GetHtmlNodeFromLink("http://fushi.taobao.com");

            // GetHtmlNodeFromLink returns null when the download fails, and SelectNodes
            // returns null (not an empty collection) when nothing matches.
            HtmlAgilityPack.HtmlNodeCollection node = nodes == null ? null : nodes.SelectNodes("//div[@class='new-product-image-list']/ul[@class='image-list']/li");
            int nodeCount = node == null ? 0 : node.Count;

            string[] strLink = new string[nodeCount];
            string[] strImg = new string[nodeCount];
            string[] strAlt = new string[nodeCount];

            if (node != null)
            {
                try
                {
                    int i = 0;
                    foreach (HtmlNode htmlNode in node)
                    {
                        // The item's first child carries the href; its first child
                        // carries the image src and its second child the caption markup.
                        HtmlNode anchor = htmlNode.ChildNodes[0];
                        strLink[i] = anchor.Attributes["href"].Value;
                        strAlt[i] = anchor.ChildNodes[1].InnerHtml;
                        strImg[i++] = anchor.ChildNodes[0].Attributes["src"].Value;
                    }
                }
                catch
                {
                    // Deliberate best effort: the first item with an unexpected shape
                    // stops the scan and leaves the remaining entries null.
                }
            }

 

 

// Knitwear recommendations on rihan.vancl.com (page is fetched as UTF-8).
            HtmlAgilityPack.HtmlNode nodes = GetHtmlNodeFromLink("http://rihan.vancl.com/","UTF-8");

            // A failed download may yield a null root (the single-argument overload
            // does; presumably this one does too), and SelectNodes returns null
            // (not an empty collection) when nothing matches.
            HtmlAgilityPack.HtmlNodeCollection node = nodes == null ? null : nodes.SelectNodes("//div[@class='prod_area']/ul/li");
            int nodeCount = node == null ? 0 : node.Count;

            string[] strLink = new string[nodeCount];
            string[] strImg = new string[nodeCount];
            string[] strAlt = new string[nodeCount];
            string[] strPrice = new string[nodeCount];
            string[] strCurrentPrice = new string[nodeCount];

            int i = 0;
            if (node != null)
            {
                foreach (HtmlNode htmlNode in node)
                {
                    try
                    {
                        strLink[i] = htmlNode.ChildNodes[0].Attributes["href"].Value;
                        strAlt[i] = htmlNode.ChildNodes[4].ChildNodes[1].InnerHtml.Trim();
                        strImg[i] = htmlNode.ChildNodes[0].ChildNodes[1].Attributes["src"].Value;
                        // NOTE(review): the original arguments of this Replace were lost
                        // (the source read Replace("""")); "¥" is assumed from the
                        // commented-out price lines elsewhere in the file - confirm.
                        strPrice[i] = htmlNode.ChildNodes[6].ChildNodes[1].ChildNodes[1].InnerHtml.Trim().Replace("¥", "");
                        strCurrentPrice[i++] = htmlNode.ChildNodes[6].ChildNodes[2].InnerHtml.Trim().Replace("售价¥", "");
                    }
                    catch
                    {
                        // Deliberate best effort: skip an item with an unexpected shape.
                        // i only advances on success, so the next item reuses this slot
                        // and the arrays end with null entries.
                    }
                }
            }

 

 

        private void button8_Click(object sender, EventArgs e)
        {
            //http://www.masamaso.com  商品列表
            HtmlAgilityPack.HtmlNode nodes = GetHtmlNodeFromLink("http://www.masamaso.com/""UTF-8");
            HtmlAgilityPack.HtmlNodeCollection node = nodes.SelectNodes("//ul/li/div[@class='goods_case']");//"//div[@class='slideBannerA homeSlideAD1']"

            string[] strLink = new string[node.Count];
            string[] strImg = new string[node.Count];
            string[] strAlt = new string[node.Count];
            string[] strPrice = new string[node.Count];
            string[] strCurrentPrice = new string[node.Count];

            int i = 0;
            foreach (HtmlNode htmlNode in node)
            {
                try
                {
                    strLink[i] = "http://www.masamaso.com/" + htmlNode.ChildNodes[1].ChildNodes[0].Attributes["href"].Value;
                    strAlt[i] = htmlNode.ChildNodes[1].ChildNodes[0].Attributes["title"].Value;
                    strImg[i] = htmlNode.ChildNodes[1].ChildNodes[0].ChildNodes[0].Attributes["src"].Value;
                    //strPrice[i] = htmlNode.ChildNodes[6].ChildNodes[1].ChildNodes[1].InnerHtml.Trim().Replace("¥", "");
                    strCurrentPrice[i++] = htmlNode.ChildNodes[3].ChildNodes[1].ChildNodes[1].ChildNodes[0].InnerHtml.Trim().Replace("&yen;""");
                }
                catch 
                { }
            }
        }

        private void button9_Click(object sender, EventArgs e)
        {
            //http://www.masamaso.com/  弹出广告
            HtmlAgilityPack.HtmlNode nodes = GetHtmlNodeFromLink("http://www.masamaso.com/""UTF-8");
            HtmlAgilityPack.HtmlNodeCollection node = nodes.SelectNodes("//div[@class='foot_img tabContainer']/div[@class='tabBox']/div[@class='hd_tp']");//"//div[@class='slideBannerA homeSlideAD1']"

            string[] strLink = new string[node.Count];
            string[] strImg = new string[node.Count];
            string[] strAlt = new string[node.Count];
            string[] strPrice = new string[node.Count];
            string[] strCurrentPrice = new string[node.Count];

            int i = 0;
            foreach (HtmlNode htmlNode in node)
            {
                try
                {
                    strLink[i] = htmlNode.ChildNodes[0].Attributes["href"].Value;
                    //strAlt[i] = htmlNode.ChildNodes[1].ChildNodes[0].Attributes["title"].Value;
                    strImg[i++] = htmlNode.ChildNodes[0].ChildNodes[0].Attributes["src"].Value;
                    //strPrice[i] = htmlNode.ChildNodes[6].ChildNodes[1].ChildNodes[1].InnerHtml.Trim().Replace("¥", "");
                    
//strCurrentPrice[i++] = htmlNode.ChildNodes[3].ChildNodes[1].ChildNodes[1].ChildNodes[0].InnerHtml.Trim().Replace("&yen;", "");
                }
                catch 
                { }
            }

            Func();
        }

        private void Func()
        {
            //http://www.vivian.cn/  弹出广告
            HtmlAgilityPack.HtmlNode nodes = GetHtmlNodeFromLink("http://www.vivian.cn/""UTF-8");
            HtmlAgilityPack.HtmlNodeCollection node = nodes.SelectNodes("//div[@class='foot_img tabContainer']/div[@class='tabBox']/div[@class='hd_tp']");//"//div[@class='slideBannerA homeSlideAD1']"

            string[] strLink = new string[node.Count];
            string[] strImg = new string[node.Count];
            string[] strAlt = new string[node.Count];
            string[] strPrice = new string[node.Count];
            string[] strCurrentPrice = new string[node.Count];

            int i = 0;
            foreach (HtmlNode htmlNode in node)
            {
                try
                {
                    strLink[i] = htmlNode.ChildNodes[0].Attributes["href"].Value;
                    //strAlt[i] = htmlNode.ChildNodes[1].ChildNodes[0].Attributes["title"].Value;
                    strImg[i++] = htmlNode.ChildNodes[0].ChildNodes[0].Attributes["src"].Value;
                    //strPrice[i] = htmlNode.ChildNodes[6].ChildNodes[1].ChildNodes[1].InnerHtml.Trim().Replace("¥", "");
                    
//strCurrentPrice[i++] = htmlNode.ChildNodes[3].ChildNodes[1].ChildNodes[1].ChildNodes[0].InnerHtml.Trim().Replace("&yen;", "");
                }
                catch 
                { }
            }

        }

        private void button10_Click(object sender, EventArgs e)
        {
            //http://www.vivian.cn/"   产品列表
             HtmlAgilityPack.HtmlNode nodes = GetHtmlNodeFromLink("http://www.vivian.cn/""UTF-8");
             HtmlAgilityPack.HtmlNodeCollection node = nodes.SelectNodes("//div[@class='goods_list']/ul/li");//"//div[@class='slideBannerA homeSlideAD1']"

            string[] strLink = new string[node.Count];
            string[] strImg = new string[node.Count];
            string[] strAlt = new string[node.Count];
            string[] strPrice = new string[node.Count];
            string[] strCurrentPrice = new string[node.Count];

            int i = 0;
            foreach (HtmlNode htmlNode in node)
            {
                try
                {
                    strLink[i] = "http://www.vivian.cn/" + htmlNode.ChildNodes[1].ChildNodes[1].ChildNodes[0].Attributes["href"].Value;
                    strAlt[i] = htmlNode.ChildNodes[1].ChildNodes[1].ChildNodes[0].Attributes["title"].Value;
                    strImg[i] = htmlNode.ChildNodes[1].ChildNodes[1].ChildNodes[0].ChildNodes[0].Attributes["src"].Value;
                    //strPrice[i] = htmlNode.ChildNodes[6].ChildNodes[1].ChildNodes[1].InnerHtml.Trim().Replace("¥", "");
                    strCurrentPrice[i++] = htmlNode.ChildNodes[1].ChildNodes[3].ChildNodes[3].ChildNodes[0].ChildNodes[1].InnerHtml.Trim().Replace("&yen;""");
                }
                catch 
                { }
            }
        }

 

 

 

 

 

 

posted @ 2011-10-27 16:49  Care健康  阅读(3802)  评论(0)  编辑  收藏  举报
版权
作者:Bober Song

出处:http://bober.cnblogs.com

Care健康:http://www.aicareyou.com

推荐空间:华夏名网

本文首发博客园,版权归作者跟博客园共有。

转载必须保留本段声明,并在页面显著位置给出本文链接,否则保留追究法律责任的权利。