使用Xpath从网页中获取数据

 /// <summary>
        /// Scrapes product information from the official website and stores it in the local database.
        /// </summary>
        /// <remarks>
        /// For every "list-product" element on the product listing page, the product code is read and
        /// looked up through <c>ProductMessage.GetProductList</c>; items for which that lookup returns
        /// nothing are ignored. For the remaining items the detail page is downloaded, and its
        /// description, departure point, highlights, equipment and image URLs are written through the
        /// <c>ProductMessage</c> persistence methods. Persistence failures are logged via
        /// <c>System.Diagnostics.Trace</c> and do not stop the scrape. Items whose markup lacks a
        /// required element are logged and skipped.
        /// </remarks>
        /// <returns>
        /// The products that were processed; an empty list when the listing page could not be
        /// downloaded or contains no product elements.
        /// </returns>
        public List<ProductMessage> GetlistProductMessage()
        {
            const string siteRoot = "http://www.grandcanyononepoint.com";

            List<ProductMessage> products = new List<ProductMessage>();

            string html = GetProductsDescriptionsImage(siteRoot + "/products");
            if (string.IsNullOrEmpty(html))
            {
                return products;
            }

            HtmlDocument document = new HtmlDocument();
            document.LoadHtml(html);

            // XPath "//*[@class='list-product']" selects every element whose class attribute
            // is exactly "list-product".
            HtmlNodeCollection rootNodeList = document.DocumentNode.SelectNodes("//*[@class='list-product']");
            if (rootNodeList == null)
            {
                // SelectNodes yields null rather than an empty collection when nothing matches.
                return products;
            }

            foreach (HtmlNode node in rootNodeList)
            {
                // Re-parse the item's markup as its own document so that the document-rooted
                // "//*" queries below only see this item, not the whole listing page.
                HtmlDocument itemDocument = new HtmlDocument();
                itemDocument.LoadHtml(node.InnerHtml);
                HtmlNode itemRoot = itemDocument.DocumentNode;

                HtmlNode codeNode = itemRoot.SelectSingleNode("//*[@style='float:right;']");
                if (codeNode == null)
                {
                    System.Diagnostics.Trace.TraceWarning("GetlistProductMessage: list item without a product code element; skipped.");
                    continue;
                }

                ProductMessage db_product = new ProductMessage();
                db_product.Code = Formsub(codeNode.InnerText);

                // Only codes that GetProductList returns rows for are processed
                // (presumably products already present in the local database - confirm).
                List<ProductMessage> knownProducts = ProductMessage.GetProductList(db_product.Code, "");
                if (knownProducts == null || knownProducts.Count == 0)
                {
                    continue;
                }

                HtmlNode nameNode = itemRoot.SelectSingleNode("//*[@style='float:left;']");
                HtmlNode linkNode = itemRoot.SelectSingleNode("a");
                if (nameNode == null || linkNode == null || linkNode.Attributes["href"] == null)
                {
                    System.Diagnostics.Trace.TraceWarning(
                        "GetlistProductMessage: product '{0}' has no name element or detail link; skipped.",
                        db_product.Code);
                    continue;
                }

                db_product.Name = Formsub(nameNode.InnerText);
                // The product ID is derived from the href attribute of the item's <a> element.
                db_product.ID = GetProductID(linkNode.Attributes["href"].Value);

                string detailHtml = GetProductsDescriptionsImage(siteRoot + "/products/view/" + db_product.ID);
                if (string.IsNullOrEmpty(detailHtml))
                {
                    System.Diagnostics.Trace.TraceWarning(
                        "GetlistProductMessage: detail page for product '{0}' could not be loaded; skipped.",
                        db_product.Code);
                    continue;
                }

                HtmlDocument detailDocument = new HtmlDocument();
                detailDocument.LoadHtml(detailHtml);
                HtmlNode detailRoot = detailDocument.DocumentNode;

                HtmlNode descriptionNode = detailRoot.SelectSingleNode("//*[@class='product-desc']");
                if (descriptionNode == null)
                {
                    System.Diagnostics.Trace.TraceWarning(
                        "GetlistProductMessage: detail page for product '{0}' has no description; skipped.",
                        db_product.Code);
                    continue;
                }

                // NOTE(review): single quotes are stripped from the stored HTML, presumably because
                // the persistence layer builds SQL by string concatenation - confirm, and prefer
                // parameterized commands there.
                db_product.Descmation = Formsub(descriptionNode.InnerHtml).Replace("'", "");

                HtmlNode departingNode = detailRoot.SelectSingleNode("//*[@class='details-tile']");
                if (departingNode != null)
                {
                    db_product.DepartingFrom = Formsub(departingNode.InnerHtml.Replace("Departing From", ""));
                }

                HtmlNode highlightsNode = detailRoot.SelectSingleNode("//*[@class='details-tile details-list']");
                if (highlightsNode != null)
                {
                    db_product.ProductHighlights = Formsub(highlightsNode.InnerHtml.Replace("Product Highlights", "")).Replace("'", "");
                }

                // Persist the textual details. Best-effort: a failure is logged, not rethrown.
                try
                {
                    ProductMessage.UpdateWEBProductMessage(db_product.Descmation, db_product.DepartingFrom, db_product.ProductHighlights, db_product.Name, db_product.Code);
                }
                catch (System.Exception ex)
                {
                    System.Diagnostics.Trace.TraceWarning(
                        "GetlistProductMessage: failed to update product '{0}': {1}",
                        db_product.Code, ex);
                }

                // Equipment: one entry per child <div> of the "product-equip" element,
                // named by the div's title attribute.
                HtmlNode equipmentContainer = detailRoot.SelectSingleNode("//*[@class='product-equip']");
                if (equipmentContainer != null)
                {
                    HtmlDocument equipmentDocument = new HtmlDocument();
                    equipmentDocument.LoadHtml(equipmentContainer.InnerHtml);
                    HtmlNodeCollection equipmentNodes = equipmentDocument.DocumentNode.SelectNodes("div");

                    List<EquipmentModel> equipmentList = new List<EquipmentModel>();
                    if (equipmentNodes != null)
                    {
                        foreach (HtmlNode equipment in equipmentNodes)
                        {
                            if (equipment.Attributes["title"] == null)
                            {
                                // Without a title there is neither a name nor an image file name.
                                continue;
                            }

                            string title = equipment.Attributes["title"].Value;
                            EquipmentModel equipmentModel = new EquipmentModel();
                            equipmentModel.Name = title;
                            equipmentModel.ImageUrl = "/Papillon/EquipmentImage/" + title + ".png";

                            try
                            {
                                ProductMessage.InsertProductEquipment(db_product.ID, equipmentModel.Name, equipmentModel.ImageUrl);
                            }
                            catch (System.Exception ex)
                            {
                                System.Diagnostics.Trace.TraceWarning(
                                    "GetlistProductMessage: failed to insert equipment '{0}' for product '{1}': {2}",
                                    equipmentModel.Name, db_product.Code, ex);
                            }

                            // The entry is returned to the caller even when the insert failed.
                            equipmentList.Add(equipmentModel);
                        }
                    }
                    db_product.Equipment = equipmentList;
                }

                // Images: every element titled "See full size image" wraps an <img> whose src is
                // site-relative. The URLs are only persisted; they are not attached to the
                // returned ProductMessage.
                HtmlNodeCollection imageNodes = detailRoot.SelectNodes("//*[@title='See full size image']");
                if (imageNodes != null)
                {
                    foreach (HtmlNode imageNode in imageNodes)
                    {
                        HtmlDocument imageDocument = new HtmlDocument();
                        imageDocument.LoadHtml(imageNode.InnerHtml);
                        HtmlNode imgNode = imageDocument.DocumentNode.SelectSingleNode("img");
                        if (imgNode == null || imgNode.Attributes["src"] == null)
                        {
                            continue;
                        }

                        // Absolute URL of the original image.
                        string imageUrl = siteRoot + imgNode.Attributes["src"].Value;
                        try
                        {
                            ProductMessage.InsertProductImage(db_product.ID, imageUrl);
                        }
                        catch (System.Exception ex)
                        {
                            System.Diagnostics.Trace.TraceWarning(
                                "GetlistProductMessage: failed to insert image '{0}' for product '{1}': {2}",
                                imageUrl, db_product.Code, ex);
                        }
                    }
                }

                products.Add(db_product);
            }

            return products;
        }
View Code

 

 

XPath 将 HTML 当作类似 XML 的树形结构来查询,主要通过节点的不同标识(标签名、属性等)定位并提取不同的内容,从而从网页中获取想要的数据。它只负责在已获取的页面中解析和定位数据,与负责抓取页面的网页爬虫不同。

posted @ 2016-07-29 16:59  ly77461  阅读(2434)  评论(0编辑  收藏  举报