使用 XPath 从网页中获取数据
/// <summary>
/// Scrapes product information from the official website and stores it in the local database.
/// </summary>
/// <remarks>
/// The listing page is downloaded with <c>GetProductsDescriptionsImage</c> and every element whose
/// class is "list-product" is treated as one product entry. Only entries whose code is already
/// known to <c>ProductMessage.GetProductList</c> are processed further: their detail page is
/// downloaded, the description / departure / highlights / equipment / image data are extracted
/// with XPath, and the results are handed to <c>ProductMessage.UpdateWEBProductMessage</c>,
/// <c>ProductMessage.InsertProductEquipment</c> and <c>ProductMessage.InsertProductImage</c>.
/// Persistence failures are traced and do not abort the scrape. Entries whose markup lacks a
/// required node (code, name, link, description) are skipped instead of raising a
/// <c>NullReferenceException</c>.
/// </remarks>
/// <returns>
/// The products that were matched and scraped; an empty list when the listing page is empty or
/// contains no "list-product" elements.
/// </returns>
public List<ProductMessage> GetlistProductMessage()
{
    const string siteRoot = "http://www.grandcanyononepoint.com";

    List<ProductMessage> products = new List<ProductMessage>();

    string html = GetProductsDescriptionsImage(siteRoot + "/products");
    if (string.IsNullOrEmpty(html))
    {
        return products;
    }

    HtmlDocument document = new HtmlDocument();
    document.LoadHtml(html);

    // "//*[@class='list-product']" is an XPath expression that matches every element whose
    // class attribute is exactly "list-product". SelectNodes yields null, not an empty
    // collection, when nothing matches.
    HtmlNodeCollection rootNodeList = document.DocumentNode.SelectNodes("//*[@class='list-product']");
    if (rootNodeList == null)
    {
        return products;
    }

    foreach (HtmlNode node in rootNodeList)
    {
        // Re-parse the entry's markup as its own document so that the "//" queries below
        // are scoped to this entry rather than the whole listing page.
        HtmlDocument entryDocument = new HtmlDocument();
        entryDocument.LoadHtml(node.InnerHtml);
        HtmlNode entry = entryDocument.DocumentNode;

        HtmlNode codeNode = entry.SelectSingleNode("//*[@style='float:right;']");
        if (codeNode == null)
        {
            continue;
        }

        ProductMessage db_product = new ProductMessage();
        db_product.Code = Formsub(codeNode.InnerText);

        // Only products that already exist locally are refreshed from the website.
        List<ProductMessage> knownProducts = ProductMessage.GetProductList(db_product.Code, "");
        if (knownProducts == null || knownProducts.Count == 0)
        {
            continue;
        }

        HtmlNode nameNode = entry.SelectSingleNode("//*[@style='float:left;']");
        HtmlNode linkNode = entry.SelectSingleNode("a");
        if (nameNode == null || linkNode == null || linkNode.Attributes["href"] == null)
        {
            System.Diagnostics.Trace.TraceWarning(
                "GetlistProductMessage: skipping product '" + db_product.Code + "': name or link node not found.");
            continue;
        }

        db_product.Name = Formsub(nameNode.InnerText);
        // The product id is derived from the href attribute of the entry's <a> element.
        db_product.ID = GetProductID(linkNode.Attributes["href"].Value);

        string detailHtml = GetProductsDescriptionsImage(siteRoot + "/products/view/" + db_product.ID);
        if (string.IsNullOrEmpty(detailHtml))
        {
            System.Diagnostics.Trace.TraceWarning(
                "GetlistProductMessage: skipping product '" + db_product.Code + "': detail page is empty.");
            continue;
        }

        HtmlDocument detailDocument = new HtmlDocument();
        detailDocument.LoadHtml(detailHtml);
        HtmlNode detail = detailDocument.DocumentNode;

        HtmlNode descriptionNode = detail.SelectSingleNode("//*[@class='product-desc']");
        if (descriptionNode == null)
        {
            System.Diagnostics.Trace.TraceWarning(
                "GetlistProductMessage: skipping product '" + db_product.Code + "': description node not found.");
            continue;
        }

        // NOTE(review): apostrophes are stripped here and for the highlights below; this looks
        // like a workaround for string-built SQL in the persistence layer - confirm.
        db_product.Descmation = Formsub(descriptionNode.InnerHtml).Replace("'", "");

        HtmlNode departingNode = detail.SelectSingleNode("//*[@class='details-tile']");
        if (departingNode != null)
        {
            db_product.DepartingFrom = Formsub(departingNode.InnerHtml.Replace("Departing From", ""));
        }

        HtmlNode highlightsNode = detail.SelectSingleNode("//*[@class='details-tile details-list']");
        if (highlightsNode != null)
        {
            db_product.ProductHighlights =
                Formsub(highlightsNode.InnerHtml.Replace("Product Highlights", "")).Replace("'", "");
        }

        #region Persist the textual product details
        try
        {
            ProductMessage.UpdateWEBProductMessage(
                db_product.Descmation,
                db_product.DepartingFrom,
                db_product.ProductHighlights,
                db_product.Name,
                db_product.Code);
        }
        catch (System.Exception ex)
        {
            // Best effort: one failed update must not stop the remaining products.
            System.Diagnostics.Trace.TraceError(
                "GetlistProductMessage: failed to update product '" + db_product.Code + "': " + ex);
        }
        #endregion

        #region Equipment
        HtmlNode equipmentSection = detail.SelectSingleNode("//*[@class='product-equip']");
        if (equipmentSection != null)
        {
            HtmlDocument equipmentDocument = new HtmlDocument();
            equipmentDocument.LoadHtml(equipmentSection.InnerHtml);

            // Top-level <div> elements of the section; null when there are none.
            HtmlNodeCollection equipmentNodes = equipmentDocument.DocumentNode.SelectNodes("div");
            List<EquipmentModel> equipmentList = new List<EquipmentModel>();

            if (equipmentNodes != null)
            {
                foreach (HtmlNode equipment in equipmentNodes)
                {
                    if (equipment.Attributes["title"] == null)
                    {
                        continue;
                    }

                    string title = equipment.Attributes["title"].Value;

                    EquipmentModel equipmentModel = new EquipmentModel();
                    equipmentModel.Name = title;
                    equipmentModel.ImageUrl = "/Papillon/EquipmentImage/" + title + ".png";

                    try
                    {
                        ProductMessage.InsertProductEquipment(
                            db_product.ID, equipmentModel.Name, equipmentModel.ImageUrl);
                    }
                    catch (System.Exception ex)
                    {
                        System.Diagnostics.Trace.TraceError(
                            "GetlistProductMessage: failed to insert equipment '" + title
                            + "' for product '" + db_product.Code + "': " + ex);
                    }

                    equipmentList.Add(equipmentModel);
                }
            }

            db_product.Equipment = equipmentList;
        }
        #endregion

        #region Images
        HtmlNodeCollection imageLinks = detail.SelectNodes("//*[@title='See full size image']");
        if (imageLinks != null)
        {
            foreach (HtmlNode imageLink in imageLinks)
            {
                HtmlDocument imageDocument = new HtmlDocument();
                imageDocument.LoadHtml(imageLink.InnerHtml);

                HtmlNode img = imageDocument.DocumentNode.SelectSingleNode("img");
                if (img == null || img.Attributes["src"] == null)
                {
                    continue;
                }

                // Address of the original image: site root followed by the img element's src value.
                string imageUrl = siteRoot + img.Attributes["src"].Value;

                try
                {
                    ProductMessage.InsertProductImage(db_product.ID, imageUrl);
                }
                catch (System.Exception ex)
                {
                    System.Diagnostics.Trace.TraceError(
                        "GetlistProductMessage: failed to insert image '" + imageUrl
                        + "' for product '" + db_product.Code + "': " + ex);
                }
            }
        }
        #endregion

        products.Add(db_product);
    }

    return products;
}
XPath 把 HTML 当作类似 XML 的树形结构来查询,主要通过节点的不同标识(标签名、属性等)定位并获取不同的内容,从而从网页中提取想要的数据;这与通用的网页爬虫不同。