C# 抓取并解析网页,并将 table 数据保存到 DataTable 中(也可以保存为其他格式,自行修改即可)
使用HtmlAgilityPack 基础请参考这篇博客:https://www.cnblogs.com/fishyues/p/10232822.html
下面的代码根据抓取到的页面字符串(string)解析其中的 table,并将数据保存到 DataTable 中:
/// <summary>
/// Parses an HTML string, locates a table element by XPath and copies its rows
/// into a <see cref="DataTable"/>. The first <c>tr</c> supplies the column names;
/// every following <c>tr</c> becomes one data row. All values are stored as strings
/// taken from each cell's <c>InnerText</c>.
/// </summary>
/// <param name="HtmlString">The HTML page content as a string.</param>
/// <param name="XmlPath">
/// XPath of the table element inside the page, e.g.
/// <c>/html/body/div[3]/div[3]/div[1]/table</c>.
/// </param>
/// <returns>
/// The populated table, or <c>null</c> when the XPath matches nothing, the table has
/// no rows, the first row has no cells, or parsing fails with an exception.
/// </returns>
public static DataTable ParsingWeb(string HtmlString, string XmlPath)
{
    try
    {
        var doc = new HtmlDocument();
        doc.LoadHtml(HtmlString);

        var tableNode = doc.DocumentNode.SelectSingleNode(XmlPath);
        if (tableNode == null)
        {
            return null;
        }

        // SelectNodes returns null (not an empty collection) when nothing matches.
        var rows = tableNode.SelectNodes(".//tr");
        if (rows == null)
        {
            return null;
        }

        var htTable = new DataTable();
        for (int rowIndex = 0; rowIndex < rows.Count; rowIndex++)
        {
            // Some tables write their header cells as <td> instead of <th>.
            var cells = rows[rowIndex].SelectNodes("th|td");

            if (rowIndex == 0)
            {
                // Without header cells there is no schema to build the table from.
                if (cells == null)
                {
                    return null;
                }

                foreach (HtmlNode cell in cells)
                {
                    // Repeated header texts would otherwise throw DuplicateNameException.
                    htTable.Columns.Add(GetUniqueColumnName(htTable, cell.InnerText), typeof(string));
                }

                continue;
            }

            // A row without any cells carries no data; skip it instead of failing.
            if (cells == null)
            {
                continue;
            }

            // A row wider than the header gets extra generated columns so no cell is lost.
            while (htTable.Columns.Count < cells.Count)
            {
                htTable.Columns.Add(GetUniqueColumnName(htTable, string.Empty), typeof(string));
            }

            DataRow dataRow = htTable.NewRow();
            for (int cellIndex = 0; cellIndex < cells.Count; cellIndex++)
            {
                dataRow[cellIndex] = cells[cellIndex].InnerText;
            }

            htTable.Rows.Add(dataRow);
        }

        return htTable;
    }
    catch (Exception)
    {
        // Best-effort contract kept for existing callers: any failure
        // (null input, invalid XPath, ...) is reported as a null result.
        return null;
    }
}

/// <summary>
/// Returns a column name that does not yet exist in <paramref name="table"/>, derived
/// from <paramref name="rawName"/>; an empty name falls back to "ColumnN".
/// </summary>
private static string GetUniqueColumnName(DataTable table, string rawName)
{
    string baseName = string.IsNullOrEmpty(rawName)
        ? "Column" + (table.Columns.Count + 1)
        : rawName;

    string candidate = baseName;
    int suffix = 2;
    while (table.Columns.Contains(candidate))
    {
        candidate = baseName + "_" + suffix;
        suffix++;
    }

    return candidate;
}