抓取网页(table)信息(导出到数据库中、excel中)
1、存储数据到数据库中:
1)获取网页中table中的列名,作为数据库表的列名(@"<th scope=""col"">";和"</th>";是table中的包括列名的标签)
/// <summary> /// 获取列名 /// </summary> /// <param name="sbHtmltext"></param> public void GetColomnNumAndName(string sbHtmltext) { int i = 0; for (i = 0; ; i++) { string ColomnName = ""; string StartColoumnName = @"<th scope=""col"">"; string EndColoumnName = "</th>"; int StartColoumnNameIndex = sbHtmltext.IndexOf(StartColoumnName); if (StartColoumnNameIndex > -1) { int EndColoumnNameIndex = sbHtmltext.IndexOf(EndColoumnName); int longs = EndColoumnNameIndex - StartColoumnNameIndex - StartColoumnName.Length; ColomnName = sbHtmltext.Substring(StartColoumnNameIndex + StartColoumnName.Length, longs); dt.Columns.Add(ColomnName.ToString().Trim()); sbHtmltext = sbHtmltext.Remove(0, (EndColoumnNameIndex + EndColoumnName.Length)); } else { break; } } }
2)获取table中的数据
/// <summary> /// 获取数据 /// </summary> /// <param name="sbHtmltext"></param> public void GetValue(string sbHtmltext) { //去掉页面头 string StartColoumnName = @"<table cellspacing=""0"" rules=""all"" border=""1"" id=""ctl00_ContentPlaceHolder1_gvChanpin"" style=""border-collapse:collapse;"">"; string EndColoumnName = @"<td colspan=""14""><table border=""0"">"; string StartValue = "<td>"; string EndValue = "</td>"; int StartColoumnNameIndex = sbHtmltext.IndexOf(StartColoumnName); int EndColoumnNameIndex = sbHtmltext.IndexOf(EndColoumnName); int longs = EndColoumnNameIndex - StartColoumnNameIndex - StartColoumnName.Length; string sbHtmltextChild = sbHtmltext.Substring(StartColoumnNameIndex + StartColoumnName.Length, longs);//删除不包含数据的上下元素 for (int i = 0; ; i++) { string Value = ""; int StartChild = sbHtmltextChild.IndexOf(StartValue); if (StartChild > -1) { DataRow dr = dt.NewRow(); for (int j = 0; ; j++) { int StartValueIndex = sbHtmltextChild.IndexOf(StartValue); if (StartValueIndex > -1 && StartValueIndex != 18) { int EndValueIndex = sbHtmltextChild.IndexOf(EndValue); int longsChild = EndValueIndex - StartValueIndex - StartValue.Length; Value = sbHtmltextChild.Substring(StartValueIndex + StartValue.Length, longsChild).Replace("\r\n", "").Replace("\t", "").Replace("\"", """).Trim(); dr[j] = Value.ToString().Trim(); sbHtmltextChild = sbHtmltextChild.Remove(0, (EndValueIndex + EndValue.Length)); } else { break; } } dt.Rows.Add(dr); int removelen = sbHtmltextChild.IndexOf(StartValue); if (removelen > -1) { sbHtmltextChild = sbHtmltextChild.Remove(0, removelen); } } else { break; } } }
3)button按钮是触发事件,调用方法:
try { if (txttablename.Text== "") { MessageBox.Show("请输入表名!"); return; } string sbHtmltext = WBdata.DocumentText;//获取所有页面元素 GetColomnNumAndName(sbHtmltext);//获取列名 GetValue(sbHtmltext);//获取数据 if (dt != null && dt.Rows.Count > 0) { string descTableName = txttablename.Text.Trim(); string str = bll.SourceExcel_Import_Web(descTableName, dt); dt.Reset();//重置数据容器DataTable MessageBox.Show(str); // 定位控件webBrowser滚动条的位置 HtmlDocument document = this.WBdata.Document; document.Window.ScrollTo(750, 1200); } else { MessageBox.Show("未采集到数据!"); } } catch (Exception ex) { MessageBox.Show(ex.Message); }
4)DAL层功能的实现
/// 导入数据 /// <summary> /// </summary> /// <param name="descTableName"></param> /// <param name="dtSource"></param> /// <returns></returns> public string SourceExcel_Import_Web(string descTableName, DataTable dtSource) { string succ = ""; try { string column = ""; for (int i = 0; i < dtSource.Columns.Count; i++) { if (i == 0) { column += dtSource.Columns[i].ColumnName + " varchar(max)"; } else { column += "," + dtSource.Columns[i].ColumnName + " varchar(max)"; } } string strsql = "if not exists(select * from sys.objects where name ='" + descTableName + "')begin create table " + descTableName + "(" + column + ")end "; HZ.Data.DbHelperFactory.Create(connectWeb).ExecuteNonQuery(strsql); //HZ.Data.DbHelperFactory.Create(connectWeb).ExecuteNonQuery(strsql)是封装好的DBHelper
SqlBulkCopyOptions sqlBulkCopyOptions; sqlBulkCopyOptions = SqlBulkCopyOptions.FireTriggers; SqlBulkCopy DTS = new System.Data.SqlClient.SqlBulkCopy(connectWeb, sqlBulkCopyOptions); DTS.NotifyAfter = 1; DTS.DestinationTableName = descTableName; DTS.BulkCopyTimeout = 60000000; DTS.WriteToServer(dtSource); succ = "保存成功!" + dtSource.Rows.Count + "条数据"; } catch (Exception ex) { succ = ex.Message; } return succ; } }
总结:应该先在数据库中手动的创建一个数据库,表名是手动输入的。。
2、导出excel
1)导出excel的方法。(common.file)
private static Form _openerForm; /// <summary> /// 所属窗体 /// </summary> public static Form OpenerForm { set { _openerForm = value; } get { return _openerForm; } } /// <summary> /// 导出到excel /// </summary> /// <param name="dataTable"></param> /// <returns></returns> public static bool ExportDataTableToExcel(System.Data.DataTable dataTable) { SaveFileDialog saveFileDialog = new SaveFileDialog(); saveFileDialog.Filter = "Execl files (*.xls)|*.xls"; saveFileDialog.FilterIndex = 0; saveFileDialog.RestoreDirectory = true; saveFileDialog.CreatePrompt = false; saveFileDialog.Title = "导出Excel文件到"; if (saveFileDialog.ShowDialog() == DialogResult.Cancel) { return false; } excel9.Application oXL; excel9._Workbook oWB; excel9._Worksheet oSheet; excel9.Range oRng; try { oXL = new excel9.Application(); oXL.Visible = false; oWB = (excel9._Workbook)(oXL.Workbooks.Add(Missing.Value)); oSheet = (excel9._Worksheet)oWB.ActiveSheet; int _RowCount = dataTable.Rows.Count; int _ColumnCount = dataTable.Columns.Count; for (int i = 0; i < _ColumnCount; i++) { oSheet.Cells[1, i + 1] = dataTable.Columns[i].Caption; } for (int j = 0; j < _RowCount; j++) { for (int k = 0; k < _ColumnCount; k++) { oSheet.Cells[j + 2, k + 1] = dataTable.Rows[j].ItemArray[k].ToString(); } } string ExcelHeader = GetExcelHeader(_ColumnCount); oWB.SaveAs ( saveFileDialog.FileName, excel9.XlFileFormat.xlExcel7, Type.Missing, Type.Missing, Type.Missing, Type.Missing, excel9.XlSaveAsAccessMode.xlExclusive, Type.Missing, Type.Missing, Type.Missing, Type.Missing ); oWB.Close(null, null, null); oXL.Workbooks.Close(); oXL.Quit(); return true; } catch (Exception e) { MessageBox.Show(e.ToString()); return false; } finally { } }
private static string GetExcelHeader(int number) { string[] Header = new string[] { " ", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "W", "Z" }; if (number <= 26) { return Header[number]; } else { return GetExcelHeader(number / 26) + Header[number % 26]; } }
2)button按钮下的方法
try { string sbHtmltext = WBdata.DocumentText;//获取所有页面元素 GetColomnNumAndName(sbHtmltext);//获取列名 GetValue(sbHtmltext);//获取数据 //判断导出是否成功 if (common.File.ExportDataTableToExcel(dt)) { MessageBox.Show("导出成功!"); dt.Reset(); // 定位控件webBrowser滚动条的位置 HtmlDocument document = this.WBdata.Document; document.Window.ScrollTo(750, 1000); } else { MessageBox.Show("导出数据失败!"); } } catch(Exception ex) { MessageBox.Show(ex.Message); }