C#将HTML表格<TABLE>转换成DataTable

1. 最近在写爬虫的时候,有的数据是用 HTML 的 <table> 披露的,而且表格里会包含 rowspan 和 colspan(合并单元格)。

下图是个简单的例子:

 

对应的HTML代码如下:

<table border="1">
<!-- Header row: six single-cell columns. -->
<tr>
  <td>Column1</td>
  <td>Column2</td>
  <td>Column3</td>
  <td>Column4</td>
  <td>Column5</td>
  <td>Column6</td>
</tr>
<tr>
  <!-- "1" spans this row and the two rows below it. -->
  <td rowspan=3>1</td>
  <td>2</td>
  <td>3</td>
  <td>4</td>
  <td>5</td>
  <td>6</td>
</tr>
<tr>
  <td>7</td>
  <!-- "8" covers a block of 2 rows by 3 columns. -->
  <td rowspan=2 colspan=3>8</td>
  <td>9</td>
</tr>
<!-- Last row: only two cells are written; the remaining slots are taken by the spans above. -->
<tr>
  <td>10</td>
  <td>11</td>
</tr>
</table>
HTML Code

2. 我们需要的数据应该是下面这个样子的(把合并的单元格展开,使每一行都有相同的列数),这样才比较方便处理:

3.那么如何转换呢,这里我们需要引用HtmlAgilityPack.dll

代码如下:

using System;
using System.Collections.Generic;
using System.Data;
using System.Globalization;
using System.Linq;
using HtmlAgilityPack;

namespace HtmlToDataTable
{
    static class Program
    {
        // Upper bounds for span attributes, matching the limits in the HTML table model.
        // They also keep a hostile or broken attribute value from allocating a huge grid.
        private const int MaxColSpan = 1000;
        private const int MaxRowSpan = 65534;

        /// <summary>
        /// Application entry point: converts a sample HTML table that uses
        /// rowspan/colspan into a <see cref="DataTable"/>.
        /// </summary>
        [STAThread]
        static void Main()
        {
            const string hrml = "<table border=\"1\"><tr><td>Column1</td><td>Column2</td><td>Column3</td><td>Column4</td><td>Column5</td><td>Column6</td></tr><tr><td rowspan=3>1</td><td>2</td><td></td><td>4</td><td>5</td><td>6</td></tr><tr><td></td><td rowspan=2 colspan=3>7</td><td>9</td></tr><tr><td></td><td>8</td></tr></table>";
            var dt = HtmlToDataTable(hrml);
        }

        /// <summary>
        /// Converts the first &lt;table&gt; found in an HTML fragment into a <see cref="DataTable"/>.
        /// Cells with rowspan/colspan are expanded so that every covered grid slot holds the
        /// cell's text. The first row supplies the column names; the remaining rows become data rows.
        /// </summary>
        /// <param name="hrml">HTML markup containing the table. May be null or empty.</param>
        /// <returns>
        /// The populated table, or an empty <see cref="DataTable"/> when the input is null/empty
        /// or contains no table, rows or cells.
        /// </returns>
        /// <remarks>
        /// Empty header cells are named "Column" + zero-based index. Repeated header names
        /// (for example a header cell with colspan greater than 1) get a numeric suffix, because
        /// <see cref="DataTable"/> rejects duplicate column names. Cell text is entity-decoded and trimmed.
        /// </remarks>
        public static DataTable HtmlToDataTable(string hrml)
        {
            var dt = new DataTable();
            if (string.IsNullOrEmpty(hrml))
            {
                return dt;
            }

            var doc = new HtmlDocument();
            doc.LoadHtml(hrml);
            var table = doc.DocumentNode.SelectSingleNode("//table");
            if (table == null)
            {
                return dt;
            }

            // ".//tr" is evaluated relative to this table; a bare "//tr" would search the whole document.
            var candidates = table.SelectNodes(".//tr");
            if (candidates == null)
            {
                return dt;
            }

            // Drop rows that belong to tables nested inside a cell of this table.
            var rows = candidates.Where(tr => IsOwnedBy(tr, table)).ToList();
            if (rows.Count == 0)
            {
                return dt;
            }

            var grid = BuildGrid(rows);
            var width = grid.Max(line => line.Count);
            if (width == 0)
            {
                return dt;
            }

            // DataTable compares column names case-insensitively, so track used names the same way.
            var usedNames = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
            for (var c = 0; c < width; c++)
            {
                var header = CellAt(grid[0], c);
                var name = string.IsNullOrEmpty(header) ? "Column" + c : header;
                dt.Columns.Add(MakeUnique(name, usedNames));
            }

            for (var r = 1; r < grid.Length; r++)
            {
                var dataRow = dt.NewRow();
                for (var c = 0; c < width; c++)
                {
                    dataRow[c] = CellAt(grid[r], c) ?? string.Empty;
                }
                dt.Rows.Add(dataRow);
            }

            return dt;
        }

        /// <summary>
        /// Lays the cells of <paramref name="rows"/> out on a rectangular grid, honouring
        /// rowspan and colspan. A null slot means "no cell covers this position".
        /// </summary>
        private static List<string>[] BuildGrid(IList<HtmlNode> rows)
        {
            var grid = new List<string>[rows.Count];
            for (var r = 0; r < grid.Length; r++)
            {
                grid[r] = new List<string>();
            }

            for (var r = 0; r < rows.Count; r++)
            {
                var col = 0;
                foreach (var cell in rows[r].ChildNodes.Where(IsCell))
                {
                    // Skip slots already claimed by a rowspan coming from an earlier row.
                    while (col < grid[r].Count && grid[r][col] != null)
                    {
                        col++;
                    }

                    var colspan = ReadSpan(cell, "colspan", MaxColSpan);
                    var rowspan = ReadSpan(cell, "rowspan", MaxRowSpan);
                    var text = ReadText(cell);

                    // Clip the span to the rows that actually exist.
                    var endRow = Math.Min(r + rowspan, rows.Count);
                    for (var y = r; y < endRow; y++)
                    {
                        for (var x = col; x < col + colspan; x++)
                        {
                            while (grid[y].Count <= x)
                            {
                                grid[y].Add(null);
                            }

                            // On overlapping spans (malformed markup) the first cell wins.
                            if (grid[y][x] == null)
                            {
                                grid[y][x] = text;
                            }
                        }
                    }

                    col += colspan;
                }
            }

            return grid;
        }

        /// <summary>Returns true when <paramref name="node"/> is a &lt;td&gt; or &lt;th&gt; element.</summary>
        private static bool IsCell(HtmlNode node)
        {
            return string.Equals(node.Name, "td", StringComparison.OrdinalIgnoreCase)
                || string.Equals(node.Name, "th", StringComparison.OrdinalIgnoreCase);
        }

        /// <summary>Returns true when the nearest enclosing &lt;table&gt; of <paramref name="row"/> is <paramref name="table"/>.</summary>
        private static bool IsOwnedBy(HtmlNode row, HtmlNode table)
        {
            for (var node = row.ParentNode; node != null; node = node.ParentNode)
            {
                if (string.Equals(node.Name, "table", StringComparison.OrdinalIgnoreCase))
                {
                    return ReferenceEquals(node, table);
                }
            }

            return false;
        }

        /// <summary>
        /// Reads a colspan/rowspan attribute. Missing, malformed or non-positive values count as 1
        /// (so rowspan=0, "span to the end of the row group", is not supported); larger values are
        /// capped at <paramref name="max"/>.
        /// </summary>
        private static int ReadSpan(HtmlNode cell, string attributeName, int max)
        {
            var attr = cell.Attributes[attributeName];
            int value;
            if (attr == null
                || !int.TryParse(attr.Value, NumberStyles.Integer, CultureInfo.InvariantCulture, out value)
                || value < 1)
            {
                return 1;
            }

            return Math.Min(value, max);
        }

        /// <summary>Returns the cell's text with HTML entities decoded and surrounding whitespace removed.</summary>
        private static string ReadText(HtmlNode cell)
        {
            var raw = cell.InnerText;
            if (string.IsNullOrEmpty(raw))
            {
                return string.Empty;
            }

            return HtmlEntity.DeEntitize(raw).Trim();
        }

        /// <summary>Returns the value at <paramref name="index"/>, or null when the row is shorter than that.</summary>
        private static string CellAt(List<string> line, int index)
        {
            return index < line.Count ? line[index] : null;
        }

        /// <summary>
        /// Returns <paramref name="name"/>, or name_1, name_2, ... when it is already taken,
        /// and records the returned name in <paramref name="used"/>.
        /// </summary>
        private static string MakeUnique(string name, HashSet<string> used)
        {
            var candidate = name;
            var suffix = 1;
            while (!used.Add(candidate))
            {
                candidate = name + "_" + suffix;
                suffix++;
            }

            return candidate;
        }
    }
}
View Code

4.转换成DataTable入库就比较方便了。

代码未经严格测试,如有不当之处,敬请指出!

posted @ 2021-02-03 14:19  新*  阅读(1681)  评论(1)  编辑  收藏  举报