随便搞的个抓火车票的程序

背景就不介绍了,目的是为了拿到一份火车票数据。可惜真正研究之后才发现各车次的数据格式并不统一,最终我也没来得及整理出一个完整的格式。

 "日期", "停车时间", "开车时间", "里程", "硬座", "硬卧中铺", "软座", "软卧下铺" ,"高级软卧下铺"    大致上是这个,程序比较粗糙.

-------------------------

//C#程序:用解析DOM树的方式抓取所有火车车次的详细数据,保存为xml格式。注意:其中有一句代码被我mark掉了(下标用 ? 代替)~

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.Text.RegularExpressions;
using System.Xml;
using System.IO;
using System.Diagnostics;
using Winista.Text.HtmlParser;
using Winista.Text.HtmlParser.Lex;
using Winista.Text.HtmlParser.Tags;
using Winista.Text.HtmlParser.Util;
using Winista.Text.HtmlParser.Filters;

namespace GetTrainLine
{
    class Program
    {
        private const string strUrlCur = "http://www.tielu.org/TrainList/";
        //static private string[] arrTrLineColumnName = new string[] { "No","车站","火车票交流","酒店机票", "日期", "停车时间", "开车时间", "里程", "硬座", "硬卧中铺", "软座", "软卧下铺" ,"高级软卧下铺"};
       
        private string[] arrDescription = new string[] { "运行时间","始发站","终点站","发车时间","到站时间","类型","全程"};
        [STAThread]
        static void Main(string[] args)
        {
            string[] arrUrl = new string[27];
            for (int i = 0; i <arrUrl.Length; i++ )
            {
                int num = i + 1;
                arrUrl[i] = strUrlCur + "TrainList-" +num.ToString() + ".html";
            }
            string strXmlFilename = "train.xml";
            XmlDocument xmlTrain = new XmlDocument();
            parseTrainListHtml(arrUrl, xmlTrain);
        }
        private static Boolean parseTrainListHtml(string[] arrUrl, XmlDocument xmlTrain)
        {
            System.Net.WebClient client = new WebClient();
            XmlElement trainLineInfoElement = xmlTrain.createElement_x("TrainLineInfo");
            foreach (string strUrl in arrUrl)
            {
                string strContent = client.DownloadString(strUrl);
                string regex = "../Search/[A-Z0-9-]{1,25}\\.html";
                Regex re = new Regex(regex);
                MatchCollection matches = re.Matches(strContent);

                System.Collections.IEnumerator enu = matches.GetEnumerator();
                while (enu.MoveNext() && enu.Current != null)
                {
                    Match match = (Match)(enu.Current);
                    Console.Write(match.Value + "\r\n");
                    string strTrainlineContent = client.DownloadString(strUrlCur + match.Value);
                    Lexer lexer = new Lexer(strTrainlineContent);
                    Parser parser = new Parser(lexer);
                    NodeList htmlNodes = parser.Parse(null);
                    INode htmlNode = htmlNodes[3];
                    htmlNode = htmlNode.Children[?].Children[?].Children[?].Children[?].Children[?];
                    INode desNode = htmlNode.Children[6];
                    INode lineNode = htmlNode.Children[8];
                    //新建element
                    XmlElement eleTrainline = xmlTrain.createElement_x("TrainLine");
                    XmlElement tmpElem;
                    if (desNode is TableTag)
                    {
                        TableTag tag = (desNode as TableTag);
                        if (!tag.IsEndTag())
                        {
                            for (int i = 0; i < tag.RowCount; i++)
                            {
                                TableRow tableRow = tag.GetRow(i);
                                string[] strColumn = new string[4];
                                for (int j = tableRow.ColumnCount - 4, k = 0; j < tableRow.ColumnCount; j++, k++)
                                {
                                    TableColumn tableColumn = tableRow.Columns[j];
                                    strColumn[k] = tableColumn.ToPlainTextString();
                                }
                                for (int j = 0; 2 * j < 4; j++ )
                                {
                                    tmpElem = xmlTrain.createElement_x(strColumn[2 * j]);
                                    tmpElem.InnerText = strColumn[2 * j + 1];
                                    eleTrainline.AppendChild(tmpElem);
                                }
                                trainLineInfoElement.AppendChild(eleTrainline);
                            }
                        }
                    }

                   
                    //各车站信息
                    int[] arrActiveIndex = new int[] {1,4,5,6,7,8,9,10,11,12,13,14,15};
                    if (lineNode is TableTag)
                    {
                        TableTag tag = (lineNode as TableTag);
                        if (!tag.IsEndTag())
                        {
                            string[] arrTrLineColumnName = new string[tag.GetRow(0).ColumnCount];
                            {
                                TableRow tableRow = tag.GetRow(0);
                                for (int j = 0; j < tableRow.ColumnCount; j++ )
                                {
                                    TableColumn tableColumn = tableRow.Columns[j];
                                    arrTrLineColumnName[j] = tableColumn.ToPlainTextString();
                                }
                            }
                            for (int i = 1; i < tag.RowCount; i++ )
                            {
                                TableRow tableRow = tag.GetRow(i);
                                string[] arrTrlineColumn = new string[13];
                                int curTrainLineColumnCount = tableRow.ColumnCount;

                                //有些车次居然没有价格表,然后有的车次还有高级软卧下铺
                                //Trace.Assert(tableRow.ColumnCount == 12, "html源码中 车站row的td数不为12");
                                for (int j = 0; j < tableRow.ColumnCount; j++ )
                                {
                                    TableColumn tableColumn = tableRow.Columns[j];
                                    arrTrlineColumn[j] = tableColumn.ToPlainTextString();
                                }
                                XmlElement xmlTrainStation = xmlTrain.createElement_x("TrainStationInfo");
                                eleTrainline.AppendChild(xmlTrainStation);
                                for (int j = 0; j < arrActiveIndex.Length && arrActiveIndex[j] < curTrainLineColumnCount; j++)
                                {
                                    tmpElem = xmlTrain.createElement_x(arrTrLineColumnName[arrActiveIndex[j]]);
                                    tmpElem.InnerText = arrTrlineColumn[arrActiveIndex[j]];
                                    xmlTrainStation.AppendChild(tmpElem);
                                }
                            }
                        }
                    }
                   
                    trainLineInfoElement.AppendChild(eleTrainline);
                    xmlTrain.AppendChild(trainLineInfoElement);
                }
          
            }
            xmlTrain.Save("E:/abc.xml");
            return true;
        }
    }
}

posted @ 2013-03-15 15:09  J.Z's World  阅读(269)  评论(0编辑  收藏  举报