随便搞的个抓火车票的程序
背景不介绍,目的是为了抓取火车票数据。可惜真正研究了才发现各种车次的数据格式不统一,最终我都没有来得及整理出一个完整的格式。
"日期", "停车时间", "开车时间", "里程", "硬座", "硬卧中铺", "软座", "软卧下铺", "高级软卧下铺" —— 抓取的字段大致上是这些,程序比较粗糙。
-------------------------
//C#程序:用解析DOM树的方式抓取所有火车车次的详细数据,并保存为XML格式;其中有一句代码被故意隐去(即下文 Children[?] 那一行)~
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.Text.RegularExpressions;
using System.Xml;
using System.IO;
using System.Diagnostics;
using Winista.Text.HtmlParser;
using Winista.Text.HtmlParser.Lex;
using Winista.Text.HtmlParser.Tags;
using Winista.Text.HtmlParser.Util;
using Winista.Text.HtmlParser.Filters;
namespace GetTrainLine
{
/// <summary>
/// Console scraper that walks the train list index pages under
/// <c>http://www.tielu.org/TrainList/</c>, downloads every train detail page
/// linked from them, and writes the extracted description and per-station
/// data to a single XML file.
/// </summary>
class Program
{
    // Base URL of the index pages; detail-page links found on them are appended to it.
    private const string strUrlCur = "http://www.tielu.org/TrainList/";

    // Number of index pages requested (TrainList-1.html .. TrainList-27.html).
    private const int TrainListPageCount = 27;

    // Destination of the generated XML document.
    private const string OutputXmlPath = "E:/abc.xml";

    // For reference, a station table header was once expected to hold these columns:
    // No, station, ticket exchange, hotel/flight, date, stop time, departure time,
    // distance, hard seat, hard sleeper (middle), soft seat, soft sleeper (lower),
    // deluxe soft sleeper (lower). The header is now read from the page instead,
    // because the layout differs between trains.

    // Labels of the description table. Not referenced anywhere in this class.
    private string[] arrDescription = new string[] { "运行时间","始发站","终点站","发车时间","到站时间","类型","全程"};

    // Zero-based station-table columns that are exported, in ascending order.
    private static readonly int[] ActiveStationColumns = new int[] { 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };

    // Relative links to train detail pages. The dots are escaped so that only a
    // literal "../" prefix matches, not any two characters.
    private static readonly Regex TrainDetailLinkRegex = new Regex("\\.\\./Search/[A-Z0-9-]{1,25}\\.html");

    /// <summary>
    /// Entry point: builds the index page URLs and runs the scraper on them.
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    [STAThread]
    static void Main(string[] args)
    {
        string[] arrUrl = new string[TrainListPageCount];
        for (int i = 0; i < arrUrl.Length; i++)
        {
            // Page numbers are one-based.
            arrUrl[i] = strUrlCur + "TrainList-" + (i + 1).ToString() + ".html";
        }

        XmlDocument xmlTrain = new XmlDocument();
        parseTrainListHtml(arrUrl, xmlTrain);
    }

    /// <summary>
    /// Downloads each index page, follows every train detail link found on it,
    /// converts each detail page into a <c>TrainLine</c> element under a
    /// <c>TrainLineInfo</c> root, and saves the document to <c>E:/abc.xml</c>.
    /// </summary>
    /// <param name="arrUrl">URLs of the index pages to scan.</param>
    /// <param name="xmlTrain">Document that receives the root element; expected to have no root yet.</param>
    /// <returns>Always <c>true</c>; download, parse and save failures surface as exceptions.</returns>
    private static Boolean parseTrainListHtml(string[] arrUrl, XmlDocument xmlTrain)
    {
        XmlElement trainLineInfoElement = xmlTrain.CreateElement("TrainLineInfo");
        // Attach the root once, up front, so the document can be saved even when
        // no detail link is found.
        xmlTrain.AppendChild(trainLineInfoElement);

        using (WebClient client = new WebClient())
        {
            foreach (string strUrl in arrUrl)
            {
                string strContent = client.DownloadString(strUrl);
                foreach (Match match in TrainDetailLinkRegex.Matches(strContent))
                {
                    Console.Write(match.Value + "\r\n");
                    string strTrainlineContent = client.DownloadString(strUrlCur + match.Value);
                    trainLineInfoElement.AppendChild(BuildTrainLineElement(xmlTrain, strTrainlineContent));
                }
            }
        }

        xmlTrain.Save(OutputXmlPath);
        return true;
    }

    // Parses one train detail page and returns its TrainLine element (not yet attached).
    private static XmlElement BuildTrainLineElement(XmlDocument xmlTrain, string strTrainlineContent)
    {
        INode htmlNode = LocateContentNode(strTrainlineContent);
        // Child 6 is treated as the description table and child 8 as the station table.
        INode desNode = htmlNode.Children[6];
        INode lineNode = htmlNode.Children[8];

        XmlElement eleTrainline = xmlTrain.CreateElement("TrainLine");
        AppendDescription(xmlTrain, eleTrainline, desNode as TableTag);
        AppendStations(xmlTrain, eleTrainline, lineNode as TableTag);
        return eleTrainline;
    }

    // Walks from the parsed page down to the node whose children hold the two data tables.
    private static INode LocateContentNode(string strTrainlineContent)
    {
        Lexer lexer = new Lexer(strTrainlineContent);
        Parser parser = new Parser(lexer);
        NodeList htmlNodes = parser.Parse(null);
        // NOTE(review): presumably the fourth top-level node is the page's root element - confirm.
        INode htmlNode = htmlNodes[3];
        // NOTE: the author deliberately redacted the five child indices on the next
        // line before publishing. They must be restored by inspecting a live page;
        // this statement does not compile until that is done.
        htmlNode = htmlNode.Children[?].Children[?].Children[?].Children[?].Children[?];
        return htmlNode;
    }

    // Adds the label/value pairs held in the last four cells of every description row.
    private static void AppendDescription(XmlDocument xmlTrain, XmlElement eleTrainline, TableTag tag)
    {
        if (tag == null || tag.IsEndTag())
        {
            return;
        }

        for (int i = 0; i < tag.RowCount; i++)
        {
            TableRow tableRow = tag.GetRow(i);
            int firstCell = tableRow.ColumnCount - 4;
            if (firstCell < 0)
            {
                // Fewer than four cells: the row cannot hold two label/value pairs.
                continue;
            }

            for (int j = firstCell; j + 1 < tableRow.ColumnCount; j += 2)
            {
                AppendTextElement(
                    xmlTrain,
                    eleTrainline,
                    tableRow.Columns[j].ToPlainTextString(),
                    tableRow.Columns[j + 1].ToPlainTextString());
            }
        }
    }

    // Adds one TrainStationInfo element per data row, naming its children after the header row.
    private static void AppendStations(XmlDocument xmlTrain, XmlElement eleTrainline, TableTag tag)
    {
        if (tag == null || tag.IsEndTag() || tag.RowCount == 0)
        {
            return;
        }

        TableRow headerRow = tag.GetRow(0);
        string[] arrTrLineColumnName = new string[headerRow.ColumnCount];
        for (int j = 0; j < arrTrLineColumnName.Length; j++)
        {
            arrTrLineColumnName[j] = headerRow.Columns[j].ToPlainTextString();
        }

        for (int i = 1; i < tag.RowCount; i++)
        {
            TableRow tableRow = tag.GetRow(i);
            // Some trains have no price columns at all, while others carry an extra
            // deluxe soft sleeper column, so the width is taken from the row itself
            // and capped by the header so both lookups stay in range.
            int usableColumns = Math.Min(tableRow.ColumnCount, arrTrLineColumnName.Length);

            XmlElement xmlTrainStation = xmlTrain.CreateElement("TrainStationInfo");
            eleTrainline.AppendChild(xmlTrainStation);

            foreach (int column in ActiveStationColumns)
            {
                if (column >= usableColumns)
                {
                    // Indices are ascending, so every remaining one is out of range too.
                    break;
                }

                AppendTextElement(
                    xmlTrainStation.OwnerDocument,
                    xmlTrainStation,
                    arrTrLineColumnName[column],
                    tableRow.Columns[column].ToPlainTextString());
            }
        }
    }

    // Appends <name>text</name> to parent. The name comes from page text, so it is
    // trimmed and XML-encoded; a cell with an empty name is skipped because it
    // cannot become an element.
    private static void AppendTextElement(XmlDocument xmlTrain, XmlElement parent, string name, string text)
    {
        string trimmedName = name == null ? string.Empty : name.Trim();
        if (trimmedName.Length == 0)
        {
            return;
        }

        XmlElement element = xmlTrain.CreateElement(XmlConvert.EncodeLocalName(trimmedName));
        element.InnerText = text;
        parent.AppendChild(element);
    }
}
}