本文参照摩诘的Blog整理而成。
今天遇到这样一个问题,从政府网站中,根据一个关键数据KeyData,提取相关数据。
这个问题可分为三部分解决:
1)取得政府网站交互的方法;
2)按照合适的方法,用HttpWebRequest/HttpWebResponse取得相关数据;
3)分析取回来的数据
第一部分:获取网站交互信息,采用工具ieHTTPHeadersSetup.exe
得到的数据如下:
GET /search.asp?key=2006002995&ys_type=hy&imageField2.x=32&imageField2.y=20 HTTP/1.1
Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*
Accept-Language: zh-cn
Accept-Encoding: gzip, deflate
User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)
Host: www.suzhou-logistics.com
Connection: Keep-Alive
可以看出,
url: http://www.suzhou-logistics.com/search.asp?
Data:key=2006002995&ys_type=hy&imageField2.x=32&imageField2.y=20
也可以直接作为url:http://www.suzhou-logistics.com/search.asp?key=2006002995&ys_type=hy&imageField2.x=32&imageField2.y=20
第二部分:根据第一部分的分析,通过HttpWebResponse取HTML
在此给出一个通用的函数(注意:该函数固定以POST方式提交;当postData为空时,参数只通过url中的查询字符串传递):
/// <summary>
/// Sends an HTTP POST to <paramref name="url"/> with <paramref name="postData"/> as a
/// form-urlencoded body and returns the response body as text.
/// </summary>
/// <param name="url">Absolute HTTP/HTTPS address to request (may carry a query string).</param>
/// <param name="postData">Form data to send as the request body; null is treated as empty.</param>
/// <param name="encodeType">Encoding name (e.g. "GB18030") used both to encode the body and to decode the response.</param>
/// <param name="err">Empty on success; otherwise the message of the failure.</param>
/// <returns>The response text, or an empty string when any step fails.</returns>
public static string GetPage(string url, string postData,string encodeType,out string err)
{
    Stream outstream = null;
    StreamReader sr = null;
    HttpWebResponse response = null;
    HttpWebRequest request = null;

    try
    {
        // Resolve the encoding inside the try block so an unknown encoding name is
        // reported through err like every other failure instead of escaping as an exception.
        Encoding encoding = Encoding.GetEncoding(encodeType);
        byte[] data = encoding.GetBytes(postData == null ? string.Empty : postData);

        // Prepare the request.
        request = WebRequest.Create(url) as HttpWebRequest;
        if (request == null)
        {
            // WebRequest.Create returned a non-HTTP request type (e.g. for a file:// URL).
            err = "Not an HTTP/HTTPS URL: " + url;
            return string.Empty;
        }

        request.CookieContainer = new CookieContainer();
        request.AllowAutoRedirect = true;
        request.Method = "POST";
        request.ContentType = "application/x-www-form-urlencoded";
        request.ContentLength = data.Length;

        outstream = request.GetRequestStream();
        outstream.Write(data, 0, data.Length);
        outstream.Close();
        outstream = null; // body fully written; nothing left to abort in the cleanup below

        // Send the request and fetch the response.
        // The POST is not actually sent to the target page until GetResponse() is called.
        response = request.GetResponse() as HttpWebResponse;
        sr = new StreamReader(response.GetResponseStream(), encoding);

        // Return the HTML of the result page.
        string content = sr.ReadToEnd();
        err = string.Empty;
        return content;
    }
    catch (Exception ex)
    {
        err = ex.Message;
        return string.Empty;
    }
    finally
    {
        // Release the connection on every path; the original never closed the
        // reader/response, which keeps the underlying connection occupied.
        try
        {
            if (sr != null)
            {
                sr.Close();
            }
            if (response != null)
            {
                response.Close();
            }
            if (outstream != null && request != null)
            {
                // The request body was not written completely; abort instead of
                // closing the half-written request stream.
                request.Abort();
            }
        }
        catch (Exception)
        {
            // Best-effort cleanup: a failure here must not replace the result or
            // the error already reported through err.
        }
    }
}
第三部分:分析Html数据,有两个开源软件
SgmlReader与HtmlAgilityPack20。由于本人机器上只有vs2003,无法使用面向vs2005的HtmlAgilityPack20,所以下面用SgmlReader来分析。SgmlReader可以将Html转换成格式良好(well-formed)的XML数据,之后即可用XPath进行查询,获取我们想要的数据。
取得完整的xml数据后的分析,根据post页面数据格式的不同而有区别。我取的这个页面,主要用了两个DataTable,一个保存一行基本数据,另一个保存多行的状态数据。
/// <summary>
/// Converts an HTML page to well-formed XML with SgmlReader, then fills two tables
/// from it: "QueryResult1" (a single row of basic data) and "QueryResult2"
/// (multiple rows of status data).
/// </summary>
/// <param name="pageContent">HTML text of the page to parse.</param>
/// <param name="xclpath">XPath selecting the header cells that become column names; when blank, an empty DataSet is returned.</param>
/// <param name="xrpath">XPath selecting the data cells.</param>
/// <param name="err">Empty on success; otherwise the message of the exception that stopped parsing.</param>
/// <returns>
/// The DataSet built so far. On failure it may contain no tables or partially filled
/// tables, so callers should check <paramref name="err"/> first.
/// </returns>
public static DataSet ParsePage(string pageContent, string xclpath,string xrpath,out string err)
{
    // Layout of the page this parser was written for.
    const int MaxBasicColumns = 17;           // at most this many headers/cells go into the basic-data table
    const int StatusColumnCount = 5;          // the last headers (and each status row) have this many cells
    const string StatusStartMarker = "正在预录入"; // first cell text that belongs to the status table

    err = string.Empty;
    DataSet ds = new DataSet();
    DataTable table = new DataTable("QueryResult1");
    DataTable table1 = new DataTable("QueryResult2");

    StringWriter strWriter = null;
    SgmlReader sgmlReader = null;
    XmlTextWriter xmlWriter = null;

    try
    {
        // Step 1: HTML -> well-formed XML text.
        sgmlReader = new SgmlReader();
        sgmlReader.DocType = "HTML";
        sgmlReader.InputStream = new StringReader(pageContent);
        strWriter = new StringWriter();
        xmlWriter = new XmlTextWriter(strWriter);
        xmlWriter.Formatting = Formatting.Indented;

        sgmlReader.Read();
        while (!sgmlReader.EOF)
        {
            xmlWriter.WriteNode(sgmlReader, true);
        }
        xmlWriter.Flush();
        xmlWriter.Close();
        xmlWriter = null; // already closed; skip it in the cleanup below

        string wellFormedHTML = strWriter.ToString();

        if (xclpath.Trim().Length == 0)
        {
            return ds;
        }

        XPathDocument doc = new XPathDocument(new StringReader(wellFormedHTML));
        XPathNavigator nav = doc.CreateNavigator();

        // Step 2: build the columns. The last StatusColumnCount headers belong to
        // the status table; of the headers before them, the first MaxBasicColumns
        // become columns of the basic-data table and any others are ignored.
        XPathNodeIterator nodes = nav.Select(xclpath);
        int headerCount = nodes.Count;

        int i = 0;
        while (nodes.MoveNext())
        {
            string sNodeText = nodes.Current.Value;
            if (i < headerCount - StatusColumnCount)
            {
                if (i < MaxBasicColumns)
                {
                    if (table.Columns.Contains(sNodeText))
                    {
                        // Column names must be unique; disambiguate with the header index.
                        sNodeText = sNodeText + i.ToString();
                    }
                    table.Columns.Add(sNodeText, typeof(string));
                }
            }
            else
            {
                if (table1.Columns.Contains(sNodeText))
                {
                    sNodeText = sNodeText + i.ToString();
                }
                table1.Columns.Add(sNodeText, typeof(string));
            }
            i++;
        }

        ds.Tables.Add(table);
        ds.Tables.Add(table1);

        // Step 3: fill the rows from the data cells.
        bool bNext = false; // becomes true once the status section has started
        nodes = nav.Select(xrpath);

        DataRow row = table.NewRow();
        table.Rows.Add(row);

        DataRow row1 = null;
        int j = 0; // next cell index in the basic-data row
        int k = 0; // next cell index in the current status row
        while (nodes.MoveNext())
        {
            string nodetext = nodes.Current.Value;

            // Cells whose text equals a column name are header cells, not data.
            if (table.Columns.Contains(nodetext) || table1.Columns.Contains(nodetext))
            {
                continue;
            }

            if (!bNext && nodetext == StatusStartMarker)
            {
                bNext = true;
            }

            if (!bNext)
            {
                // NOTE: indexing throws (reported through err) when fewer columns
                // were discovered than cells are written here.
                if (j < MaxBasicColumns)
                {
                    row[j] = nodetext;
                    j++;
                }
            }
            else
            {
                if (k == 0)
                {
                    // Every StatusColumnCount cells start a new status row.
                    row1 = table1.NewRow();
                    table1.Rows.Add(row1);
                }

                row1[k] = nodetext;
                k = (k + 1) % StatusColumnCount;
            }
        }

        return ds;
    }
    catch (Exception exp)
    {
        err = exp.Message;
        return ds;
    }
    finally
    {
        // Release the writer/reader on every path, including failures.
        try
        {
            if (xmlWriter != null)
            {
                xmlWriter.Close();
            }
            if (sgmlReader != null)
            {
                sgmlReader.Close();
            }
        }
        catch (Exception)
        {
            // Best-effort cleanup: must not replace the result or err computed above.
        }
    }
}
有了上面的代码就可以采用如下方法调用了
/// <summary>
/// Click handler: fetches the query page, parses it into two tables and binds each
/// non-empty table to its grid. Does nothing further when fetching or parsing
/// reports an error.
/// </summary>
private void Button1_Click(object sender, System.EventArgs e)
{
    string errorText = string.Empty;
    string pageHtml = WebForm1.GetPage(
        @"http://www.suzhou-logistics.com/search.asp?key=2006002995&ys_type=hy&imageField2.x=32&imageField2.y=20",
        string.Empty,
        "GB18030",
        out errorText);

    // Fetch failed: nothing to parse.
    if (errorText != string.Empty)
    {
        return;
    }

    string headerPath = @"/html/table/tr/td/table/tr/td/font/strong";
    string cellPath = @"/html/table/tr/td/table/tr/td";
    DataSet parsed = WebForm1.ParsePage(pageHtml, headerPath, cellPath, out errorText);

    // Parse failed, or the page did not yield both tables.
    if (errorText != string.Empty || parsed.Tables.Count != 2)
    {
        return;
    }

    // Basic data -> first grid.
    if (parsed.Tables[0].Rows.Count > 0)
    {
        DataGrid1.DataSource = parsed.Tables[0];
        DataGrid1.DataBind();
    }

    // Status data -> second grid.
    if (parsed.Tables[1].Rows.Count > 0)
    {
        DataGrid2.DataSource = parsed.Tables[1];
        DataGrid2.DataBind();
    }
}
其实SgmlReader可以直接完成从URL抓取数据的功能,即将第二部分与第三部分合并。
/// <summary>
/// Lets SgmlReader load the document at <paramref name="url"/> directly and returns
/// it re-serialized as XML text.
/// </summary>
/// <param name="baseUri">Not used by this method; the base URI is taken from Server.MapPath(".").</param>
/// <param name="url">Address assigned to the reader's Href.</param>
/// <param name="log">Not used by this method.</param>
/// <param name="upper">When true, the reader folds names to upper case.</param>
/// <param name="formatted">When true, output is indented and the reader's whitespace handling is set to None.</param>
/// <returns>The XML text, or the full exception text (e.ToString()) when anything fails.</returns>
string SgmlReaderTest(Uri baseUri, string url, TextWriter log, bool upper, bool formatted)
{
    SgmlReader r = null;
    XmlTextWriter w = null;
    try
    {
        r = new SgmlReader();
        r.SetBaseUri(Server.MapPath("."));
        r.DocType = "HTML";
        r.Href = url;
        if (upper)
        {
            r.CaseFolding = CaseFolding.ToUpper;
        }

        StringWriter sw = new StringWriter();
        w = new XmlTextWriter(sw);

        if (formatted)
        {
            w.Formatting = Formatting.Indented;
            r.WhitespaceHandling = WhitespaceHandling.None;
        }

        // Copy every node from the reader to the writer.
        r.Read();
        while (!r.EOF)
        {
            w.WriteNode(r, true);
        }
        w.Flush();
        w.Close();
        w = null; // already closed; skip it in the cleanup below
        return sw.ToString();
    }
    catch (Exception e)
    {
        return e.ToString();
    }
    finally
    {
        // Close the reader (and the writer on failure) on every path so the
        // source opened for Href is released.
        try
        {
            if (w != null)
            {
                w.Close();
            }
            if (r != null)
            {
                r.Close();
            }
        }
        catch (Exception)
        {
            // Best-effort cleanup: must not replace the value being returned.
        }
    }
}