本文参照摩诘的Blog整理而成。
今天遇到这样一个问题,从政府网站中,根据一个关键数据KeyData,提取相关数据。
这个问题可分为三部分解决:
1)取得政府网站交互的方法;
2)按照合适的方法,用HttpWebResponse取得相关数据;
3)分析取回来的数据
第一部分:获取网站交互信息,采用工具ieHTTPHeadersSetup.exe
得到的数据如下:
GET /search.asp?key=2006002995&ys_type=hy&imageField2.x=32&imageField2.y=20 HTTP/1.1
Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*
Accept-Language: zh-cn
Accept-Encoding: gzip, deflate
User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)
Host: www.suzhou-logistics.com
Connection: Keep-Alive
可以看出,
url: http://www.suzhou-logistics.com/search.asp?
Data:key=2006002995&ys_type=hy&imageField2.x=32&imageField2.y=20
也可以直接作为url:http://www.suzhou-logistics.com/search.asp?key=2006002995&ys_type=hy&imageField2.x=32&imageField2.y=20
第二部分:根据第一部分的分析,通过HttpWebResponse取HTML
在此就给出一个通用的函数GetPage(参数依次为url、postData、encodeType,以及输出参数err;返回取得的HTML字符串),函数体如下:
{
    // Sends postData to url as an application/x-www-form-urlencoded POST and
    // returns the response body decoded with the encoding named by encodeType.
    // On success err is set to string.Empty. If sending or reading throws,
    // err receives the exception message and string.Empty is returned.
    // Every network resource opened here is released before returning.
    Stream outstream = null;
    StreamReader sr = null;
    HttpWebResponse response = null;
    HttpWebRequest request = null;

    // Resolved outside the try block on purpose: an unknown encoding name or
    // a null postData still surfaces to the caller as an exception.
    Encoding encoding = Encoding.GetEncoding(encodeType);
    byte[] data = encoding.GetBytes(postData);

    try
    {
        // Prepare the request.
        request = WebRequest.Create(url) as HttpWebRequest;
        request.CookieContainer = new CookieContainer();
        request.AllowAutoRedirect = true;
        request.Method = "POST";
        request.ContentType = "application/x-www-form-urlencoded";
        request.ContentLength = data.Length;

        // Upload the form data.
        outstream = request.GetRequestStream();
        outstream.Write(data, 0, data.Length);
        outstream.Close();
        outstream = null; // upload finished; nothing left to clean up for it

        // Send the request and read the whole response as text.
        response = request.GetResponse() as HttpWebResponse;
        sr = new StreamReader(response.GetResponseStream(), encoding);
        string content = sr.ReadToEnd();

        err = string.Empty;
        return content;
    }
    catch (Exception ex)
    {
        err = ex.Message;
        return string.Empty;
    }
    finally
    {
        // Closing the reader also closes the underlying response stream.
        if (sr != null)
        {
            sr.Close();
        }
        if (response != null)
        {
            response.Close();
        }
        if (outstream != null)
        {
            // The upload did not complete. Abort the request instead of
            // closing the half-written request stream, since closing it
            // can itself raise an exception.
            request.Abort();
        }
    }
}
第三部分:分析Html数据,有两个开源软件
SgmlReader与HtmlAgilityPack20,由于本人机器上只有vs2003,无法使用vs2005版本HtmlAgilityPack20。所以下面用SgmlReader来分析。SgmlReader可以将Html解析成格式完整的类似XML数据,可以采用Xpath进行查询,获取我们想要的数据。
取得完整的xml数据后的分析,根据post页面数据格式的不同而有区别。我取的这个页面,主要用了两个DataTable,一个保存一行基本数据,另一个保存多行的状态数据。
{
// Converts the HTML in pageContent to well-formed XML with SgmlReader, then
// runs two XPath queries over it:
//   xclpath - selects the nodes whose text becomes column names;
//   xrpath  - selects the nodes whose text becomes cell values.
// The result is a DataSet holding "QueryResult1" (a single row) and
// "QueryResult2" (rows of five cells). err is string.Empty on success, or the
// exception message on failure; the DataSet built so far is returned either way.
err = string.Empty;
DataSet ds = new DataSet();
DataTable table = new DataTable("QueryResult1");
DataTable table1 = new DataTable("QueryResult2");
StringWriter strWriter = null;
SgmlReader sgmlReader = null;
XmlTextWriter xmlWriter = null;
try
{
// Step 1: copy every node SgmlReader produces into an indented XML string.
sgmlReader = new SgmlReader();
sgmlReader.DocType = "HTML";
sgmlReader.InputStream = new StringReader(pageContent);
strWriter = new StringWriter();
xmlWriter = new XmlTextWriter(strWriter);
xmlWriter.Formatting = Formatting.Indented;
sgmlReader.Read();
while (!sgmlReader.EOF)
{
xmlWriter.WriteNode(sgmlReader, true);
}
xmlWriter.Flush();
xmlWriter.Close();
string wellFormedHTML = strWriter.ToString();
// Without a column query there is nothing to build; the DataSet is returned
// with no tables added to it.
if(xclpath.Trim().Length == 0)
return ds;
XPathDocument doc = new XPathDocument(new StringReader(wellFormedHTML));
XPathNavigator nav = doc.CreateNavigator();
// Step 2: build the columns. i is the zero-based position of the current node.
XPathNodeIterator nodes = nav.Select(xclpath);
int i = 0;
while (nodes.MoveNext())
{
string sNodeText = nodes.Current.Value;
// Every node except the last five belongs to the first table, but only the
// first 17 of them become columns; nodes from position 17 up to the last
// five are ignored.
if( i < nodes.Count - 5)
{
if( i< 17)
{
// Column names must be unique, so a repeated name gets the position appended.
if(table.Columns.Contains(sNodeText))
{
sNodeText = sNodeText + i.ToString();
}
table.Columns.Add(sNodeText ,typeof(string));
}
}
else
{
// The last five nodes become the columns of the second table.
if(table1.Columns.Contains(sNodeText))
{
sNodeText = sNodeText + i.ToString();
}
table1.Columns.Add(sNodeText ,typeof(string));
}
i ++;
}
ds.Tables.Add(table);
ds.Tables.Add(table1);
// Step 3: fill the rows. bNext switches from filling the first table's single
// row to filling rows of the second table.
bool bNext = false;
nodes = nav.Select(xrpath);
DataRow row = table.NewRow();
table.Rows.Add(row);
DataRow row1 = null;
// j is the next cell index in the first table's row; k is the next cell
// index (0-4) in the current row of the second table.
int j = 0;
int k = 0;
while (nodes.MoveNext())
{
string nodetext = nodes.Current.Value;
// Skip cells whose text equals an existing column name (header cells).
if(table.Columns.Contains(nodetext) || table1.Columns.Contains(nodetext))
{
continue;
}
// The first cell whose text is the marker string below starts the second
// table; the marker cell itself is stored as the first value of that table.
// NOTE(review): presumably this is the first status value on the page - confirm.
if(!bNext)
{
if ( nodetext == "正在预录入")
{
bNext = true;
}
}
if(!bNext)
{
// Only the first 17 values are kept in the first table; later ones are dropped.
if( j < 17)
{
row[j] = nodetext;
j++;
}
}
else
{
// A new row is started for the second table after every five values.
if( k == 0)
{
row1 = table1.NewRow();
table1.Rows.Add(row1);
}
row1[k] = nodetext;
k = (k + 1) % 5;
}
}
err = string.Empty;
return ds;
}
catch (Exception exp)
{
// Report the failure through err and return whatever was built so far.
err = exp.Message;
return ds;
}
}
有了上面的代码就可以采用如下方法调用了
{
    // Fetch the query result page, parse it into two tables and bind each
    // non-empty table to its grid. Nothing is bound when the download or the
    // parse reports an error, or when the parse does not yield two tables.
    string queryUrl = @"http://www.suzhou-logistics.com/search.asp?key=2006002995&ys_type=hy&imageField2.x=32&imageField2.y=20";
    string fetchError = string.Empty;
    string pageHtml = WebForm1.GetPage(queryUrl, string.Empty, "GB18030", out fetchError);

    if (fetchError == string.Empty)
    {
        // XPath for the header cells (column names) and for all data cells.
        string columnPath = @"/html/table/tr/td/font/strong".Length == 0 ? string.Empty : @"/html/table/tr/td/table/tr/td/font/strong";
        string cellPath = @"/html/table/tr/td/table/tr/td";
        string parseError = string.Empty;
        DataSet parsed = WebForm1.ParsePage(pageHtml, columnPath, cellPath, out parseError);

        bool hasBothTables = (parseError == string.Empty) && (parsed.Tables.Count == 2);
        if (hasBothTables)
        {
            DataTable firstTable = parsed.Tables[0];
            if (firstTable.Rows.Count > 0)
            {
                DataGrid1.DataSource = firstTable;
                DataGrid1.DataBind();
            }

            DataTable secondTable = parsed.Tables[1];
            if (secondTable.Rows.Count > 0)
            {
                DataGrid2.DataSource = secondTable;
                DataGrid2.DataBind();
            }
        }
    }
}
其实SgmlReader可以直接完成从URL抓取数据的功能,即将第二部分与第三部分合并。
{
    // Lets SgmlReader download the HTML document at url itself and returns the
    // document re-serialized as XML text.
    //   upper     - when true, names are folded to upper case (CaseFolding.ToUpper).
    //   formatted - when true, output is indented and the reader's whitespace
    //               handling is set to None.
    // If anything throws, the exception's full text (e.ToString()) is returned
    // in place of the XML.
    SgmlReader r = null;
    try
    {
        r = new SgmlReader();
        r.SetBaseUri(Server.MapPath("."));
        r.DocType = "HTML";
        r.Href = url;
        if (upper)
        {
            r.CaseFolding = CaseFolding.ToUpper;
        }

        StringWriter sw = new StringWriter();
        XmlTextWriter w = new XmlTextWriter(sw);
        if (formatted)
        {
            w.Formatting = Formatting.Indented;
            r.WhitespaceHandling = WhitespaceHandling.None;
        }

        // Copy every node from the reader to the writer.
        r.Read();
        while (!r.EOF)
        {
            w.WriteNode(r, true);
        }
        w.Flush();
        w.Close();
        return sw.ToString();
    }
    catch (Exception e)
    {
        return e.ToString();
    }
    finally
    {
        // The reader opened the remote document via Href; release it whether
        // or not the conversion succeeded.
        if (r != null)
        {
            r.Close();
        }
    }
}