本文参照摩诘的Blog整理而成。
今天遇到这样一个问题,从政府网站中,根据一个关键数据KeyData,提取相关数据。
这个问题可分为三部分解决:
1)取得政府网站交互的方法;
2)按照合适的方法,用HttpWebResponse取得相关数据;
3)分析取回来的数据。
第一部分:获取网站交互信息,采用工具ieHTTPHeadersSetup.exe
得到的数据如下:
GET /search.asp?key=2006002995&ys_type=hy&imageField2.x=32&imageField2.y=20 HTTP/1.1
Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*
Accept-Language: zh-cn
Accept-Encoding: gzip, deflate
User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)
Host: www.suzhou-logistics.com
Connection: Keep-Alive
可以看出,
url: http://www.suzhou-logistics.com/search.asp?
Data:key=2006002995&ys_type=hy&imageField2.x=32&imageField2.y=20
也可以直接作为url:http://www.suzhou-logistics.com/search.asp?key=2006002995&ys_type=hy&imageField2.x=32&imageField2.y=20
第二部分:根据第一部分的分析,通过HttpWebResponse取HTML
在此就给出一个通用的函数(注意:该函数以POST方式提交postData;上面抓到的请求是GET,所以后面调用时把查询参数直接写在url里,postData传空字符串即可):
/// <summary>
/// Sends <paramref name="postData"/> to <paramref name="url"/> as a form-encoded
/// HTTP POST and returns the response body as text.
/// </summary>
/// <param name="url">Address to request; a query string, if any, travels as part of the URL.</param>
/// <param name="postData">Form-encoded request body. Null is treated as an empty body.</param>
/// <param name="encodeType">
/// Encoding name (for example "GB18030") used both to encode the request body
/// and to decode the response text.
/// </param>
/// <param name="err">
/// Empty on success; otherwise the message of the failure that stopped the request
/// (including an unknown encoding name or an unsupported URL scheme).
/// </param>
/// <returns>The response text, or an empty string when an error was reported.</returns>
public static string GetPage(string url, string postData, string encodeType, out string err)
{
    Stream outstream = null;
    StreamReader sr = null;
    HttpWebResponse response = null;

    try
    {
        // Resolved inside the try block so that a bad encoding name is reported
        // through err like every other failure instead of escaping as an exception.
        Encoding encoding = Encoding.GetEncoding(encodeType);
        byte[] data = encoding.GetBytes(postData == null ? string.Empty : postData);

        // Prepare the request.
        HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
        if (request == null)
        {
            // WebRequest.Create returned a non-HTTP request type (e.g. file:// or ftp://).
            err = "Only HTTP and HTTPS addresses are supported: " + url;
            return string.Empty;
        }

        request.CookieContainer = new CookieContainer();
        request.AllowAutoRedirect = true;
        request.Method = "POST";
        request.ContentType = "application/x-www-form-urlencoded";
        request.ContentLength = data.Length;

        // Write the request body.
        outstream = request.GetRequestStream();
        outstream.Write(data, 0, data.Length);
        outstream.Close();
        outstream = null;

        // Fetch the response and read the whole page (HTML) as text.
        response = (HttpWebResponse)request.GetResponse();
        sr = new StreamReader(response.GetResponseStream(), encoding);
        string content = sr.ReadToEnd();

        err = string.Empty;
        return content;
    }
    catch (Exception ex)
    {
        err = ex.Message;
        return string.Empty;
    }
    finally
    {
        // Release the connection on every path; the original never closed the
        // response or its reader, which keeps the underlying connection busy.
        if (sr != null)
        {
            sr.Close();
        }
        if (outstream != null)
        {
            try
            {
                outstream.Close();
            }
            catch (Exception)
            {
                // Closing a half-written request stream can itself fail; the primary
                // failure has already been reported through err, so it is not replaced.
            }
        }
        if (response != null)
        {
            response.Close();
        }
    }
}
第三部分:分析Html数据,有两个开源软件
SgmlReader与HtmlAgilityPack20,由于本人机器上只有vs2003,无法使用vs2005版本HtmlAgilityPack20。所以下面用SgmlReader来分析。SgmlReader可以将Html解析成格式完整的类似XML数据,可以采用Xpath进行查询,获取我们想要的数据。
取得完整的xml数据后的分析,根据post页面数据格式的不同而有区别。我取的这个页面,主要用了两个DataTable,一个保存一行基本数据,另一个保存多行的状态数据。
/// <summary>
/// Converts an HTML page to well-formed XML with SgmlReader and extracts two tables
/// from it by XPath: "QueryResult1" (a single row of basic data) and "QueryResult2"
/// (the repeated status rows).
/// </summary>
/// <param name="pageContent">HTML text of the page.</param>
/// <param name="xclpath">
/// XPath selecting the header cells. The last five matches become the columns of
/// "QueryResult2"; of the remaining matches, the first seventeen become the columns
/// of "QueryResult1". When blank, an empty DataSet (no tables) is returned.
/// </param>
/// <param name="xrpath">XPath selecting every data cell, in document order.</param>
/// <param name="err">Empty on success; otherwise the message of the exception that stopped parsing.</param>
/// <returns>
/// The DataSet built so far. On success with a non-blank <paramref name="xclpath"/>
/// it contains exactly the two tables described above.
/// </returns>
public static DataSet ParsePage(string pageContent, string xclpath, string xrpath, out string err)
{
    // Layout constants of the queried page: at most 17 basic-data columns,
    // and status rows that are 5 cells wide.
    const int BasicColumnLimit = 17;
    const int StatusColumnCount = 5;

    err = string.Empty;
    DataSet ds = new DataSet();
    DataTable table = new DataTable("QueryResult1");
    DataTable table1 = new DataTable("QueryResult2");

    SgmlReader sgmlReader = null;
    XmlTextWriter xmlWriter = null;

    try
    {
        // Nothing to select: skip the (comparatively expensive) HTML conversion.
        if (xclpath.Trim().Length == 0)
        {
            return ds;
        }

        // Step 1: turn the HTML into well-formed XML so it can be queried with XPath.
        sgmlReader = new SgmlReader();
        sgmlReader.DocType = "HTML";
        sgmlReader.InputStream = new StringReader(pageContent);
        StringWriter strWriter = new StringWriter();
        xmlWriter = new XmlTextWriter(strWriter);
        xmlWriter.Formatting = Formatting.Indented;

        sgmlReader.Read();
        while (!sgmlReader.EOF)
        {
            xmlWriter.WriteNode(sgmlReader, true);
        }
        xmlWriter.Close();
        xmlWriter = null;

        string wellFormedHTML = strWriter.ToString();

        XPathDocument doc = new XPathDocument(new StringReader(wellFormedHTML));
        XPathNavigator nav = doc.CreateNavigator();

        // Step 2: build the columns of both tables from the header cells.
        XPathNodeIterator nodes = nav.Select(xclpath);
        int statusStart = nodes.Count - StatusColumnCount;
        int i = 0;
        while (nodes.MoveNext())
        {
            string headerText = nodes.Current.Value;
            if (i >= statusStart)
            {
                AddHeaderColumn(table1, headerText, i);
            }
            else if (i < BasicColumnLimit)
            {
                AddHeaderColumn(table, headerText, i);
            }
            i++;
        }

        ds.Tables.Add(table);
        ds.Tables.Add(table1);

        // Step 3: distribute the data cells over the two tables.
        nodes = nav.Select(xrpath);

        DataRow row = table.NewRow();
        table.Rows.Add(row);

        DataRow row1 = null;
        bool inStatusSection = false;
        int j = 0;
        int k = 0;
        while (nodes.MoveNext())
        {
            string nodetext = nodes.Current.Value;

            // The cell XPath also matches the header cells; skip any cell whose
            // text is a column name.
            if (table.Columns.Contains(nodetext) || table1.Columns.Contains(nodetext))
            {
                continue;
            }

            // This marker text is the first cell of the status section; it and
            // everything after it belongs to the second table.
            if (!inStatusSection && nodetext == "正在预录入")
            {
                inStatusSection = true;
            }

            if (!inStatusSection)
            {
                // Bounded by the real column count (never more than BasicColumnLimit)
                // so a page with fewer header cells drops the surplus values instead
                // of aborting with an index error.
                if (j < table.Columns.Count)
                {
                    row[j] = nodetext;
                    j++;
                }
                continue;
            }

            if (k == 0)
            {
                row1 = table1.NewRow();
                table1.Rows.Add(row1);
            }

            // Rows stay StatusColumnCount cells wide; cells without a matching
            // column are dropped rather than raising an index error.
            if (k < table1.Columns.Count)
            {
                row1[k] = nodetext;
            }
            k = (k + 1) % StatusColumnCount;
        }

        return ds;
    }
    catch (Exception exp)
    {
        err = exp.Message;
        return ds;
    }
    finally
    {
        // Release the reader/writer on every path, including failures.
        if (xmlWriter != null)
        {
            xmlWriter.Close();
        }
        if (sgmlReader != null)
        {
            sgmlReader.Close();
        }
    }
}

/// <summary>
/// Adds a string column named after a header cell; when the name is already taken
/// the header's position is appended to keep the name unique.
/// </summary>
private static void AddHeaderColumn(DataTable target, string headerText, int position)
{
    string columnName = headerText;
    if (target.Columns.Contains(columnName))
    {
        columnName = columnName + position.ToString();
    }
    target.Columns.Add(columnName, typeof(string));
}
有了上面的代码,就可以采用如下方法调用了:
/// <summary>
/// Click handler: downloads the query page, parses it into two tables and binds
/// each non-empty table to its grid. Does nothing when downloading or parsing
/// reports an error, or when the parse result does not contain exactly two tables.
/// </summary>
private void Button1_Click(object sender, System.EventArgs e)
{
    string queryUrl = @"http://www.suzhou-logistics.com/search.asp?key=2006002995&ys_type=hy&imageField2.x=32&imageField2.y=20";
    string error = string.Empty;

    // Step 1: fetch the raw HTML.
    string html = WebForm1.GetPage(queryUrl, string.Empty, "GB18030", out error);
    if (error != string.Empty)
    {
        return;
    }

    // Step 2: extract header cells and data cells by XPath.
    string headerXPath = @"/html/table/tr/td/table/tr/td/font/strong";
    string cellXPath = @"/html/table/tr/td/table/tr/td";
    DataSet result = WebForm1.ParsePage(html, headerXPath, cellXPath, out error);
    if (error != string.Empty || result.Tables.Count != 2)
    {
        return;
    }

    // Step 3: show whichever tables actually contain rows.
    DataTable basicData = result.Tables[0];
    if (basicData.Rows.Count > 0)
    {
        DataGrid1.DataSource = basicData;
        DataGrid1.DataBind();
    }

    DataTable statusData = result.Tables[1];
    if (statusData.Rows.Count > 0)
    {
        DataGrid2.DataSource = statusData;
        DataGrid2.DataBind();
    }
}
其实SgmlReader可以直接完成从URL抓取数据的功能,也就是把第二部分与第三部分合并。
/// <summary>
/// Lets SgmlReader download the page at <paramref name="url"/> itself and returns
/// the page converted to well-formed XML.
/// </summary>
/// <param name="baseUri">Accepted but not used; the base URI is always Server.MapPath(".").</param>
/// <param name="url">Address of the HTML page to load.</param>
/// <param name="log">Accepted but not used by this method.</param>
/// <param name="upper">When true, names are folded to upper case (CaseFolding.ToUpper).</param>
/// <param name="formatted">When true, the XML is indented and whitespace nodes from the source are dropped.</param>
/// <returns>
/// The XML text on success. On failure the full exception text (e.ToString()) is
/// returned instead, so callers cannot tell the two apart by type.
/// </returns>
string SgmlReaderTest(Uri baseUri, string url, TextWriter log, bool upper, bool formatted)
{
    SgmlReader r = null;
    XmlTextWriter w = null;
    try
    {
        r = new SgmlReader();
        r.SetBaseUri(Server.MapPath("."));
        r.DocType = "HTML";
        r.Href = url;
        if (upper)
        {
            r.CaseFolding = CaseFolding.ToUpper;
        }

        StringWriter sw = new StringWriter();
        w = new XmlTextWriter(sw);
        if (formatted)
        {
            w.Formatting = Formatting.Indented;
            r.WhitespaceHandling = WhitespaceHandling.None;
        }

        // Copy every node from the reader to the writer.
        r.Read();
        while (!r.EOF)
        {
            w.WriteNode(r, true);
        }

        // Close the writer before reading the text back so the output is complete.
        w.Close();
        w = null;
        return sw.ToString();
    }
    catch (Exception e)
    {
        return e.ToString();
    }
    finally
    {
        // The reader holds the stream opened for url; release it (and the writer,
        // if the copy failed midway) on every path.
        if (w != null)
        {
            w.Close();
        }
        if (r != null)
        {
            r.Close();
        }
    }
}