本文是参照摩诘的Blog
今天遇到这样一个问题,从政府网站中,根据一个关键数据KeyData,提取相关数据。
这个问题可分为三部分解决:
1)取得政府网站交互的方法;
2)按照合适的方法,用HttpWebResponse,取得相关数据
3)分析取回来的数据

第一部分:获取网站交互信息,采用工具ieHTTPHeadersSetup.exe
得到的数据如下:
GET /search.asp?key=2006002995&ys_type=hy&imageField2.x=32&imageField2.y=20 HTTP/1.1
Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*
Accept-Language: zh-cn
Accept-Encoding: gzip, deflate
User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)
Host: www.suzhou-logistics.com
Connection: Keep-Alive

可以看出,
url: http://www.suzhou-logistics.com/search.asp?
Data:key=2006002995&ys_type=hy&imageField2.x=32&imageField2.y=20

也可以直接作为url:http://www.suzhou-logistics.com/search.asp?key=2006002995&ys_type=hy&imageField2.x=32&imageField2.y=20

第二部分:根据第一部分的分析,通过HttpWebResponse取HTML
在此就给出一个通用的函数

/// <summary>
/// Sends <paramref name="postData"/> to <paramref name="url"/> as a form-encoded
/// HTTP POST and returns the response body as text.
/// </summary>
/// <param name="url">Absolute URL of the page to request.</param>
/// <param name="postData">
/// Form data to send as the request body (e.g. "key=1&amp;type=x").
/// <c>null</c> is treated as an empty body.
/// </param>
/// <param name="encodeType">
/// Name of the text encoding (e.g. "GB18030") used both to encode the request
/// body and to decode the response.
/// </param>
/// <param name="err">
/// Receives <c>string.Empty</c> on success, otherwise the message of the
/// exception that made the request fail.
/// </param>
/// <returns>The response body, or <c>string.Empty</c> when the request failed.</returns>
/// <remarks>
/// The request is always sent with the POST method, even when
/// <paramref name="postData"/> is empty. Failures (including an unknown encoding
/// name or a malformed URL) are reported through <paramref name="err"/> rather
/// than thrown.
/// </remarks>
public static string GetPage(string url, string postData, string encodeType, out string err)
{
    err = string.Empty;

    HttpWebResponse response = null;
    StreamReader reader = null;

    try
    {
        // Resolved inside the try block so that a bad encoding name is reported via err.
        Encoding encoding = Encoding.GetEncoding(encodeType);
        byte[] data = encoding.GetBytes(postData == null ? string.Empty : postData);

        // Configure the request.
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.CookieContainer = new CookieContainer();
        request.AllowAutoRedirect = true;
        request.Method = "POST";
        request.ContentType = "application/x-www-form-urlencoded";
        request.ContentLength = data.Length;

        // Write the body; the request stream is closed even if the write fails.
        Stream outstream = request.GetRequestStream();
        try
        {
            outstream.Write(data, 0, data.Length);
        }
        finally
        {
            outstream.Close();
        }

        // Obtain the response and read the returned page (HTML) in full.
        response = (HttpWebResponse)request.GetResponse();
        reader = new StreamReader(response.GetResponseStream(), encoding);
        return reader.ReadToEnd();
    }
    catch (Exception ex)
    {
        err = ex.Message;
        return string.Empty;
    }
    finally
    {
        // Release the reader and the response on every path.
        if (reader != null)
        {
            reader.Close();
        }
        if (response != null)
        {
            response.Close();
        }
    }
}
 



第三部分:分析Html数据,有两个开源软件
SgmlReader与HtmlAgilityPack20,由于本人机器上只有vs2003,无法使用vs2005版本HtmlAgilityPack20。所以下面用SgmlReader来分析。SgmlReader可以将Html解析成格式完整的类似XML数据,可以采用Xpath进行查询,获取我们想要的数据。
取得完整的xml数据后的分析,根据post页面数据格式的不同而有区别。我取的这个页面,主要用了两个DataTable,一个保存一行基本数据,另一个保存多行的状态数据。

/// <summary>
/// Converts an HTML page to well-formed XML with SgmlReader and extracts two
/// tables from it using XPath queries.
/// </summary>
/// <param name="pageContent">The raw HTML text of the page.</param>
/// <param name="xclpath">
/// XPath selecting the header cells that become column names. When it is empty
/// or whitespace, the (empty) data set is returned without any tables.
/// </param>
/// <param name="xrpath">XPath selecting the cells that hold the values.</param>
/// <param name="err">
/// Receives <c>string.Empty</c> on success, otherwise the message of the
/// exception that interrupted parsing.
/// </param>
/// <returns>
/// A data set containing "QueryResult1" (a single row of basic data) and
/// "QueryResult2" (status rows). If parsing fails, the data set is returned in
/// whatever state it had reached.
/// </returns>
public static DataSet ParsePage(string pageContent, string xclpath, string xrpath, out string err)
{
    // At most this many leading header cells become columns of the first table.
    const int MaxBaseColumns = 17;
    // The last header cells (this many) become columns of the second table,
    // and the status values are laid out in rows of this width.
    const int StatusColumnCount = 5;

    err = string.Empty;
    DataSet ds = new DataSet();
    DataTable table = new DataTable("QueryResult1");
    DataTable table1 = new DataTable("QueryResult2");

    StringWriter strWriter = null;
    SgmlReader sgmlReader = null;
    XmlTextWriter xmlWriter = null;

    try
    {
        // Step 1: turn the HTML into well-formed XML text.
        sgmlReader = new SgmlReader();
        sgmlReader.DocType = "HTML";
        sgmlReader.InputStream = new StringReader(pageContent);
        strWriter = new StringWriter();
        xmlWriter = new XmlTextWriter(strWriter);
        xmlWriter.Formatting = Formatting.Indented;

        sgmlReader.Read();
        while (!sgmlReader.EOF)
        {
            xmlWriter.WriteNode(sgmlReader, true);
        }
        xmlWriter.Flush();

        string wellFormedHTML = strWriter.ToString();

        if (xclpath.Trim().Length == 0)
        {
            return ds;
        }

        XPathDocument doc = new XPathDocument(new StringReader(wellFormedHTML));
        XPathNavigator nav = doc.CreateNavigator();

        // Step 2: build the columns of both tables from the header cells.
        XPathNodeIterator nodes = nav.Select(xclpath);
        int headerCount = nodes.Count;
        int i = 0;
        while (nodes.MoveNext())
        {
            string sNodeText = nodes.Current.Value;
            if (i < headerCount - StatusColumnCount)
            {
                if (i < MaxBaseColumns)
                {
                    // Duplicate header texts are made unique by appending the index.
                    if (table.Columns.Contains(sNodeText))
                    {
                        sNodeText = sNodeText + i.ToString();
                    }
                    table.Columns.Add(sNodeText, typeof(string));
                }
            }
            else
            {
                if (table1.Columns.Contains(sNodeText))
                {
                    sNodeText = sNodeText + i.ToString();
                }
                table1.Columns.Add(sNodeText, typeof(string));
            }
            i++;
        }

        ds.Tables.Add(table);
        ds.Tables.Add(table1);

        // Step 3: distribute the cell values over the two tables.
        bool bNext = false;
        nodes = nav.Select(xrpath);

        DataRow row = table.NewRow();
        table.Rows.Add(row);

        DataRow row1 = null;
        int j = 0;
        int k = 0;
        while (nodes.MoveNext())
        {
            string nodetext = nodes.Current.Value;

            // Cells whose text equals a column name are header cells, not data.
            if (table.Columns.Contains(nodetext) || table1.Columns.Contains(nodetext))
            {
                continue;
            }

            // The first occurrence of this marker text switches to the status
            // table; the marker itself is stored as the first status value.
            if (!bNext && nodetext == "正在预录入")
            {
                bNext = true;
            }

            if (!bNext)
            {
                if (j < MaxBaseColumns)
                {
                    row[j] = nodetext;
                    j++;
                }
            }
            else
            {
                // Start a new status row every StatusColumnCount values.
                if (k == 0)
                {
                    row1 = table1.NewRow();
                    table1.Rows.Add(row1);
                }

                row1[k] = nodetext;
                k = (k + 1) % StatusColumnCount;
            }
        }

        return ds;
    }
    catch (Exception exp)
    {
        err = exp.Message;
        return ds;
    }
    finally
    {
        // Release the writer and the reader on every path.
        if (xmlWriter != null)
        {
            xmlWriter.Close();
        }
        if (sgmlReader != null)
        {
            sgmlReader.Close();
        }
    }
}

有了上面的代码就可以采用如下方法调用了

/// <summary>
/// Click handler: downloads the query page, parses it, and binds the two
/// resulting tables to DataGrid1 and DataGrid2. Nothing is bound when the
/// download or the parsing reports an error.
/// </summary>
private void Button1_Click(object sender, System.EventArgs e)
{
    const string pageUrl = @"http://www.suzhou-logistics.com/search.asp?key=2006002995&ys_type=hy&imageField2.x=32&imageField2.y=20";
    const string headerXPath = @"/html/table/tr/td/table/tr/td/font/strong";
    const string cellXPath = @"/html/table/tr/td/table/tr/td";

    string error = string.Empty;

    // Fetch the page; stop if the download reported an error.
    string html = WebForm1.GetPage(pageUrl, string.Empty, "GB18030", out error);
    if (error != string.Empty)
    {
        return;
    }

    // Parse the page; both tables must be present to continue.
    DataSet result = WebForm1.ParsePage(html, headerXPath, cellXPath, out error);
    if (error != string.Empty || result.Tables.Count != 2)
    {
        return;
    }

    // Bind each grid only when its table actually has rows.
    DataTable baseTable = result.Tables[0];
    if (baseTable.Rows.Count > 0)
    {
        DataGrid1.DataSource = baseTable;
        DataGrid1.DataBind();
    }

    DataTable statusTable = result.Tables[1];
    if (statusTable.Rows.Count > 0)
    {
        DataGrid2.DataSource = statusTable;
        DataGrid2.DataBind();
    }
}




 

其实SgmlReader可以直接完成从URL抓取数据的功能,即将第二部分与第三部分合并。

/// <summary>
/// Lets SgmlReader load the HTML document at <paramref name="url"/> directly
/// and returns it converted to XML text.
/// </summary>
/// <param name="baseUri">Not used by this method.</param>
/// <param name="url">Location of the HTML document; assigned to the reader's Href.</param>
/// <param name="log">Not used by this method.</param>
/// <param name="upper">When true, the reader folds names to upper case.</param>
/// <param name="formatted">
/// When true, the output is indented and the reader's whitespace handling is
/// set to <c>WhitespaceHandling.None</c>.
/// </param>
/// <returns>
/// The converted XML text, or the full text of the exception
/// (<c>Exception.ToString()</c>) when the conversion fails.
/// </returns>
string SgmlReaderTest(Uri baseUri, string url, TextWriter log, bool upper, bool formatted)
{
    SgmlReader r = null;
    XmlTextWriter w = null;

    try
    {
        r = new SgmlReader();
        r.SetBaseUri(Server.MapPath("."));
        r.DocType = "HTML";
        r.Href = url;
        if (upper)
        {
            r.CaseFolding = CaseFolding.ToUpper;
        }

        StringWriter sw = new StringWriter();
        w = new XmlTextWriter(sw);

        if (formatted)
        {
            w.Formatting = Formatting.Indented;
            r.WhitespaceHandling = WhitespaceHandling.None;
        }

        // Copy every node from the reader to the XML writer.
        r.Read();
        while (!r.EOF)
        {
            w.WriteNode(r, true);
        }

        w.Flush();
        return sw.ToString();
    }
    catch (Exception e)
    {
        return e.ToString();
    }
    finally
    {
        // Close the writer and the reader on every path, including failures.
        if (w != null)
        {
            w.Close();
        }
        if (r != null)
        {
            r.Close();
        }
    }
}
Posted on 2009-05-24 15:52  hesen  阅读(847)  评论(0)  编辑  收藏  举报