获取网页源代码

公告

一个小需求：获取远程页面的源码，主要用于抓取数据。原来用得好好的，最近突然无法获取页面源码了，但用浏览器仍然可以正常浏览。（文后附源码下载。^_^）

　　经过分析，原来用的代码如下：

view plain copy to clipboard print ?

// Original approach: request Url and decode the body with the supplied encoding.
// No User-Agent / Accept headers are sent here; per the article that is why some
// servers stopped returning content to this version.
StreamReader sreader = null;
HttpWebResponse httpWebResponse = null;
string result = string.Empty;
try
{
    HttpWebRequest httpWebRequest = (HttpWebRequest)WebRequest.Create(Url);
    // httpWebRequest.Timeout = 20;
    httpWebRequest.KeepAlive = false;
    httpWebResponse = (HttpWebResponse)httpWebRequest.GetResponse();
    if (httpWebResponse.StatusCode == HttpStatusCode.OK)
    {
        sreader = new StreamReader(httpWebResponse.GetResponseStream(), encoding);
        result = sreader.ReadToEnd();
    }
    // Any status other than 200 OK yields the empty string.
    return result;
}
catch (WebException) { return null; }
finally
{
    // Release the reader and the response on every path, including failures.
    if (sreader != null) { sreader.Close(); }
    if (httpWebResponse != null) { httpWebResponse.Close(); }
}

查了下资料，原来需要加参数。

view plain copy to clipboard print ?

#region Key parameters; without them the server returns no content
// UserAgent takes only the header value; HttpWebRequest supplies the "User-Agent" header name itself.
httpWebRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)";
httpWebRequest.Accept = "*/*";
httpWebRequest.KeepAlive = true;
httpWebRequest.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
#endregion

修正后的代码如下：

view plain copy to clipboard print ?

#region Read page content
/// <summary>
/// Downloads the document at <paramref name="Url"/> and returns its body decoded with
/// <paramref name="encoding"/>.
/// </summary>
/// <param name="Url">Address to read; "http://" is prepended when it has no http/https scheme.</param>
/// <param name="encoding">Encoding used to decode the response body.</param>
/// <returns>
/// The response text; an empty string when the status is not 200 OK; or null when
/// <paramref name="Url"/> is null, empty or "about:blank", or when every attempt
/// ended in a <see cref="WebException"/>.
/// </returns>
public static string GetStringByUrl(string Url, System.Text.Encoding encoding)
{
    // Upper bound on attempts. Every WebException counts against it, so errors other
    // than ConnectFailure can no longer retry forever.
    const int MaxAttempts = 5;

    if (string.IsNullOrEmpty(Url) || string.Equals(Url, "about:blank", StringComparison.OrdinalIgnoreCase))
    {
        return null;
    }
    if (!Url.StartsWith("http://", StringComparison.OrdinalIgnoreCase) &&
        !Url.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
    {
        Url = "http://" + Url;
    }

    for (int attempt = 0; attempt < MaxAttempts; attempt++)
    {
        try
        {
            HttpWebRequest httpWebRequest = (HttpWebRequest)WebRequest.Create(Url);
            // httpWebRequest.Timeout = 20;

            #region Key parameters; without them the server returns no content
            // UserAgent takes only the header value, not the "User-Agent:" header name.
            httpWebRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)";
            httpWebRequest.Accept = "*/*";
            httpWebRequest.KeepAlive = true;
            httpWebRequest.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
            #endregion

            // using-blocks release the response and reader on every path, including exceptions.
            using (HttpWebResponse httpWebResponse = (HttpWebResponse)httpWebRequest.GetResponse())
            {
                if (httpWebResponse.StatusCode != HttpStatusCode.OK)
                {
                    return string.Empty;
                }
                using (StreamReader sreader = new StreamReader(httpWebResponse.GetResponseStream(), encoding))
                {
                    // One linear read instead of appending 256-char chunks to a string.
                    return sreader.ReadToEnd();
                }
            }
        }
        catch (WebException e)
        {
            // Only a failed connection warrants re-dialling; other errors are simply retried.
            if (e.Status == WebExceptionStatus.ConnectFailure) { ReDial(); }
        }
    }
    return null;
}
#endregion
/// <summary>
/// Hook called by GetStringByUrl after a connection failure. Currently a no-op:
/// the dial-up reconnect loop below is disabled.
/// </summary>
public static void ReDial()
{
    // Disabled reconnect loop, kept for reference (RASDisplay is not part of this listing):
    // int res = 1;
    // while (res != 0)
    // {
    //     CSDNWebTest.RASDisplay ras = new RASDisplay();
    //     ras.Disconnect();
    //     res = ras.Connect("asdl");
    //     System.Threading.Thread.Sleep(TimeSpan.FromSeconds(10));
    // }
}

#region Read page content
/// <summary>
/// Downloads the document at <paramref name="Url"/> and returns its body decoded with
/// <paramref name="encoding"/>.
/// </summary>
/// <param name="Url">Address to read; "http://" is prepended when it has no http/https scheme.</param>
/// <param name="encoding">Encoding used to decode the response body.</param>
/// <returns>
/// The response text; an empty string when the status is not 200 OK; or null when
/// <paramref name="Url"/> is null, empty or "about:blank", or when every attempt
/// ended in a <see cref="WebException"/>.
/// </returns>
public static string GetStringByUrl(string Url, System.Text.Encoding encoding)
{
    // Upper bound on attempts. Every WebException counts against it, so errors other
    // than ConnectFailure can no longer retry forever.
    const int MaxAttempts = 5;

    if (string.IsNullOrEmpty(Url) || string.Equals(Url, "about:blank", StringComparison.OrdinalIgnoreCase))
    {
        return null;
    }
    if (!Url.StartsWith("http://", StringComparison.OrdinalIgnoreCase) &&
        !Url.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
    {
        Url = "http://" + Url;
    }

    for (int attempt = 0; attempt < MaxAttempts; attempt++)
    {
        try
        {
            HttpWebRequest httpWebRequest = (HttpWebRequest)WebRequest.Create(Url);
            // httpWebRequest.Timeout = 20;

            #region Key parameters; without them the server returns no content
            // UserAgent takes only the header value, not the "User-Agent:" header name.
            httpWebRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)";
            httpWebRequest.Accept = "*/*";
            httpWebRequest.KeepAlive = true;
            httpWebRequest.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
            #endregion

            // using-blocks release the response and reader on every path, including exceptions.
            using (HttpWebResponse httpWebResponse = (HttpWebResponse)httpWebRequest.GetResponse())
            {
                if (httpWebResponse.StatusCode != HttpStatusCode.OK)
                {
                    return string.Empty;
                }
                using (StreamReader sreader = new StreamReader(httpWebResponse.GetResponseStream(), encoding))
                {
                    // One linear read instead of appending 256-char chunks to a string.
                    return sreader.ReadToEnd();
                }
            }
        }
        catch (WebException e)
        {
            // Only a failed connection warrants re-dialling; other errors are simply retried.
            if (e.Status == WebExceptionStatus.ConnectFailure) { ReDial(); }
        }
    }
    return null;
}
#endregion
         /// <summary>
         /// Hook called by GetStringByUrl after a connection failure. Currently a no-op:
         /// the dial-up reconnect loop below is disabled.
         /// </summary>
         public static void ReDial()
         {
             // Disabled reconnect loop, kept for reference (RASDisplay is not part of this listing):
             // int res = 1;
             // while (res != 0)
             // {
             //     CSDNWebTest.RASDisplay ras = new RASDisplay();
             //     ras.Disconnect();
             //     res = ras.Connect("asdl");
             //     System.Threading.Thread.Sleep(TimeSpan.FromSeconds(10));
             // }
         }

问题是解决了，后来再想了想，可以用WebClient先把页面download到本地临时文件，再读取文本内容。

代码如下：

view plain copy to clipboard print ?

/// <summary>
/// Fetches a page by downloading it to a temporary file with WebClient and reading
/// that file back as text using the system default ANSI encoding.
/// </summary>
/// <param name="url">Page address; "http://" is prepended when it has no http/https scheme.</param>
/// <returns>
/// The page text, or null when <paramref name="url"/> is null, empty or "about:blank",
/// or when the downloaded file cannot be read. Download errors (WebException) propagate.
/// </returns>
private string GetPageByWebClient(string url)
{
    if (string.IsNullOrEmpty(url) || string.Equals(url, "about:blank", StringComparison.OrdinalIgnoreCase))
    {
        return null;
    }
    if (!url.StartsWith("http://", StringComparison.OrdinalIgnoreCase) &&
        !url.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
    {
        url = "http://" + url;
    }

    // Use the user's temp directory; the root of C:\ is usually not writable.
    string tempFile = Path.Combine(Path.GetTempPath(), RandomKey(1111, 9999) + ".txt");
    try
    {
        // Download the page URL itself. DownloadOneFileByURLWithWebClient appends the
        // file name to the URL, which would request a non-existent address here.
        using (System.Net.WebClient wc = new System.Net.WebClient())
        {
            wc.DownloadFile(url, tempFile);
        }
        using (StreamReader sr = new StreamReader(tempFile, System.Text.Encoding.Default))
        {
            return sr.ReadToEnd();
        }
    }
    catch (IOException)
    {
        return null;
    }
    finally
    {
        // Best-effort cleanup: a leftover temp file must not mask the real outcome.
        try { File.Delete(tempFile); }
        catch (IOException) { }
        catch (UnauthorizedAccessException) { }
    }
}
/// <summary>
/// Builds a key from the current local time (yyyyMMdd-HHmmss-fff-) followed by a
/// random integer obtained from getRandomID.
/// </summary>
/// <param name="b">Inclusive lower bound passed to getRandomID.</param>
/// <param name="e">Upper bound passed to getRandomID.</param>
/// <returns>The timestamp prefix followed by the random number.</returns>
private string RandomKey(int b, int e)
{
    // Invariant culture keeps the digits and calendar stable regardless of OS locale.
    string stamp = DateTime.Now.ToString("yyyyMMdd-HHmmss-fff-", System.Globalization.CultureInfo.InvariantCulture);
    return stamp + this.getRandomID(b, e);
}
// One shared generator: creating a tick-seeded Random per call returns the same
// value for calls made within the same clock tick.
private static readonly Random s_random = new Random();

/// <summary>Returns a random integer in the range [minValue, maxValue).</summary>
/// <param name="minValue">Inclusive lower bound.</param>
/// <param name="maxValue">Exclusive upper bound; must not be less than minValue.</param>
/// <returns>A value k with minValue &lt;= k &lt; maxValue (minValue when both bounds are equal).</returns>
private int getRandomID(int minValue, int maxValue)
{
    // Random is not thread-safe, so access to the shared instance is serialized.
    lock (s_random)
    {
        return s_random.Next(minValue, maxValue);
    }
}
/// <summary>Gets a newly generated GUID rendered in its default string form.</summary>
private string GuidString
{
    get
    {
        Guid fresh = Guid.NewGuid();
        return fresh.ToString();
    }
}
/// <summary>
/// Downloads one file with WebClient. Intended for small files such as pictures.
/// </summary>
/// <param name="fileName">File name; appended to <paramref name="url"/> to form the remote address and used as the local file name.</param>
/// <param name="url">Remote base address that <paramref name="fileName"/> is appended to verbatim (include any trailing "/").</param>
/// <param name="localPath">Local directory to save into; created when it does not exist.</param>
public static void DownloadOneFileByURLWithWebClient(string fileName, string url, string localPath)
{
    // Path.Combine supplies the separator when localPath does not end with one.
    string target = Path.Combine(localPath, fileName);
    if (!Directory.Exists(localPath)) { Directory.CreateDirectory(localPath); }
    if (File.Exists(target)) { File.Delete(target); }

    // WebClient is disposable; release it even when the download throws.
    using (System.Net.WebClient wc = new System.Net.WebClient())
    {
        wc.DownloadFile(url + fileName, target);
    }
}

/// <summary>
/// Fetches a page by downloading it to a temporary file with WebClient and reading
/// that file back as text using the system default ANSI encoding.
/// </summary>
/// <param name="url">Page address; "http://" is prepended when it has no http/https scheme.</param>
/// <returns>
/// The page text, or null when <paramref name="url"/> is null, empty or "about:blank",
/// or when the downloaded file cannot be read. Download errors (WebException) propagate.
/// </returns>
private string GetPageByWebClient(string url)
{
    if (string.IsNullOrEmpty(url) || string.Equals(url, "about:blank", StringComparison.OrdinalIgnoreCase))
    {
        return null;
    }
    if (!url.StartsWith("http://", StringComparison.OrdinalIgnoreCase) &&
        !url.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
    {
        url = "http://" + url;
    }

    // Use the user's temp directory; the root of C:\ is usually not writable.
    string tempFile = Path.Combine(Path.GetTempPath(), RandomKey(1111, 9999) + ".txt");
    try
    {
        // Download the page URL itself. DownloadOneFileByURLWithWebClient appends the
        // file name to the URL, which would request a non-existent address here.
        using (System.Net.WebClient wc = new System.Net.WebClient())
        {
            wc.DownloadFile(url, tempFile);
        }
        using (StreamReader sr = new StreamReader(tempFile, System.Text.Encoding.Default))
        {
            return sr.ReadToEnd();
        }
    }
    catch (IOException)
    {
        return null;
    }
    finally
    {
        // Best-effort cleanup: a leftover temp file must not mask the real outcome.
        try { File.Delete(tempFile); }
        catch (IOException) { }
        catch (UnauthorizedAccessException) { }
    }
}
         /// <summary>
         /// Builds a key from the current local time (yyyyMMdd-HHmmss-fff-) followed by a
         /// random integer obtained from getRandomID.
         /// </summary>
         /// <param name="b">Inclusive lower bound passed to getRandomID.</param>
         /// <param name="e">Upper bound passed to getRandomID.</param>
         /// <returns>The timestamp prefix followed by the random number.</returns>
         private string RandomKey(int b, int e)
         {
             // Invariant culture keeps the digits and calendar stable regardless of OS locale.
             string stamp = DateTime.Now.ToString("yyyyMMdd-HHmmss-fff-", System.Globalization.CultureInfo.InvariantCulture);
             return stamp + this.getRandomID(b, e);
         }
         // One shared generator: creating a tick-seeded Random per call returns the same
         // value for calls made within the same clock tick.
         private static readonly Random s_random = new Random();

         /// <summary>Returns a random integer in the range [minValue, maxValue).</summary>
         /// <param name="minValue">Inclusive lower bound.</param>
         /// <param name="maxValue">Exclusive upper bound; must not be less than minValue.</param>
         /// <returns>A value k with minValue &lt;= k &lt; maxValue (minValue when both bounds are equal).</returns>
         private int getRandomID(int minValue, int maxValue)
         {
             // Random is not thread-safe, so access to the shared instance is serialized.
             lock (s_random)
             {
                 return s_random.Next(minValue, maxValue);
             }
         }
         /// <summary>Gets a newly generated GUID rendered in its default string form.</summary>
         private string GuidString
         {
             get
             {
                 Guid fresh = Guid.NewGuid();
                 return fresh.ToString();
             }
         }
　　　　 /**/ /// Web Client Method ,only For Small picture
         ///   </summary>
         ///   <param name="fileName"></param>
         ///   <param name="url"></param>
         ///   <param name="localPath"></param>
         public   static   void  DownloadOneFileByURLWithWebClient( string  fileName,  string  url,  string  localPath)
         {
            System.Net.WebClient wc  =   new  System.Net.WebClient();
             if  (File.Exists(localPath  +  fileName))  { File.Delete(localPath  +  fileName); }
             if  (Directory.Exists(localPath)  ==   false )  { Directory.CreateDirectory(localPath); }
            wc.DownloadFile(url  +  fileName, localPath  +  fileName);
        }

结果不能获取源码。错误如下：邀月工作室

再想想，还有 WebBrowser 控件可以用啊。在 WinForm 下，只要在程序入口方法（Main）前加上 [STAThread] 即可。

view plain copy to clipboard print ?

/// <summary>
/// Loads the address typed into txtUrl in a hidden WebBrowser control and shows the
/// resulting HTML in lbResult. Does nothing when the address is empty, "about:blank"
/// or not a valid URI.
/// </summary>
/// <remarks>
/// Must run on an STA thread. [STAThread] only takes effect on the program entry point
/// (Main), so it is not repeated here.
/// </remarks>
public void GetURLContentByWebBrowser()
{
    string url = txtUrl.Text.Trim();
    if (String.IsNullOrEmpty(url)) return;
    if (string.Equals(url, "about:blank", StringComparison.OrdinalIgnoreCase)) return;
    if (!url.StartsWith("http://", StringComparison.OrdinalIgnoreCase) &&
        !url.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
    {
        url = "http://" + url;
    }

    // Same outcome as swallowing UriFormatException: silently give up on a malformed address.
    Uri target;
    if (!Uri.TryCreate(url, UriKind.Absolute, out target)) return;

    // Other exceptions propagate untouched, so callers see the original stack trace.
    using (WebBrowser wb = new WebBrowser())
    {
        wb.Navigate(target);
        // Navigate is asynchronous: pump messages until the document has loaded,
        // otherwise DocumentText is read before any content has arrived.
        while (wb.ReadyState != WebBrowserReadyState.Complete)
        {
            System.Windows.Forms.Application.DoEvents();
        }
        lbResult.Text = wb.DocumentText;
    }
}

/// <summary>
/// Loads the address typed into txtUrl in a hidden WebBrowser control and shows the
/// resulting HTML in lbResult. Does nothing when the address is empty, "about:blank"
/// or not a valid URI.
/// </summary>
/// <remarks>
/// Must run on an STA thread. [STAThread] only takes effect on the program entry point
/// (Main), so it is not repeated here.
/// </remarks>
public void GetURLContentByWebBrowser()
{
    string url = txtUrl.Text.Trim();
    if (String.IsNullOrEmpty(url)) return;
    if (string.Equals(url, "about:blank", StringComparison.OrdinalIgnoreCase)) return;
    if (!url.StartsWith("http://", StringComparison.OrdinalIgnoreCase) &&
        !url.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
    {
        url = "http://" + url;
    }

    // Same outcome as swallowing UriFormatException: silently give up on a malformed address.
    Uri target;
    if (!Uri.TryCreate(url, UriKind.Absolute, out target)) return;

    // Other exceptions propagate untouched, so callers see the original stack trace.
    using (WebBrowser wb = new WebBrowser())
    {
        wb.Navigate(target);
        // Navigate is asynchronous: pump messages until the document has loaded,
        // otherwise DocumentText is read before any content has arrived.
        while (wb.ReadyState != WebBrowserReadyState.Complete)
        {
            System.Windows.Forms.Application.DoEvents();
        }
        lbResult.Text = wb.DocumentText;
    }
}

在 WebForm 下就麻烦些了，运行时会出现错误：“当前线程不在单线程单元中，因此无法实例化 ActiveX 控件‘8856f961-340a-11d0-a96b-00c04fd705a2’”。

邀月工作室

代码如下：

view plain copy to clipboard print ?

/// <summary>
/// Loads <paramref name="url"/> in an off-screen WebBrowser control, waits for the
/// document to finish loading, and returns its HTML decoded with the encoding the
/// document reports.
/// </summary>
/// <param name="url">Page address; "http://" is prepended when it has no http/https scheme.</param>
/// <returns>
/// The page HTML, or null when <paramref name="url"/> is null, empty or "about:blank",
/// or when the document stream cannot be read or decoded.
/// </returns>
private string GetPageStringbyWebBrowser(string url)
{
    if (string.IsNullOrEmpty(url) || string.Equals(url, "about:blank", StringComparison.OrdinalIgnoreCase))
    {
        return null;
    }
    if (!url.StartsWith("http://", StringComparison.OrdinalIgnoreCase) &&
        !url.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
    {
        url = "http://" + url;
    }

    // using disposes the control on every path, including when Navigate or the wait loop throws.
    using (WebBrowser myWB = new WebBrowser())
    {
        myWB.ScrollBarsEnabled = false;
        myWB.Navigate(url);
        // Navigate is asynchronous; pump the message loop until loading completes.
        while (myWB.ReadyState != WebBrowserReadyState.Complete)
        {
            System.Windows.Forms.Application.DoEvents();
        }

        try
        {
            System.Text.Encoding pageEncoding = System.Text.Encoding.GetEncoding(myWB.Document.Encoding);
            using (System.IO.StreamReader getReader = new System.IO.StreamReader(myWB.DocumentStream, pageEncoding))
            {
                return getReader.ReadToEnd();
            }
        }
        catch (Exception)
        {
            // Deliberately best-effort, as before: any failure to read or decode yields null.
            return null;
        }
    }
}

/// <summary>
/// Loads <paramref name="url"/> in an off-screen WebBrowser control, waits for the
/// document to finish loading, and returns its HTML decoded with the encoding the
/// document reports.
/// </summary>
/// <param name="url">Page address; "http://" is prepended when it has no http/https scheme.</param>
/// <returns>
/// The page HTML, or null when <paramref name="url"/> is null, empty or "about:blank",
/// or when the document stream cannot be read or decoded.
/// </returns>
private string GetPageStringbyWebBrowser(string url)
{
    if (string.IsNullOrEmpty(url) || string.Equals(url, "about:blank", StringComparison.OrdinalIgnoreCase))
    {
        return null;
    }
    if (!url.StartsWith("http://", StringComparison.OrdinalIgnoreCase) &&
        !url.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
    {
        url = "http://" + url;
    }

    // using disposes the control on every path, including when Navigate or the wait loop throws.
    using (WebBrowser myWB = new WebBrowser())
    {
        myWB.ScrollBarsEnabled = false;
        myWB.Navigate(url);
        // Navigate is asynchronous; pump the message loop until loading completes.
        while (myWB.ReadyState != WebBrowserReadyState.Complete)
        {
            System.Windows.Forms.Application.DoEvents();
        }

        try
        {
            System.Text.Encoding pageEncoding = System.Text.Encoding.GetEncoding(myWB.Document.Encoding);
            using (System.IO.StreamReader getReader = new System.IO.StreamReader(myWB.DocumentStream, pageEncoding))
            {
                return getReader.ReadToEnd();
            }
        }
        catch (Exception)
        {
            // Deliberately best-effort, as before: any failure to read or decode yields null.
            return null;
        }
    }
}

搜索了 N 小时（N>=5）之后，终于找到可行的解决方案：在 WebForm 页面头部的 @ Page 指令中加入 AspCompat="true"，

即 <%@ Page Language="C#" AspCompat="true" ……（其余属性省略）…… %>

MSDN给出的解释是：
在 ASP .NET 网页的 <%@Page> 标记中包含兼容性属性 aspcompat=true，如 <%@Page aspcompat=true Language=VB%>。使用此属性将强制网页以 STA 模式执行，从而确保您的组件可以继续正确运行。如果试图使用 STA 组件但没有指定此标记，运行时将会发生异常情况。

将此属性的值设置为 true 时，将允许网页调用 COM+ 1.0 组件，该组件需要访问非管理的 ASP 内置对象。可以通过 ObjectContext 对象进行访问。

如果将此标记的值设为 true，性能会稍微有些下降。建议只在确实需要时才这样做。

终于可以了！　不知道有没有更好的方法？？

邀月工作室

附：源码下载。

邀月注：

如果无法测试，请先确认是否处于域（AD）环境下；如果是，请注意正确设置代理和防火墙。

posted on 2009-08-18 15:40 夏雷阅读(1147) 评论(0) 编辑收藏举报

刷新页面返回顶部