C# 网页信息采集 核心代码收集

1. 通过 HttpWebResponse 来获取

  /// <summary>
  /// Requests <paramref name="url"/> and returns at most the first 4000
  /// characters of the response body.
  /// </summary>
  /// <param name="url">Address to request; it is also sent as the Referer header.</param>
  /// <returns>
  /// The decoded start of the response body. The string is exactly as long as
  /// the number of characters read, so short pages are not padded with '\0'.
  /// </returns>
  /// <exception cref="ApplicationException">
  /// Thrown when the request cannot be created or sent ("HTTP 403 ..." message)
  /// or when the body cannot be read ("HTTP 404 ..." message). The messages are
  /// fixed labels kept for existing callers; the real cause is in InnerException.
  /// </exception>
  public static string CheckTeamSiteUrl(string url)
  {
      const int MaxChars = 4000;
      HttpWebResponse httpResponse = null;

      // Phase 1: build and send the request.
      try
      {
          HttpWebRequest httpRequest = (HttpWebRequest)WebRequest.Create(url);
          httpRequest.Headers.Set("Pragma", "no-cache");
          httpRequest.CookieContainer = new CookieContainer();
          httpRequest.Referer = url;
          httpRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727)";
          httpRequest.Credentials = System.Net.CredentialCache.DefaultCredentials;
          httpResponse = (HttpWebResponse)httpRequest.GetResponse();
      }
      catch (Exception ex)
      {
          throw new ApplicationException("HTTP 403 Access denied, URL: " + url, ex);
      }

      // Phase 2: decode the beginning of the body. The using blocks release the
      // connection and the reader even when reading fails.
      try
      {
          using (httpResponse)
          {
              // Crude charset sniffing: any Content-Type mentioning "utf" is read
              // as UTF-8, everything else with the system default code page.
              string contentType = httpResponse.ContentType ?? string.Empty;
              System.Text.Encoding bodyEncoding =
                  contentType.IndexOf("utf", StringComparison.OrdinalIgnoreCase) != -1
                      ? System.Text.Encoding.UTF8
                      : System.Text.Encoding.Default;

              using (StreamReader reader = new StreamReader(httpResponse.GetResponseStream(), bodyEncoding))
              {
                  char[] buffer = new char[MaxChars];
                  // ReadBlock reports how many characters were actually filled;
                  // only that prefix of the buffer belongs in the result.
                  int charsRead = reader.ReadBlock(buffer, 0, MaxChars);
                  return new string(buffer, 0, charsRead);
              }
          }
      }
      catch (Exception ex)
      {
          throw new ApplicationException("HTTP 404 Page not found, URL: " + url, ex);
      }
  }

 

2. 通过 WebResponse 来获取

  /// <summary>
  /// Downloads the whole response body of <paramref name="url"/> as text,
  /// using a 30 second request timeout.
  /// </summary>
  /// <param name="url">Address to download.</param>
  /// <returns>The complete decoded response body.</returns>
  /// <exception cref="Exception">
  /// Thrown on any failure. The exception type is unchanged for existing
  /// callers, but it now carries a message and the original error as
  /// InnerException instead of discarding them.
  /// </exception>
  public static string getPage(String url)
  {
      try
      {
          WebRequest req = WebRequest.Create(url);
          req.Timeout = 30000;

          // The using blocks close the response and the reader on every path.
          using (WebResponse result = req.GetResponse())
          {
              // Crude charset sniffing: any Content-Type mentioning "utf" is read
              // as UTF-8, everything else with the system default code page.
              string contentType = result.ContentType ?? string.Empty;
              System.Text.Encoding bodyEncoding =
                  contentType.IndexOf("utf", StringComparison.OrdinalIgnoreCase) != -1
                      ? System.Text.Encoding.UTF8
                      : System.Text.Encoding.Default;

              using (StreamReader sr = new StreamReader(result.GetResponseStream(), bodyEncoding))
              {
                  string resultstring = sr.ReadToEnd();

                  // NOTE(review): `js` is declared outside this snippet; this call
                  // looks like a debugging leftover - confirm before removing it.
                  js.alert(resultstring);

                  return resultstring;
              }
          }
      }
      catch (Exception ex)
      {
          throw new Exception("Failed to download page, URL: " + url, ex);
      }
  }

 

3. 通过 WebClient 来获取

  /// <summary>
  /// Downloads the resource at the instance's <c>url</c> and returns at most
  /// <paramref name="length"/> characters from the start of it.
  /// </summary>
  /// <param name="length">Maximum number of characters to return.</param>
  /// <returns>
  /// The decoded start of the response, exactly as long as the number of
  /// characters read (no '\0' padding). On any failure the exception message
  /// is returned instead of the content; that contract is kept for existing
  /// callers, who cannot tell the two cases apart by type.
  /// </returns>
  public string get(int length)
  {
      try
      {
          // Refreshes this.encoding from the configured encoding name.
          getEncodeing();

          // NOTE(review): `url` and `encoding` are members declared outside this
          // snippet; presumably set before this method is called - confirm.
          using (WebClient wb = new WebClient())
          using (Stream response = wb.OpenRead(url))
          using (StreamReader reader = new StreamReader(response, this.encoding, true, 256000))
          {
              char[] buffer = new char[length];
              // ReadBlock keeps reading until `length` characters are filled or the
              // stream ends, and reports how many characters were stored.
              int charsRead = reader.ReadBlock(buffer, 0, length);
              return new string(buffer, 0, charsRead);
          }
      }
      catch (Exception e)
      {
          return e.Message;
      }
  }
  /// <summary>
  /// Translates the encoding name held in <c>encode</c> into an
  /// <see cref="Encoding"/> instance and stores it in <c>encoding</c>.
  /// "UTF-8", "GB2312" and "ASCII" are matched exactly (case-sensitive);
  /// any other name is passed to <see cref="Encoding.GetEncoding(string)"/>.
  /// </summary>
  private void getEncodeing()
  {
      string name = this.encode;

      if (name == "UTF-8")
      {
          encoding = Encoding.UTF8;
      }
      else if (name == "GB2312")
      {
          encoding = Encoding.GetEncoding("GB2312");
      }
      else if (name == "ASCII")
      {
          encoding = Encoding.ASCII;
      }
      else
      {
          // Unrecognised names are resolved by the framework lookup.
          encoding = Encoding.GetEncoding(name);
      }
  }

该文转自http://blog.sina.com.cn/s/blog_758087190100rfri.html

posted @ 2012-11-28 19:28  陀螺-  阅读(335)  评论(0)  编辑  收藏  举报