利用WebRequest登录网站,抓取信息
以前也做过类似于新闻小偷之类的东西,就是利用WebRequest从请求的URL获取信息后进行分析,然后显示。记得当时老板让做一个搜索时抓取程序,想把这边几个BBS的信息都能纳入搜索范围。因为有些BBS的搜索功能必须登录后才能使用,所以当时我想破了脑子也没办法,最后认为这个任务根本没法做,就没往下思考。今天在CSDN上看到一个帖子,才知道原来这并非不可实现的,其实早已有人这样做过。要做到这些,其要点有两个:
1、通过附加一个CookieContainer到HttpWebRequest对象中,可以得到登录后返回的代表SESSION ID的COOKIE。
2、将此COOKIE包含在一个CookieContainer中并附加到另一个HttpWebRequest请求中,则可以实现SESSION的还原。
using System;
using System.Collections;
using System.ComponentModel;
using System.Data;
//using System.Data.OleDb;
using System.Drawing;
using System.Web;
using System.Web.SessionState;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.HtmlControls;
using System.Net;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using Microsoft.Data.Odbc;

namespace PdfTest
{
    /// <summary>
    /// Web form that logs in to a remote site with an HTTP POST, keeps the
    /// cookie header returned by the login in application state, and then
    /// replays that cookie on a second POST to fetch a page. The fetched
    /// text is saved to <c>c:\save.htm</c> and written to the response.
    /// </summary>
    public class getHttpInfo : System.Web.UI.Page
    {
        /// <summary>
        /// Cookie header captured by <see cref="Login"/> and replayed by
        /// <see cref="getPage"/>. Static, so it is shared by every request
        /// handled in this application domain.
        /// </summary>
        protected static string cookieheader;

        /// <summary>Characters that separate the pieces of a parameter list and are sent unencoded.</summary>
        private static readonly char[] ReservedChars = { '?', '=', '&' };

        /// <summary>
        /// Reuses the cookie header cached in application state (logging in
        /// first when none is cached), fetches the target page, saves the
        /// result to disk and echoes it to the client.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void Page_Load(object sender, System.EventArgs e)
        {
            if (HttpContext.Current.Application["cookieheader"] != null)
            {
                cookieheader = (string) HttpContext.Current.Application["cookieheader"];
            }
            else
            {
                // Login into the website and keep the cookie for the session in the application variable.
                // The login page body itself is not needed here.
                Login("http://www.thesiteyouwanttovisit/theloginpage.asp", "Action=&USERID=&Password=");
            }

            string strResult = getPage("http://www.thesiteyouwanttovisit/theloginpage.asp", "Action=&data=");

            // Write the result to htm file.
            // FileMode.Create truncates an existing file; OpenOrCreate would leave
            // stale trailing bytes behind whenever the new page is shorter than the old one.
            using (FileStream htmFile = new FileStream(@"c:\save.htm", FileMode.Create))
            using (StreamWriter sw = new StreamWriter(htmFile))
            {
                sw.Write(strResult);
            }

            // output the result
            Response.Write(strResult);
        }

        /// <summary>
        /// POSTs <paramref name="paramList"/> to <paramref name="url"/>, records
        /// the cookie header the request's cookie container holds for that URL
        /// in <see cref="cookieheader"/> and (when non-empty) in application
        /// state under the key "cookieheader", and returns the response body.
        /// </summary>
        /// <param name="url">Address of the login page.</param>
        /// <param name="paramList">
        /// Form data such as "a=1&amp;b=2"; the text between '?', '=' and '&amp;'
        /// is URL-encoded. <c>null</c> sends an empty body.
        /// </param>
        /// <returns>
        /// The response body decoded as UTF-8, or the text of the exception
        /// (<c>Exception.ToString()</c>) if anything fails.
        /// </returns>
        public static string Login(String url, String paramList)
        {
            HttpWebResponse res = null;
            string strResult = "";
            try
            {
                HttpWebRequest req = (HttpWebRequest) WebRequest.Create(url);
                req.Method = "POST";
                req.ContentType = "application/x-www-form-urlencoded";
                // Redirects are not followed; presumably so that the cookies set by the
                // login response itself are the ones captured below.
                req.AllowAutoRedirect = false;
                req.CookieContainer = new CookieContainer();

                WritePostBody(req, paramList);

                res = (HttpWebResponse) req.GetResponse();

                cookieheader = req.CookieContainer.GetCookieHeader(new Uri(url));

                // Cache only a non-empty header: Page_Load treats any cached value as
                // "already logged in", so caching "" would prevent a later retry.
                if (cookieheader != null && cookieheader.Length > 0)
                {
                    HttpContext.Current.Application.Lock();
                    try
                    {
                        HttpContext.Current.Application["cookieheader"] = cookieheader;
                    }
                    finally
                    {
                        HttpContext.Current.Application.UnLock();
                    }
                }

                strResult = ReadResponseBody(res);
            }
            catch (Exception e)
            {
                // Best effort: the caller receives the error text instead of an exception.
                strResult = e.ToString();
            }
            finally
            {
                if (res != null)
                {
                    res.Close();
                }
            }
            return strResult;
        }

        /// <summary>
        /// POSTs <paramref name="paramList"/> to <paramref name="url"/> with the
        /// cookies stored in <see cref="cookieheader"/> attached, and returns
        /// the response body.
        /// </summary>
        /// <param name="url">Address of the page to fetch.</param>
        /// <param name="paramList">
        /// Form data such as "a=1&amp;b=2"; the text between '?', '=' and '&amp;'
        /// is URL-encoded. <c>null</c> sends an empty body.
        /// </param>
        /// <returns>
        /// The response body decoded as UTF-8, or the text of the exception
        /// (<c>Exception.ToString()</c>) if anything fails.
        /// </returns>
        public static string getPage(String url, String paramList)
        {
            HttpWebResponse res = null;
            string strResult = "";
            try
            {
                HttpWebRequest req = (HttpWebRequest) WebRequest.Create(url);
                req.Method = "POST";
                req.KeepAlive = true;
                req.ContentType = "application/x-www-form-urlencoded";
                req.CookieContainer = new CookieContainer();
                ApplyCookieHeader(req.CookieContainer, new Uri(url), cookieheader);

                WritePostBody(req, paramList);

                res = (HttpWebResponse) req.GetResponse();
                strResult = ReadResponseBody(res);
            }
            catch (Exception e)
            {
                // Best effort: the caller receives the error text instead of an exception.
                strResult = e.ToString();
            }
            finally
            {
                if (res != null)
                {
                    res.Close();
                }
            }
            return strResult;
        }

        /// <summary>
        /// Adds every "name=value" pair of a Cookie-style header to
        /// <paramref name="container"/> for <paramref name="uri"/>.
        /// </summary>
        /// <remarks>
        /// <c>GetCookieHeader</c> separates cookies with semicolons, whereas
        /// <c>SetCookies</c> expects Set-Cookie syntax, where a semicolon starts
        /// a cookie attribute. Passing the header through unchanged would keep
        /// only the first cookie, so each pair is applied on its own. A null or
        /// empty header is ignored.
        /// </remarks>
        private static void ApplyCookieHeader(CookieContainer container, Uri uri, string header)
        {
            if (header == null || header.Length == 0)
            {
                return;
            }

            string[] pairs = header.Split(';');
            for (int k = 0; k < pairs.Length; k++)
            {
                string pair = pairs[k].Trim();
                if (pair.Length > 0)
                {
                    container.SetCookies(uri, pair);
                }
            }
        }

        /// <summary>
        /// URL-encodes each run of text between the reserved characters
        /// '?', '=' and '&amp;', copying the reserved characters through unchanged.
        /// </summary>
        private static string EncodeParamList(string paramList)
        {
            StringBuilder encoded = new StringBuilder();
            int start = 0;
            while (start < paramList.Length)
            {
                int sep = paramList.IndexOfAny(ReservedChars, start);
                if (sep == -1)
                {
                    // No more separators: encode the remaining tail and stop.
                    encoded.Append(HttpUtility.UrlEncode(paramList.Substring(start)));
                    break;
                }
                encoded.Append(HttpUtility.UrlEncode(paramList.Substring(start, sep - start)));
                encoded.Append(paramList[sep]);
                start = sep + 1;
            }
            return encoded.ToString();
        }

        /// <summary>
        /// Writes the encoded parameter list as the UTF-8 request body, or sets
        /// a zero content length when <paramref name="paramList"/> is null.
        /// </summary>
        private static void WritePostBody(HttpWebRequest req, string paramList)
        {
            if (paramList == null)
            {
                req.ContentLength = 0;
                return;
            }

            byte[] body = Encoding.UTF8.GetBytes(EncodeParamList(paramList));
            req.ContentLength = body.Length;
            using (Stream requestStream = req.GetRequestStream())
            {
                requestStream.Write(body, 0, body.Length);
            }
        }

        /// <summary>Reads the whole response body as UTF-8 text.</summary>
        private static string ReadResponseBody(HttpWebResponse res)
        {
            using (StreamReader reader = new StreamReader(res.GetResponseStream(), Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }

        #region Web Form Designer generated code
        override protected void OnInit(EventArgs e)
        {
            //
            // CODEGEN: This call is required by the ASP.NET Web Form Designer.
            //
            InitializeComponent();
            base.OnInit(e);
        }

        /// <summary>
        /// Required method for Designer support - do not modify
        /// the contents of this method with the code editor.
        /// </summary>
        private void InitializeComponent()
        {
            this.Load += new System.EventHandler(this.Page_Load);
        }
        #endregion
    }
}