利用WebRequest登录网站,抓取信息

以前也做过类似于新闻小偷之类的东西,就是利用WebRequest从请求的URL获取信息后进行分析,然后显示。记得当时老板让做一个搜索时抓取程序,想把这边几个BBS的信息都能纳入搜索范围。因为有些BBS的搜索功能必须登录后才能使用,所以当时我想破了脑子也没办法,最后认为这个任务根本没法做,就没往下思考。今天在CSDN上看到一个帖子,才知道原来这并非不可实现的,其实早已有人这样做过。要做到这些,其要点有两个:  

1、为 HttpWebRequest 对象附加一个 CookieContainer,登录请求返回后即可从中取得代表 Session ID 的 Cookie。  

2、将此 Cookie 放入一个 CookieContainer,并附加到另一个 HttpWebRequest 请求中,即可实现 Session 的还原。

using System;
using System.Collections;
using System.ComponentModel;
using System.Data;
//using System.Data.OleDb;

using System.Drawing;
using System.Web;
using System.Web.SessionState;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.HtmlControls;
using System.Net;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using Microsoft.Data.Odbc;
namespace PdfTest
{
/// <summary>
/// Web Forms page that logs in to a remote site with <see cref="HttpWebRequest"/>,
/// keeps the resulting session cookie(s), and then replays them on a second request
/// so that pages which require a logged-in session can be fetched.
/// </summary>
public class getHttpInfo : System.Web.UI.Page
{
    /// <summary>
    /// Cookie header captured by <see cref="Login"/> (semicolon-delimited
    /// "name=value" pairs as produced by <c>CookieContainer.GetCookieHeader</c>).
    /// It is also mirrored in the application state under the key "cookieheader".
    /// </summary>
    protected static string cookieheader;

    /// <summary>
    /// Reuses the cookie header stored in application state, or logs in when none
    /// is stored yet; then fetches the target page, saves it to c:\save.htm and
    /// writes it to the response.
    /// </summary>
    private void Page_Load(object sender, System.EventArgs e)
    {
        if (HttpContext.Current.Application["cookieheader"] != null)
        {
            cookieheader = (string) HttpContext.Current.Application["cookieheader"];
        }
        else
        {
            // Log in to the website; Login stores the session cookie header in both
            // the static field and the application state. Its returned body is not needed here.
            Login("http://www.thesiteyouwanttovisit/theloginpage.asp", "Action=&USERID=&Password=");
        }

        string strResult = getPage("http://www.thesiteyouwanttovisit/theloginpage.asp", "Action=&data=");

        // Write the result to an htm file. FileMode.Create truncates an existing file,
        // so a shorter page never leaves stale bytes from a previous, longer one
        // (FileMode.OpenOrCreate would). The using blocks close the file even if Write throws.
        using (FileStream htmFile = new FileStream(@"c:\save.htm", FileMode.Create))
        using (StreamWriter sw = new StreamWriter(htmFile))
        {
            sw.Write(strResult);
        }

        // Output the result.
        Response.Write(strResult);
    }

    /// <summary>
    /// POSTs the login form and records the cookies the server hands back.
    /// </summary>
    /// <param name="url">Address of the login page.</param>
    /// <param name="paramList">
    /// Form data in "name=value&amp;name=value" form; the text between the separators
    /// '?', '=' and '&amp;' is URL-encoded before sending. May be null for an empty body.
    /// </param>
    /// <returns>
    /// The response body decoded as UTF-8, or the text of the exception
    /// (<c>Exception.ToString()</c>) if anything fails; this method does not throw.
    /// </returns>
    public static string Login(String url, String paramList)
    {
        HttpWebResponse res = null;
        string strResult = "";
        try
        {
            HttpWebRequest req = (HttpWebRequest) WebRequest.Create(url);
            req.Method = "POST";
            req.ContentType = "application/x-www-form-urlencoded";
            // Do not follow the post-login redirect: the response to this very request
            // is the one whose cookies are read back from the container below.
            req.AllowAutoRedirect = false;
            req.CookieContainer = new CookieContainer();

            WriteFormBody(req, paramList);

            res = (HttpWebResponse) req.GetResponse();

            // Capture the cookies before reading the body, so they are kept even if
            // reading the body fails afterwards.
            cookieheader = req.CookieContainer.GetCookieHeader(new Uri(url));
            HttpContext.Current.Application.Lock();
            try
            {
                HttpContext.Current.Application["cookieheader"] = cookieheader;
            }
            finally
            {
                // Always release the application lock, even if the assignment throws.
                HttpContext.Current.Application.UnLock();
            }

            strResult = ReadBody(res);
        }
        catch (Exception e)
        {
            strResult = e.ToString();
        }
        finally
        {
            if (res != null)
            {
                res.Close();
            }
        }
        return strResult;
    }

    /// <summary>
    /// POSTs to a page while presenting the cookies captured by <see cref="Login"/>,
    /// so that the request runs inside the logged-in session.
    /// </summary>
    /// <param name="url">Address of the page to request.</param>
    /// <param name="paramList">
    /// Form data in "name=value&amp;name=value" form; the text between the separators
    /// '?', '=' and '&amp;' is URL-encoded before sending. May be null for an empty body.
    /// </param>
    /// <returns>
    /// The response body decoded as UTF-8, or the text of the exception
    /// (<c>Exception.ToString()</c>) if anything fails; this method does not throw.
    /// </returns>
    public static string getPage(String url, String paramList)
    {
        HttpWebResponse res = null;
        string strResult = "";
        try
        {
            HttpWebRequest req = (HttpWebRequest) WebRequest.Create(url);
            req.Method = "POST";
            req.KeepAlive = true;
            req.ContentType = "application/x-www-form-urlencoded";
            req.CookieContainer = new CookieContainer();
            RestoreCookies(req.CookieContainer, new Uri(url), cookieheader);

            WriteFormBody(req, paramList);

            res = (HttpWebResponse) req.GetResponse();
            strResult = ReadBody(res);
        }
        catch (Exception e)
        {
            strResult = e.ToString();
        }
        finally
        {
            if (res != null)
            {
                res.Close();
            }
        }
        return strResult;
    }

    /// <summary>
    /// Adds every cookie of a semicolon-delimited cookie header to the container.
    /// </summary>
    /// <remarks>
    /// <c>GetCookieHeader</c> delimits cookies with semicolons, whereas
    /// <c>SetCookies</c> expects a Set-Cookie style header in which cookies are
    /// delimited by commas and semicolons introduce attributes. Passing the stored
    /// header through unchanged would therefore restore only the first cookie, so
    /// each "name=value" pair is added on its own.
    /// </remarks>
    private static void RestoreCookies(CookieContainer container, Uri uri, string header)
    {
        // Nothing captured (e.g. the login failed): send the request without cookies
        // instead of failing on a null header.
        if (header == null)
        {
            return;
        }

        foreach (string part in header.Split(';'))
        {
            string cookie = part.Trim();
            // Skip empty fragments and "$Version"/"$Path"-style entries, which are
            // header metadata rather than cookies.
            if (cookie.Length == 0 || cookie[0] == '$')
            {
                continue;
            }
            container.SetCookies(uri, cookie);
        }
    }

    /// <summary>
    /// URL-encodes the text between the separators '?', '=' and '&amp;' of a
    /// parameter list, leaving the separators themselves untouched.
    /// </summary>
    private static string EncodeParamList(string paramList)
    {
        Char[] reserved = { '?', '=', '&' };
        StringBuilder encoded = new StringBuilder();
        int start = 0;
        while (start < paramList.Length)
        {
            int separator = paramList.IndexOfAny(reserved, start);
            if (separator == -1)
            {
                // No further separator: the remainder is the last name or value.
                encoded.Append(HttpUtility.UrlEncode(paramList.Substring(start)));
                break;
            }
            encoded.Append(HttpUtility.UrlEncode(paramList.Substring(start, separator - start)));
            encoded.Append(paramList[separator]);
            start = separator + 1;
        }
        return encoded.ToString();
    }

    /// <summary>
    /// Writes the encoded parameter list as the UTF-8 request body, or declares an
    /// empty body when <paramref name="paramList"/> is null.
    /// </summary>
    private static void WriteFormBody(HttpWebRequest req, string paramList)
    {
        if (paramList == null)
        {
            req.ContentLength = 0;
            return;
        }

        byte[] body = Encoding.UTF8.GetBytes(EncodeParamList(paramList));
        req.ContentLength = body.Length;
        using (Stream requestStream = req.GetRequestStream())
        {
            requestStream.Write(body, 0, body.Length);
        }
    }

    /// <summary>
    /// Reads the whole response body as text.
    /// </summary>
    /// <remarks>
    /// The body is always decoded as UTF-8, regardless of the charset the server declares.
    /// </remarks>
    private static string ReadBody(HttpWebResponse res)
    {
        using (Stream responseStream = res.GetResponseStream())
        using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
        {
            return reader.ReadToEnd();
        }
    }

    #region Web Form Designer generated code

    override protected void OnInit(EventArgs e)
    {
        //
        // CODEGEN: This call is required by the ASP.NET Web Form Designer.
        //
        InitializeComponent();
        base.OnInit(e);
    }

    /// <summary>
    /// Required method for Designer support - do not modify
    /// the contents of this method with the code editor.
    /// </summary>
    private void InitializeComponent()
    {
        this.Load += new System.EventHandler(this.Page_Load);
    }
    #endregion

}
}

posted on 2008-09-05 15:36  jannock  阅读(2740)  评论(3)  编辑  收藏  举报