1.先来个异步的方式,获取网页源码:
2.再来个同步的方式获取源码:
Code
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
using System.Text;
using System.Net;
using System.IO;
using System.Threading;
namespace TestSolution.BLL
{
/// <summary>
/// Fetches the HTML source of a web page. The request and the body read use the
/// asynchronous BeginGetResponse/BeginRead pattern, but <see cref="GetPage"/> blocks
/// the caller until the download has finished or failed.
/// </summary>
/// <remarks>
/// Completion is signalled through the single static <see cref="allDone"/> event, so
/// overlapping GetPage calls from several threads are not supported.
/// </remarks>
public class MyHttpPage
{
    /// <summary>Set by the callbacks once the current download has completed or failed.</summary>
    public static ManualResetEvent allDone = new ManualResetEvent(false);

    const int BUFFER_SIZE = 1024;
    const int DefaultTimeout = 2 * 60 * 1000; // 2 minutes timeout

    // Finds a <meta ... charset=utf-8> declaration (optionally quoted) in the page markup.
    private static readonly Regex Utf8MetaPattern = new Regex(
        @"<\s*meta[^>]+charset\s*=\s*[""']?\s*utf-8", RegexOptions.IgnoreCase);

    /// <summary>
    /// Async state handed to the callbacks: the shared RequestState plus the raw bytes
    /// received so far. The bytes are decoded once, after the last read, so a multi-byte
    /// character split across two reads is not corrupted.
    /// </summary>
    private sealed class PageDownload
    {
        public readonly RequestState State;
        public readonly MemoryStream RawBody = new MemoryStream();

        public PageDownload(RequestState state)
        {
            State = state;
        }
    }

    /// <summary>
    /// Wait callback registered by GetPage: aborts the request if the timer fires.
    /// </summary>
    /// <param name="state">The HttpWebRequest being waited on.</param>
    /// <param name="timedOut">True when the wait ended because the timeout elapsed.</param>
    private static void TimeoutCallback(object state, bool timedOut)
    {
        if (timedOut)
        {
            HttpWebRequest request = state as HttpWebRequest;
            if (request != null)
            {
                request.Abort();
            }
        }
    }

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and returns its decoded HTML.
    /// </summary>
    /// <param name="url">Absolute HTTP/HTTPS URL of the page.</param>
    /// <returns>
    /// The page source, or an empty string when the request, the read or the decoding
    /// fails (the failure is reported on the console).
    /// </returns>
    public static string GetPage(string url)
    {
        RequestState myRequestState = new RequestState();
        RegisteredWaitHandle timeoutRegistration = null;
        try
        {
            // Create a HttpWebRequest object to the desired URL.
            // Behind a firewall without a configured system proxy, assign a WebProxy to
            // myHttpWebRequest.Proxy here before starting the request.
            HttpWebRequest myHttpWebRequest = (HttpWebRequest)WebRequest.Create(url);
            myRequestState.request = myHttpWebRequest;
            PageDownload download = new PageDownload(myRequestState);

            // Start the asynchronous request.
            IAsyncResult result = myHttpWebRequest.BeginGetResponse(new AsyncCallback(RespCallback), download);

            // The wait is on the BeginGetResponse handle, so the timeout bounds the time
            // needed to obtain the response; on timeout the request is aborted.
            timeoutRegistration = ThreadPool.RegisterWaitForSingleObject(
                result.AsyncWaitHandle, new WaitOrTimerCallback(TimeoutCallback), myHttpWebRequest, DefaultTimeout, true);

            // The callbacks do the actual work and signal allDone when finished.
            allDone.WaitOne();
            allDone.Reset();
            return myRequestState.requestData.ToString();
        }
        catch (WebException e)
        {
            Console.WriteLine("\nMain Exception raised!");
            Console.WriteLine("\nMessage:{0}", e.Message);
            Console.WriteLine("\nStatus:{0}", e.Status);
            return "";
        }
        catch (Exception e)
        {
            Console.WriteLine("\nMain Exception raised!");
            Console.WriteLine("Source :{0} ", e.Source);
            Console.WriteLine("Message :{0} ", e.Message);
            return "";
        }
        finally
        {
            // Release the wait registration and the HTTP response on every path.
            if (timeoutRegistration != null)
            {
                timeoutRegistration.Unregister(null);
            }
            if (myRequestState.response != null)
            {
                myRequestState.response.Close();
            }
        }
    }

    /// <summary>
    /// Completes BeginGetResponse and starts the first asynchronous read of the body.
    /// Signals allDone itself only when no read could be started.
    /// </summary>
    private static void RespCallback(IAsyncResult asynchronousResult)
    {
        bool finished = true;
        try
        {
            PageDownload download = (PageDownload)asynchronousResult.AsyncState;
            RequestState myRequestState = download.State;
            myRequestState.response = (HttpWebResponse)myRequestState.request.EndGetResponse(asynchronousResult);

            Stream responseStream = myRequestState.response.GetResponseStream();
            myRequestState.streamResponse = responseStream;

            responseStream.BeginRead(myRequestState.BufferRead, 0, BUFFER_SIZE, new AsyncCallback(ReadCallBack), download);
            finished = false; // ReadCallBack now owns the completion signal
        }
        catch (WebException e)
        {
            Console.WriteLine("\nRespCallback Exception raised!");
            Console.WriteLine("\nMessage:{0}", e.Message);
            Console.WriteLine("\nStatus:{0}", e.Status);
        }
        catch (Exception e)
        {
            // Any other failure must still release the waiting GetPage call.
            Console.WriteLine("\nRespCallback Exception raised!");
            Console.WriteLine("\nMessage:{0}", e.Message);
        }
        finally
        {
            if (finished)
            {
                allDone.Set();
            }
        }
    }

    /// <summary>
    /// Completes one asynchronous read. Buffers the received bytes and schedules the next
    /// read; at end of stream decodes the whole body into RequestState.requestData and
    /// signals allDone. A failed read leaves requestData empty.
    /// </summary>
    private static void ReadCallBack(IAsyncResult asyncResult)
    {
        bool finished = true;
        try
        {
            PageDownload download = (PageDownload)asyncResult.AsyncState;
            RequestState myRequestState = download.State;
            Stream responseStream = myRequestState.streamResponse;

            int read = responseStream.EndRead(asyncResult);
            if (read > 0)
            {
                download.RawBody.Write(myRequestState.BufferRead, 0, read);
                responseStream.BeginRead(myRequestState.BufferRead, 0, BUFFER_SIZE, new AsyncCallback(ReadCallBack), download);
                finished = false; // the next ReadCallBack invocation will signal
                return;
            }

            // End of stream: decode everything at once with a single encoding.
            responseStream.Close();
            byte[] body = download.RawBody.ToArray();
            myRequestState.requestData.Append(ChooseEncoding(myRequestState.response, body).GetString(body));
        }
        catch (WebException e)
        {
            Console.WriteLine("\nReadCallBack Exception raised!");
            Console.WriteLine("\nMessage:{0}", e.Message);
            Console.WriteLine("\nStatus:{0}", e.Status);
        }
        catch (Exception e)
        {
            // Any other failure must still release the waiting GetPage call.
            Console.WriteLine("\nReadCallBack Exception raised!");
            Console.WriteLine("\nMessage:{0}", e.Message);
        }
        finally
        {
            if (finished)
            {
                allDone.Set();
            }
        }
    }

    /// <summary>
    /// Picks the text encoding for the downloaded body: UTF-8 when the response headers or
    /// a meta charset declaration in the markup say so, otherwise the gb2312 default.
    /// </summary>
    private static Encoding ChooseEncoding(HttpWebResponse response, byte[] body)
    {
        string characterSet = response.CharacterSet ?? "";
        string contentType = response.ContentType ?? "";

        bool utf8 = string.Equals(characterSet, "utf-8", StringComparison.OrdinalIgnoreCase)
            || contentType.IndexOf("utf-8", StringComparison.OrdinalIgnoreCase) >= 0
            || Utf8MetaPattern.IsMatch(Encoding.ASCII.GetString(body));

        return Encoding.GetEncoding(utf8 ? "utf-8" : "gb2312");
    }
}
/// <summary>
/// Holds everything one page request needs while it is in flight: the request and
/// response objects, the response stream, the read buffer and the accumulated text.
/// </summary>
public class RequestState
{
    // Size in bytes of the buffer used for each read.
    const int BUFFER_SIZE = 1024;

    // Text collected from the response.
    public StringBuilder requestData = new StringBuilder("");

    // Scratch buffer that each read fills.
    public byte[] BufferRead = new byte[BUFFER_SIZE];

    // The request this state belongs to; assigned by the caller.
    public HttpWebRequest request = null;

    // The response, once it has been obtained.
    public HttpWebResponse response;

    // The response body stream, once it has been opened.
    public Stream streamResponse = null;

    // All members are set up by their field initializers.
    public RequestState()
    {
    }
}
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
using System.Text;
using System.Net;
using System.IO;
using System.Threading;
namespace TestSolution.BLL
{
/// <summary>
/// Fetches the HTML source of a web page. The request and the body read use the
/// asynchronous BeginGetResponse/BeginRead pattern, but <see cref="GetPage"/> blocks
/// the caller until the download has finished or failed.
/// </summary>
/// <remarks>
/// Completion is signalled through the single static <see cref="allDone"/> event, so
/// overlapping GetPage calls from several threads are not supported.
/// </remarks>
public class MyHttpPage
{
    /// <summary>Set by the callbacks once the current download has completed or failed.</summary>
    public static ManualResetEvent allDone = new ManualResetEvent(false);

    const int BUFFER_SIZE = 1024;
    const int DefaultTimeout = 2 * 60 * 1000; // 2 minutes timeout

    // Finds a <meta ... charset=utf-8> declaration (optionally quoted) in the page markup.
    private static readonly Regex Utf8MetaPattern = new Regex(
        @"<\s*meta[^>]+charset\s*=\s*[""']?\s*utf-8", RegexOptions.IgnoreCase);

    /// <summary>
    /// Async state handed to the callbacks: the shared RequestState plus the raw bytes
    /// received so far. The bytes are decoded once, after the last read, so a multi-byte
    /// character split across two reads is not corrupted.
    /// </summary>
    private sealed class PageDownload
    {
        public readonly RequestState State;
        public readonly MemoryStream RawBody = new MemoryStream();

        public PageDownload(RequestState state)
        {
            State = state;
        }
    }

    /// <summary>
    /// Wait callback registered by GetPage: aborts the request if the timer fires.
    /// </summary>
    /// <param name="state">The HttpWebRequest being waited on.</param>
    /// <param name="timedOut">True when the wait ended because the timeout elapsed.</param>
    private static void TimeoutCallback(object state, bool timedOut)
    {
        if (timedOut)
        {
            HttpWebRequest request = state as HttpWebRequest;
            if (request != null)
            {
                request.Abort();
            }
        }
    }

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and returns its decoded HTML.
    /// </summary>
    /// <param name="url">Absolute HTTP/HTTPS URL of the page.</param>
    /// <returns>
    /// The page source, or an empty string when the request, the read or the decoding
    /// fails (the failure is reported on the console).
    /// </returns>
    public static string GetPage(string url)
    {
        RequestState myRequestState = new RequestState();
        RegisteredWaitHandle timeoutRegistration = null;
        try
        {
            // Create a HttpWebRequest object to the desired URL.
            // Behind a firewall without a configured system proxy, assign a WebProxy to
            // myHttpWebRequest.Proxy here before starting the request.
            HttpWebRequest myHttpWebRequest = (HttpWebRequest)WebRequest.Create(url);
            myRequestState.request = myHttpWebRequest;
            PageDownload download = new PageDownload(myRequestState);

            // Start the asynchronous request.
            IAsyncResult result = myHttpWebRequest.BeginGetResponse(new AsyncCallback(RespCallback), download);

            // The wait is on the BeginGetResponse handle, so the timeout bounds the time
            // needed to obtain the response; on timeout the request is aborted.
            timeoutRegistration = ThreadPool.RegisterWaitForSingleObject(
                result.AsyncWaitHandle, new WaitOrTimerCallback(TimeoutCallback), myHttpWebRequest, DefaultTimeout, true);

            // The callbacks do the actual work and signal allDone when finished.
            allDone.WaitOne();
            allDone.Reset();
            return myRequestState.requestData.ToString();
        }
        catch (WebException e)
        {
            Console.WriteLine("\nMain Exception raised!");
            Console.WriteLine("\nMessage:{0}", e.Message);
            Console.WriteLine("\nStatus:{0}", e.Status);
            return "";
        }
        catch (Exception e)
        {
            Console.WriteLine("\nMain Exception raised!");
            Console.WriteLine("Source :{0} ", e.Source);
            Console.WriteLine("Message :{0} ", e.Message);
            return "";
        }
        finally
        {
            // Release the wait registration and the HTTP response on every path.
            if (timeoutRegistration != null)
            {
                timeoutRegistration.Unregister(null);
            }
            if (myRequestState.response != null)
            {
                myRequestState.response.Close();
            }
        }
    }

    /// <summary>
    /// Completes BeginGetResponse and starts the first asynchronous read of the body.
    /// Signals allDone itself only when no read could be started.
    /// </summary>
    private static void RespCallback(IAsyncResult asynchronousResult)
    {
        bool finished = true;
        try
        {
            PageDownload download = (PageDownload)asynchronousResult.AsyncState;
            RequestState myRequestState = download.State;
            myRequestState.response = (HttpWebResponse)myRequestState.request.EndGetResponse(asynchronousResult);

            Stream responseStream = myRequestState.response.GetResponseStream();
            myRequestState.streamResponse = responseStream;

            responseStream.BeginRead(myRequestState.BufferRead, 0, BUFFER_SIZE, new AsyncCallback(ReadCallBack), download);
            finished = false; // ReadCallBack now owns the completion signal
        }
        catch (WebException e)
        {
            Console.WriteLine("\nRespCallback Exception raised!");
            Console.WriteLine("\nMessage:{0}", e.Message);
            Console.WriteLine("\nStatus:{0}", e.Status);
        }
        catch (Exception e)
        {
            // Any other failure must still release the waiting GetPage call.
            Console.WriteLine("\nRespCallback Exception raised!");
            Console.WriteLine("\nMessage:{0}", e.Message);
        }
        finally
        {
            if (finished)
            {
                allDone.Set();
            }
        }
    }

    /// <summary>
    /// Completes one asynchronous read. Buffers the received bytes and schedules the next
    /// read; at end of stream decodes the whole body into RequestState.requestData and
    /// signals allDone. A failed read leaves requestData empty.
    /// </summary>
    private static void ReadCallBack(IAsyncResult asyncResult)
    {
        bool finished = true;
        try
        {
            PageDownload download = (PageDownload)asyncResult.AsyncState;
            RequestState myRequestState = download.State;
            Stream responseStream = myRequestState.streamResponse;

            int read = responseStream.EndRead(asyncResult);
            if (read > 0)
            {
                download.RawBody.Write(myRequestState.BufferRead, 0, read);
                responseStream.BeginRead(myRequestState.BufferRead, 0, BUFFER_SIZE, new AsyncCallback(ReadCallBack), download);
                finished = false; // the next ReadCallBack invocation will signal
                return;
            }

            // End of stream: decode everything at once with a single encoding.
            responseStream.Close();
            byte[] body = download.RawBody.ToArray();
            myRequestState.requestData.Append(ChooseEncoding(myRequestState.response, body).GetString(body));
        }
        catch (WebException e)
        {
            Console.WriteLine("\nReadCallBack Exception raised!");
            Console.WriteLine("\nMessage:{0}", e.Message);
            Console.WriteLine("\nStatus:{0}", e.Status);
        }
        catch (Exception e)
        {
            // Any other failure must still release the waiting GetPage call.
            Console.WriteLine("\nReadCallBack Exception raised!");
            Console.WriteLine("\nMessage:{0}", e.Message);
        }
        finally
        {
            if (finished)
            {
                allDone.Set();
            }
        }
    }

    /// <summary>
    /// Picks the text encoding for the downloaded body: UTF-8 when the response headers or
    /// a meta charset declaration in the markup say so, otherwise the gb2312 default.
    /// </summary>
    private static Encoding ChooseEncoding(HttpWebResponse response, byte[] body)
    {
        string characterSet = response.CharacterSet ?? "";
        string contentType = response.ContentType ?? "";

        bool utf8 = string.Equals(characterSet, "utf-8", StringComparison.OrdinalIgnoreCase)
            || contentType.IndexOf("utf-8", StringComparison.OrdinalIgnoreCase) >= 0
            || Utf8MetaPattern.IsMatch(Encoding.ASCII.GetString(body));

        return Encoding.GetEncoding(utf8 ? "utf-8" : "gb2312");
    }
}
/// <summary>
/// Holds everything one page request needs while it is in flight: the request and
/// response objects, the response stream, the read buffer and the accumulated text.
/// </summary>
public class RequestState
{
    // Size in bytes of the buffer used for each read.
    const int BUFFER_SIZE = 1024;

    // Text collected from the response.
    public StringBuilder requestData = new StringBuilder("");

    // Scratch buffer that each read fills.
    public byte[] BufferRead = new byte[BUFFER_SIZE];

    // The request this state belongs to; assigned by the caller.
    public HttpWebRequest request = null;

    // The response, once it has been obtained.
    public HttpWebResponse response;

    // The response body stream, once it has been opened.
    public Stream streamResponse = null;

    // All members are set up by their field initializers.
    public RequestState()
    {
    }
}
}
2.再来个同步的方式获取源码:
Code
/***************************************************************************************************
* Filename: GetPageHTML.cs
* Module: 获取URL的源代码,保存到本地
* Copyright: 2008 杰世科技 版权所有
* Author: 周林郁
* Created Date: 2008-10-20
* Last Modified Data:
* Description:
***************************************************************************************************/
using System;
using System.Collections.Generic;
//using System.Linq;
using System.Text;
using System.Net;
using System.IO;
using System.Threading;
namespace MonitorService
{
public class GetPageHTML
{
// Logging helper instance. NOTE(review): no active code in this class appears to use it - confirm before removing.
WriteLog wl = new WriteLog();
/// <summary>
/// Downloads a page synchronously and, when an output path is given, saves the HTML
/// under the application base directory.
/// </summary>
/// <param name="strPageUrl">URL of the page to download.</param>
/// <param name="strOutFilePath">
/// File path appended to AppDomain.CurrentDomain.BaseDirectory to save the page to;
/// pass null or "" to skip saving.
/// </param>
/// <returns>
/// The page HTML (with "&lt;BR&gt;" replaced by line breaks when it is saved), or ""
/// when there is no response or the status is not 200 OK. If an error occurs, whatever
/// had been read so far is returned.
/// </returns>
public string GetPageCode(string strPageUrl, string strOutFilePath)
{
    string PageHtml = "";
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strPageUrl);
        // Timeout is expressed in milliseconds: 180000 ms = 3 minutes.
        request.Timeout = 180000;

        // An error status surfaces as a WebException that still carries the response;
        // see http://blog.joycode.com/ghj/archive/2008/03/12/114957.aspx
        HttpWebResponse response;
        try
        {
            response = (HttpWebResponse)request.GetResponse();
        }
        catch (WebException ex)
        {
            // Response is null when no HTTP response was received at all (timeout, DNS failure...).
            response = ex.Response as HttpWebResponse;
        }
        if (response == null)
        {
            return "";
        }

        using (response)
        {
            // Anything other than 200 OK is treated as a failed download.
            if (response.StatusCode != HttpStatusCode.OK)
            {
                return "";
            }

            // Choose the encoding: gb2312 unless the headers indicate UTF-8.
            // NOTE(review): "iso-8859-1" is also mapped to UTF-8, presumably because that
            // is what gets reported when the server sends no charset - confirm.
            string CharacterSet = response.CharacterSet ?? "";
            string ContentType = response.ContentType ?? "";
            string encodeout = "gb2312";
            if (string.Equals(CharacterSet, "utf-8", StringComparison.OrdinalIgnoreCase)
                || string.Equals(CharacterSet, "iso-8859-1", StringComparison.OrdinalIgnoreCase)
                || ContentType.IndexOf("utf-8", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                encodeout = "utf-8";
            }

            // The using blocks release the stream, reader and response even if reading throws.
            using (Stream receiveStream = response.GetResponseStream())
            using (StreamReader readStream = new StreamReader(receiveStream, System.Text.Encoding.GetEncoding(encodeout)))
            {
                PageHtml = readStream.ReadToEnd();
            }
        }

        if (!string.IsNullOrEmpty(strOutFilePath) && PageHtml != "")
        {
            // Save a copy with <BR> tags turned into line breaks.
            PageHtml = PageHtml.Replace("<BR>", "\r\n");
            string _saveFilePath = System.AppDomain.CurrentDomain.BaseDirectory + strOutFilePath;
            WriteTextFile(_saveFilePath, PageHtml);
        }
        return PageHtml;
    }
    catch
    {
        // Deliberately best-effort: network, decoding or file errors yield whatever
        // was obtained so far instead of propagating to the caller.
        return PageHtml;
    }
}
/// <summary>
/// Extracts the first section of <paramref name="s"/> that begins with
/// <paramref name="strHead"/> and ends with the next <paramref name="strBot"/> found
/// after it. Both markers are included in the result.
/// </summary>
/// <param name="s">Source text to cut from.</param>
/// <param name="strHead">Start marker.</param>
/// <param name="strBot">End marker; searched for after the end of the start marker.</param>
/// <returns>The section including both markers, or "" when either marker is missing,
/// or any argument is null or empty.</returns>
public string CutX(string s, string strHead, string strBot)
{
    if (string.IsNullOrEmpty(s) || string.IsNullOrEmpty(strHead) || string.IsNullOrEmpty(strBot))
    {
        return "";
    }

    // Index 0 is a valid match position; only -1 means "not found".
    int iBegin = s.IndexOf(strHead, StringComparison.Ordinal);
    if (iBegin < 0)
    {
        return "";
    }

    // Start after the head marker so the tail is never matched inside the head itself.
    int iEnd = s.IndexOf(strBot, iBegin + strHead.Length, StringComparison.Ordinal);
    if (iEnd < 0)
    {
        return "";
    }

    return s.Substring(iBegin, iEnd - iBegin + strBot.Length);
}
/// <summary>
/// Extracts the text between the first <paramref name="strHead"/> in
/// <paramref name="s"/> and the next <paramref name="strBot"/> after it. Neither marker
/// is included in the result.
/// </summary>
/// <param name="s">Source text to cut from.</param>
/// <param name="strHead">Start marker.</param>
/// <param name="strBot">End marker; searched for after the end of the start marker.</param>
/// <returns>The text between the markers, or "" when either marker is missing, nothing
/// lies between them, or any argument is null or empty.</returns>
public string Cut(string s, string strHead, string strBot)
{
    if (string.IsNullOrEmpty(s) || string.IsNullOrEmpty(strHead) || string.IsNullOrEmpty(strBot))
    {
        return "";
    }

    // Check "not found" before adding the marker length; otherwise -1 + Length
    // looks like a valid position.
    int headAt = s.IndexOf(strHead, StringComparison.Ordinal);
    if (headAt < 0)
    {
        return "";
    }

    int iBegin = headAt + strHead.Length;
    // iBegin may equal s.Length (head at the very end); IndexOf accepts that and returns -1.
    int iEnd = s.IndexOf(strBot, iBegin, StringComparison.Ordinal);
    if (iEnd <= iBegin)
    {
        return "";
    }

    return s.Substring(iBegin, iEnd - iBegin);
}
/// <summary>
/// Removes every HTML tag from the given markup.
/// </summary>
/// <param name="strHTML">HTML text to clean.</param>
/// <returns>The input with each "&lt;...&gt;" run (at least one character between the
/// brackets) deleted.</returns>
public string FilterHTML(string strHTML)
{
    // A tag is "<", one or more characters other than ">", then ">".
    System.Text.RegularExpressions.Regex tagPattern = new System.Text.RegularExpressions.Regex(@"<[^>]+>");
    return tagPattern.Replace(strHTML, "");
}
/// <summary>
/// Removes script-like content from HTML: script, noscript, object, iframe and noiframe
/// elements together with everything inside them, plus HTML comments.
/// </summary>
/// <param name="strHTML">HTML text to clean.</param>
/// <returns>The filtered HTML; a null or empty input is returned unchanged.</returns>
public string FilterScript(string strHTML)
{
    if (string.IsNullOrEmpty(strHTML))
    {
        return strHTML;
    }

    // IgnoreCase covers any tag casing; Singleline lets "." span line breaks so
    // multi-line blocks are removed as well.
    System.Text.RegularExpressions.RegexOptions options =
        System.Text.RegularExpressions.RegexOptions.IgnoreCase |
        System.Text.RegularExpressions.RegexOptions.Singleline;

    string[] blockTags = { "script", "noscript", "object", "iframe", "noiframe" };
    foreach (string tag in blockTags)
    {
        // Opening tag with optional attributes, lazily up to the first matching closing tag.
        string pattern = "<" + tag + @"\b[^>]*>.*?</" + tag + @"[^>]*>";
        strHTML = System.Text.RegularExpressions.Regex.Replace(strHTML, pattern, "", options);
    }

    // HTML comments.
    strHTML = System.Text.RegularExpressions.Regex.Replace(strHTML, @"<!--.*?-->", "", options);
    return strHTML;
}
#region WriteTextFile - write a text file
/// <summary>
/// Writes text to a file, overwriting any existing content, using the system default
/// encoding.
/// </summary>
/// <param name="strFilePath">Path of the file to write.</param>
/// <param name="strData">Text to write.</param>
/// <returns>True when the text was written; failures propagate as exceptions.</returns>
public bool WriteTextFile(string strFilePath, string strData)
{
    // The using block closes the file even if the write throws; exceptions propagate
    // untouched, so their original stack trace is preserved.
    using (System.IO.StreamWriter sw = new System.IO.StreamWriter(strFilePath, false, System.Text.Encoding.Default))
    {
        sw.Write(strData);
    }
    return true;
}
#endregion
}
}
/***************************************************************************************************
* Filename: GetPageHTML.cs
* Module: 获取URL的源代码,保存到本地
* Copyright: 2008 杰世科技 版权所有
* Author: 周林郁
* Created Date: 2008-10-20
* Last Modified Data:
* Description:
***************************************************************************************************/
using System;
using System.Collections.Generic;
//using System.Linq;
using System.Text;
using System.Net;
using System.IO;
using System.Threading;
namespace MonitorService
{
public class GetPageHTML
{
// Logging helper instance. NOTE(review): no active code in this class appears to use it - confirm before removing.
WriteLog wl = new WriteLog();
/// <summary>
/// Downloads a page synchronously and, when an output path is given, saves the HTML
/// under the application base directory.
/// </summary>
/// <param name="strPageUrl">URL of the page to download.</param>
/// <param name="strOutFilePath">
/// File path appended to AppDomain.CurrentDomain.BaseDirectory to save the page to;
/// pass null or "" to skip saving.
/// </param>
/// <returns>
/// The page HTML (with "&lt;BR&gt;" replaced by line breaks when it is saved), or ""
/// when there is no response or the status is not 200 OK. If an error occurs, whatever
/// had been read so far is returned.
/// </returns>
public string GetPageCode(string strPageUrl, string strOutFilePath)
{
    string PageHtml = "";
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strPageUrl);
        // Timeout is expressed in milliseconds: 180000 ms = 3 minutes.
        request.Timeout = 180000;

        // An error status surfaces as a WebException that still carries the response;
        // see http://blog.joycode.com/ghj/archive/2008/03/12/114957.aspx
        HttpWebResponse response;
        try
        {
            response = (HttpWebResponse)request.GetResponse();
        }
        catch (WebException ex)
        {
            // Response is null when no HTTP response was received at all (timeout, DNS failure...).
            response = ex.Response as HttpWebResponse;
        }
        if (response == null)
        {
            return "";
        }

        using (response)
        {
            // Anything other than 200 OK is treated as a failed download.
            if (response.StatusCode != HttpStatusCode.OK)
            {
                return "";
            }

            // Choose the encoding: gb2312 unless the headers indicate UTF-8.
            // NOTE(review): "iso-8859-1" is also mapped to UTF-8, presumably because that
            // is what gets reported when the server sends no charset - confirm.
            string CharacterSet = response.CharacterSet ?? "";
            string ContentType = response.ContentType ?? "";
            string encodeout = "gb2312";
            if (string.Equals(CharacterSet, "utf-8", StringComparison.OrdinalIgnoreCase)
                || string.Equals(CharacterSet, "iso-8859-1", StringComparison.OrdinalIgnoreCase)
                || ContentType.IndexOf("utf-8", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                encodeout = "utf-8";
            }

            // The using blocks release the stream, reader and response even if reading throws.
            using (Stream receiveStream = response.GetResponseStream())
            using (StreamReader readStream = new StreamReader(receiveStream, System.Text.Encoding.GetEncoding(encodeout)))
            {
                PageHtml = readStream.ReadToEnd();
            }
        }

        if (!string.IsNullOrEmpty(strOutFilePath) && PageHtml != "")
        {
            // Save a copy with <BR> tags turned into line breaks.
            PageHtml = PageHtml.Replace("<BR>", "\r\n");
            string _saveFilePath = System.AppDomain.CurrentDomain.BaseDirectory + strOutFilePath;
            WriteTextFile(_saveFilePath, PageHtml);
        }
        return PageHtml;
    }
    catch
    {
        // Deliberately best-effort: network, decoding or file errors yield whatever
        // was obtained so far instead of propagating to the caller.
        return PageHtml;
    }
}
/// <summary>
/// Extracts the first section of <paramref name="s"/> that begins with
/// <paramref name="strHead"/> and ends with the next <paramref name="strBot"/> found
/// after it. Both markers are included in the result.
/// </summary>
/// <param name="s">Source text to cut from.</param>
/// <param name="strHead">Start marker.</param>
/// <param name="strBot">End marker; searched for after the end of the start marker.</param>
/// <returns>The section including both markers, or "" when either marker is missing,
/// or any argument is null or empty.</returns>
public string CutX(string s, string strHead, string strBot)
{
    if (string.IsNullOrEmpty(s) || string.IsNullOrEmpty(strHead) || string.IsNullOrEmpty(strBot))
    {
        return "";
    }

    // Index 0 is a valid match position; only -1 means "not found".
    int iBegin = s.IndexOf(strHead, StringComparison.Ordinal);
    if (iBegin < 0)
    {
        return "";
    }

    // Start after the head marker so the tail is never matched inside the head itself.
    int iEnd = s.IndexOf(strBot, iBegin + strHead.Length, StringComparison.Ordinal);
    if (iEnd < 0)
    {
        return "";
    }

    return s.Substring(iBegin, iEnd - iBegin + strBot.Length);
}
/// <summary>
/// Extracts the text between the first <paramref name="strHead"/> in
/// <paramref name="s"/> and the next <paramref name="strBot"/> after it. Neither marker
/// is included in the result.
/// </summary>
/// <param name="s">Source text to cut from.</param>
/// <param name="strHead">Start marker.</param>
/// <param name="strBot">End marker; searched for after the end of the start marker.</param>
/// <returns>The text between the markers, or "" when either marker is missing, nothing
/// lies between them, or any argument is null or empty.</returns>
public string Cut(string s, string strHead, string strBot)
{
    if (string.IsNullOrEmpty(s) || string.IsNullOrEmpty(strHead) || string.IsNullOrEmpty(strBot))
    {
        return "";
    }

    // Check "not found" before adding the marker length; otherwise -1 + Length
    // looks like a valid position.
    int headAt = s.IndexOf(strHead, StringComparison.Ordinal);
    if (headAt < 0)
    {
        return "";
    }

    int iBegin = headAt + strHead.Length;
    // iBegin may equal s.Length (head at the very end); IndexOf accepts that and returns -1.
    int iEnd = s.IndexOf(strBot, iBegin, StringComparison.Ordinal);
    if (iEnd <= iBegin)
    {
        return "";
    }

    return s.Substring(iBegin, iEnd - iBegin);
}
/// <summary>
/// Removes every HTML tag from the given markup.
/// </summary>
/// <param name="strHTML">HTML text to clean.</param>
/// <returns>The input with each "&lt;...&gt;" run (at least one character between the
/// brackets) deleted.</returns>
public string FilterHTML(string strHTML)
{
    // A tag is "<", one or more characters other than ">", then ">".
    System.Text.RegularExpressions.Regex tagPattern = new System.Text.RegularExpressions.Regex(@"<[^>]+>");
    return tagPattern.Replace(strHTML, "");
}
/// <summary>
/// Removes script-like content from HTML: script, noscript, object, iframe and noiframe
/// elements together with everything inside them, plus HTML comments.
/// </summary>
/// <param name="strHTML">HTML text to clean.</param>
/// <returns>The filtered HTML; a null or empty input is returned unchanged.</returns>
public string FilterScript(string strHTML)
{
    if (string.IsNullOrEmpty(strHTML))
    {
        return strHTML;
    }

    // IgnoreCase covers any tag casing; Singleline lets "." span line breaks so
    // multi-line blocks are removed as well.
    System.Text.RegularExpressions.RegexOptions options =
        System.Text.RegularExpressions.RegexOptions.IgnoreCase |
        System.Text.RegularExpressions.RegexOptions.Singleline;

    string[] blockTags = { "script", "noscript", "object", "iframe", "noiframe" };
    foreach (string tag in blockTags)
    {
        // Opening tag with optional attributes, lazily up to the first matching closing tag.
        string pattern = "<" + tag + @"\b[^>]*>.*?</" + tag + @"[^>]*>";
        strHTML = System.Text.RegularExpressions.Regex.Replace(strHTML, pattern, "", options);
    }

    // HTML comments.
    strHTML = System.Text.RegularExpressions.Regex.Replace(strHTML, @"<!--.*?-->", "", options);
    return strHTML;
}
#region WriteTextFile - write a text file
/// <summary>
/// Writes text to a file, overwriting any existing content, using the system default
/// encoding.
/// </summary>
/// <param name="strFilePath">Path of the file to write.</param>
/// <param name="strData">Text to write.</param>
/// <returns>True when the text was written; failures propagate as exceptions.</returns>
public bool WriteTextFile(string strFilePath, string strData)
{
    // The using block closes the file even if the write throws; exceptions propagate
    // untouched, so their original stack trace is preserved.
    using (System.IO.StreamWriter sw = new System.IO.StreamWriter(strFilePath, false, System.Text.Encoding.Default))
    {
        sw.Write(strData);
    }
    return true;
}
#endregion
}
}