blackcore

本质的东西,深植于骨骼,扎根于灵魂! 淘实惠,各类电子版书籍

导航

HttpWebRequest 下载网页Html代码 下载文件(Remote和FTP)Get方式

在.net中可以使用XmlHttp,WebClient,HttpWebRequest等方式下载网页html源码。

使用XmlHttp需要引用Microsoft.Xml;使用HttpWebRequest时,如果网站采用了反爬虫技术,则需要模拟浏览器环境(设置User-Agent、Accept、Cookie等请求头)进行访问,才能返回相应的html源码,否则返回的内容将为空,如下所示:

例如:某电子商务网站中有站内搜索

查看源码或浏览器上的URL

这样就可以使用GET直接请求。

/// <summary>
/// Helper for downloading the HTML source of a web page with an HTTP GET request.
/// </summary>
public class WebPageUtil
{
    // Some sites apply anti-crawler checks and return nothing unless the request
    // looks like it comes from a browser, so browser-like headers and a cookie
    // container are attached to every request.
    private static readonly CookieContainer cookie = new CookieContainer();
    private static readonly string contentType = "application/x-www-form-urlencoded;";
    private static readonly string accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/x-silverlight, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/x-ms-application, application/x-ms-xbap, application/vnd.ms-xpsdocument, application/xaml+xml, application/x-silverlight-2-b1, */*";
    private static readonly string userAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)";

    /// <summary>
    /// Requests <paramref name="url"/> with the escaped <paramref name="keyword"/> appended
    /// and returns Tuple&lt;bool,string,string&gt; = success flag, page source, exception message.
    /// </summary>
    /// <param name="url">Address prefix the keyword is appended to (for example a search URL ending in "keyword=").</param>
    /// <param name="keyword">Search keyword; it is percent-encoded before being appended. Null is treated as empty.</param>
    /// <param name="encoding">Encoding used to decode the response body.</param>
    /// <param name="newUrl">Receives the final address that was requested.</param>
    /// <returns>
    /// (true, html, "") when the page was read completely; otherwise
    /// (false, "", message of the exception that was caught).
    /// </returns>
    public static Tuple<bool, string, string> GetHtmlSourceCode(string url, string keyword, Encoding encoding, out string newUrl)
    {
        bool methodStatus = false;
        string pageHtml = "", exceptionInfo = "";

        // EscapeDataString also encodes reserved characters such as '&', '#', '=' and '+',
        // so a keyword like "C#" or "a&b" cannot break the query string.
        // (Sites that accept raw keywords would also work with url + keyword.)
        newUrl = url + System.Uri.EscapeDataString(keyword ?? "");

        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(newUrl);
            request.UserAgent = userAgent;
            request.ContentType = contentType;
            request.CookieContainer = cookie;
            request.Accept = accept;
            request.Method = "GET";
            request.Timeout = 30 * 1000; // milliseconds

            // The using blocks release the response, stream and reader on every path.
            using (WebResponse response = request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, encoding))
            {
                pageHtml = reader.ReadToEnd();
            }
            methodStatus = true;
        }
        catch (Exception err)
        {
            // Failures (invalid URL, network error, null encoding, ...) are reported
            // through the returned tuple instead of being thrown to the caller.
            exceptionInfo = err.Message;
        }

        return Tuple.Create<bool, string, string>(methodStatus, pageHtml, exceptionInfo);
    }
}

调用时,直接将url和关键词组合后以GET方式就可以获取。

下载方法通过指定的URI(url地址)从远程服务器下载数据到本地应用程序。
1. 获得远程服务器url地址;
2. 获得目标文件路径;
3. 使用WebRequest对象检查文件是否存在于服务器端 (导入命名空间System.Net的引用);
4. HTTP:创建WebClient(System.Net,类似于上面提到的UploadFile方法)实例,调用其DownloadData()方法,通过指定的URI把文件资源下载到缓冲区,再写入本地路径。实际上,对于HTTP资源,使用的是"GET"方法.

   FTP:创建FtpWebRequest实例,通过使用WebRequestMethods.Ftp.DownloadFile方法,我们可以接受来自服务器的资源流,此方法使用"RETR"命令下载FTP资源;
5. DownloadData方法会返回下载资源的字节数组,我们只需要使用FileStream(using System.IO)把这个缓冲区中的字节写入本地路径;
6. 最后关闭并释放FileStream资源。

可参阅:WebClient 以及WebClient.DownloadData

RemoteDownload

/// <summary>
/// Base type for the remote download helpers: stores the source address and
/// the destination directory and exposes an overridable download operation.
/// </summary>
public abstract class RemoteDownload
{
    /// <summary>
    /// Stores the source address and the destination directory.
    /// </summary>
    /// <param name="urlString">Value assigned to <see cref="UrlString"/>.</param>
    /// <param name="destDir">Value assigned to <see cref="DestDir"/>.</param>
    public RemoteDownload(string urlString, string destDir)
    {
        UrlString = urlString;
        DestDir = destDir;
    }

    /// <summary>Address of the remote resource, as passed to the constructor.</summary>
    public string UrlString { get; set; }

    /// <summary>Destination directory, as passed to the constructor.</summary>
    public string DestDir { get; set; }

    /// <summary>
    /// Downloads the file from the remote server. This base implementation
    /// performs no work and always returns true.
    /// </summary>
    /// <returns>Always true in this base implementation.</returns>
    public virtual bool DownloadFile()
    {
        return true;
    }
}

/// <summary>
/// Downloads a file from an HTTP address into a local directory.
/// </summary>
public class HttpRemoteDownload : RemoteDownload
{
    /// <summary>
    /// Creates a downloader for the given address.
    /// </summary>
    /// <param name="urlString">Address of the remote file.</param>
    /// <param name="descFilePath">
    /// Forwarded to the base class as the destination directory (despite its name,
    /// it is combined with the file name taken from the URL).
    /// </param>
    public HttpRemoteDownload(string urlString, string descFilePath)
        : base(urlString, descFilePath)
    {
    }

    /// <summary>
    /// Downloads the resource at UrlString with WebClient.DownloadData and writes it to
    /// DestDir under the file name taken from the URL.
    /// </summary>
    /// <returns>true when the file was downloaded and written.</returns>
    /// <exception cref="Exception">
    /// Thrown when the request cannot be created for the URL or when the download or
    /// the file write fails; the original exception is available as InnerException.
    /// </exception>
    public override bool DownloadFile()
    {
        string fileName = System.IO.Path.GetFileName(this.UrlString);
        string descFilePath = System.IO.Path.Combine(this.DestDir, fileName);

        try
        {
            // Creating the request only validates the address; nothing is sent to the
            // server here, so a missing remote file is detected by the download below.
            WebRequest.Create(this.UrlString);
        }
        catch (Exception ex)
        {
            // Pass the caught exception itself: ex.InnerException is usually null
            // and would discard the real cause.
            throw new Exception("服务器上不存在对应文件", ex);
        }

        try
        {
            byte[] fileData;
            using (WebClient client = new WebClient())
            {
                fileData = client.DownloadData(this.UrlString);
            }

            // FileMode.Create truncates an existing file, so no stale bytes from a
            // longer previous download are left behind the new content.
            using (FileStream fs = new FileStream(descFilePath, FileMode.Create))
            {
                fs.Write(fileData, 0, fileData.Length);
            }
            return true;
        }
        catch (Exception ex)
        {
            throw new Exception("下载失败", ex);
        }
    }
}

FTPDownload

/// <summary>
/// Downloads a file from an FTP address into a local directory.
/// </summary>
public class FtpRemoteDownload : RemoteDownload
{
    /// <summary>
    /// Creates a downloader for the given address.
    /// </summary>
    /// <param name="urlString">Address of the remote file.</param>
    /// <param name="descFilePath">
    /// Forwarded to the base class as the destination directory (despite its name,
    /// it is combined with the file name taken from the URL).
    /// </param>
    public FtpRemoteDownload(string urlString, string descFilePath)
        : base(urlString, descFilePath)
    {
    }

    /// <summary>
    /// Issues an FTP download request (WebRequestMethods.Ftp.DownloadFile, binary mode)
    /// for UrlString and copies the response stream to DestDir under the file name
    /// taken from the URL.
    /// </summary>
    /// <returns>true when the file was downloaded and written.</returns>
    /// <exception cref="Exception">
    /// Thrown when the request, the transfer or the file write fails; the original
    /// exception is available as InnerException.
    /// </exception>
    public override bool DownloadFile()
    {
        string fileName = System.IO.Path.GetFileName(this.UrlString);
        string descFilePath = System.IO.Path.Combine(this.DestDir, fileName);

        try
        {
            FtpWebRequest reqFTP = (FtpWebRequest)WebRequest.Create(this.UrlString);
            reqFTP.Method = WebRequestMethods.Ftp.DownloadFile;
            reqFTP.UseBinary = true;

            // The local file is opened only after the server has answered, so a failed
            // request neither creates an empty file nor truncates an existing one.
            // FileMode.Create truncates, so no stale bytes from a longer previous
            // download remain after the new content.
            using (FtpWebResponse response = (FtpWebResponse)reqFTP.GetResponse())
            using (Stream ftpStream = response.GetResponseStream())
            using (FileStream outputStream = new FileStream(descFilePath, FileMode.Create))
            {
                ftpStream.CopyTo(outputStream);
            }
            return true;
        }
        catch (Exception ex)
        {
            // Pass the caught exception itself: ex.InnerException is usually null
            // and would discard the real cause.
            throw new Exception("下载失败", ex);
        }
    }
}

posted on 2011-05-28 21:42  blackcore  阅读(3895)  评论(0编辑  收藏  举报