C# 简单实现提取网页内容
下面的代码是从一个网络爬虫程序中提取出来的,觉得有用,记录下来。
代码
using System;
using System.Collections;
using System.Collections.Generic;
using System.Reflection;
using System.IO;
using System.Net;
using System.Text;
namespace MyCsStudy
{
    /// <summary>
    /// Console sample that downloads a web page and prints its text.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Downloads the resource at <paramref name="url"/> over HTTP and returns
        /// the response body decoded as text.
        /// </summary>
        /// <param name="url">Absolute URL to request; it must include the protocol scheme.</param>
        /// <param name="charset">
        /// Name of the encoding used to decode the body. May be null or empty, in which
        /// case the charset reported by the response is used, then <see cref="Encoding.Default"/>.
        /// </param>
        /// <returns>The decoded response body.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
        /// <exception cref="ArgumentException">
        /// <paramref name="charset"/> is non-empty but is not a recognised encoding name.
        /// </exception>
        public static string Fetch(string url, string charset)
        {
            if (url == null)
            {
                throw new ArgumentNullException("url");
            }

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

            // The using statements release the response and its stream on every path,
            // including when the request or the decoding throws.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream resStream = response.GetResponseStream())
            {
                Encoding encoding = ResolveEncoding(charset, response.CharacterSet);
                using (StreamReader sr = new StreamReader(resStream, encoding))
                {
                    return sr.ReadToEnd();
                }
            }
        }

        /// <summary>
        /// Chooses the text encoding: the caller's charset first, then the charset
        /// reported by the response, then <see cref="Encoding.Default"/>.
        /// </summary>
        private static Encoding ResolveEncoding(string charset, string responseCharset)
        {
            if (!string.IsNullOrEmpty(charset))
            {
                // An explicit caller choice is honoured as-is; an invalid name is a
                // caller error and is allowed to propagate.
                return Encoding.GetEncoding(charset);
            }

            if (!string.IsNullOrEmpty(responseCharset))
            {
                try
                {
                    // Strip whitespace and quotes that may surround the header value.
                    return Encoding.GetEncoding(responseCharset.Trim(' ', '"', '\''));
                }
                catch (ArgumentException)
                {
                    // The server advertised a charset this runtime does not know;
                    // fall back to the default instead of failing the whole fetch.
                }
            }

            return Encoding.Default;
        }

        /// <summary>
        /// Fetches a fixed page, writes its content to the console and waits for Enter.
        /// </summary>
        static void Main(string[] args)
        {
            string webSite = @"http://www.google.cn"; // the URL must include the protocol scheme
            string strHTML = Fetch(webSite, null);
            Console.Write(strHTML);
            Console.ReadLine();
        }
    }
}
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Reflection;
using System.Text;
namespace MyCsStudy
{
    /// <summary>
    /// Console sample that downloads a web page and prints its text.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Downloads the resource at <paramref name="url"/> over HTTP and returns
        /// the response body decoded as text.
        /// </summary>
        /// <param name="url">Absolute URL to request; it must include the protocol scheme.</param>
        /// <param name="charset">
        /// Name of the encoding used to decode the body. May be null or empty, in which
        /// case the charset reported by the response is used, then <see cref="Encoding.Default"/>.
        /// </param>
        /// <returns>The decoded response body.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
        /// <exception cref="ArgumentException">
        /// <paramref name="charset"/> is non-empty but is not a recognised encoding name.
        /// </exception>
        public static string Fetch(string url, string charset)
        {
            if (url == null)
            {
                throw new ArgumentNullException("url");
            }

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

            // The using statements release the response and its stream on every path,
            // including when the request or the decoding throws.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream resStream = response.GetResponseStream())
            {
                Encoding encoding = ResolveEncoding(charset, response.CharacterSet);
                using (StreamReader sr = new StreamReader(resStream, encoding))
                {
                    return sr.ReadToEnd();
                }
            }
        }

        /// <summary>
        /// Chooses the text encoding: the caller's charset first, then the charset
        /// reported by the response, then <see cref="Encoding.Default"/>.
        /// </summary>
        private static Encoding ResolveEncoding(string charset, string responseCharset)
        {
            if (!string.IsNullOrEmpty(charset))
            {
                // An explicit caller choice is honoured as-is; an invalid name is a
                // caller error and is allowed to propagate.
                return Encoding.GetEncoding(charset);
            }

            if (!string.IsNullOrEmpty(responseCharset))
            {
                try
                {
                    // Strip whitespace and quotes that may surround the header value.
                    return Encoding.GetEncoding(responseCharset.Trim(' ', '"', '\''));
                }
                catch (ArgumentException)
                {
                    // The server advertised a charset this runtime does not know;
                    // fall back to the default instead of failing the whole fetch.
                }
            }

            return Encoding.Default;
        }

        /// <summary>
        /// Fetches a fixed page, writes its content to the console and waits for Enter.
        /// </summary>
        static void Main(string[] args)
        {
            string webSite = @"http://www.google.cn"; // the URL must include the protocol scheme
            string strHTML = Fetch(webSite, null);
            Console.Write(strHTML);
            Console.ReadLine();
        }
    }
}
作者:Jeff Wong
出处:http://jeffwongishandsome.cnblogs.com/
本文版权归作者和博客园共有,欢迎围观转载。转载时请您务必在文章明显位置给出原文链接,谢谢您的合作。