// 数据采集 [即与 WEB 相关的功能函数] (Data collection: web-related utility functions)
// --
using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.Text.RegularExpressions;
namespace ToolLibrary
{
/// <summary>
/// 网络爬虫[数据采集] [即与 WEB 相关的功能函数]
/// [wzrong 2008-11-06]
/// QQ:120152169
/// Email:w_zrong@163.com
/// </summary>
public class WebCrawler
{
#region 根据网站地址(URL)获取整站的 HTML
/// <summary>
/// Downloads the resource at the given URL and returns it as a string.
/// </summary>
/// <param name="urlPath">Address (URL) of the resource to download.</param>
/// <returns>
/// The downloaded text, or an empty string when the download fails for any reason
/// (null or invalid address, network error, and so on).
/// </returns>
public static string GetHtmlContentsByUrl(string urlPath)
{
    try
    {
        // "using" releases the client even when DownloadString throws.
        using (WebClient client = new WebClient())
        {
            return client.DownloadString(urlPath);
        }
    }
    catch (Exception)
    {
        // Best-effort contract: callers detect a failure through the empty result.
        return string.Empty;
    }
}
#endregion
#region 根据正则表达式(手动配置表达式)获取指定信息 返回 ArrayList
/// <summary>
/// Runs a caller-supplied regular expression over the HTML source and collects,
/// for every match, the trimmed text of capture group 1.
/// </summary>
/// <param name="htmlSource">HTML source text. Null or empty yields an empty list.</param>
/// <param name="strRegex">Regular expression; the wanted text must be in its first capture group.</param>
/// <param name="isRightToLeft">True to search from right to left, false for left to right.</param>
/// <returns>A list of strings, one per match, in the order the matches were found.</returns>
public static ArrayList GetHtmlArrayByRegex(string htmlSource, string strRegex, bool isRightToLeft)
{
    ArrayList results = new ArrayList();
    if (string.IsNullOrEmpty(htmlSource))
    {
        return results;
    }

    // Flatten the markup so that "." in the pattern can span what used to be several lines.
    // A lone "\n" (Unix line ending) has to be removed as well as "\r\n" and "\r".
    string html = htmlSource.Replace("\r", "").Replace("\n", "").Replace("\t", "");

    // The regex is built per call, so compiling it to IL would cost more than it saves.
    RegexOptions options = RegexOptions.IgnoreCase;
    if (isRightToLeft)
    {
        options |= RegexOptions.RightToLeft;
    }

    Regex rex = new Regex(strRegex, options);
    foreach (Match m in rex.Matches(html))
    {
        // Group 1 is empty when the pattern declares no capture group.
        results.Add(m.Groups[1].Value.Trim());
    }
    return results;
}
#endregion
#region 根据正则表达式(起始标志,结束标志)获取指定的信息 返回 ArrayList
/// <summary>
/// Collects every piece of text found between a start marker and an end marker.
/// The search pattern is built as startRex + "(.*?)" + endRex.
/// </summary>
/// <param name="htmlSource">HTML source text.</param>
/// <param name="startRex">Start marker. It is inserted into the pattern as-is, so regex metacharacters are active.</param>
/// <param name="endRex">End marker. It is inserted into the pattern as-is, so regex metacharacters are active.</param>
/// <param name="isRightToLeft">True to search from right to left, false for left to right.</param>
/// <returns>A list of the trimmed strings found between the two markers.</returns>
public static ArrayList GetHtmlArrayByRegex(string htmlSource, string startRex, string endRex, bool isRightToLeft)
{
    string pattern = startRex + "(.*?)" + endRex;
    // Forward the caller's direction flag instead of a hard-coded false.
    return GetHtmlArrayByRegex(htmlSource, pattern, isRightToLeft);
}
#endregion
#region 根据正则表达式(起始标志,结束标志)获取指定的信息 返回字符串 String
/// <summary>
/// Finds every piece of text between a start marker and an end marker and returns
/// all of them concatenated into one string, with no separator.
/// The search pattern is built as startRex + "(.*?)" + endRex.
/// </summary>
/// <param name="htmlSource">HTML source text.</param>
/// <param name="startRex">Start marker. It is inserted into the pattern as-is, so regex metacharacters are active.</param>
/// <param name="endRex">End marker. It is inserted into the pattern as-is, so regex metacharacters are active.</param>
/// <param name="isRightToLeft">True to search from right to left, false for left to right.</param>
/// <returns>The concatenation of all matched fragments; empty when nothing matches.</returns>
public static String GetHtmlStrByRegex(string htmlSource, string startRex, string endRex, bool isRightToLeft)
{
    string pattern = startRex + "(.*?)" + endRex;
    // Forward the caller's direction flag instead of a hard-coded false.
    ArrayList fragments = GetHtmlArrayByRegex(htmlSource, pattern, isRightToLeft);

    StringBuilder sb = new StringBuilder();
    foreach (object fragment in fragments)
    {
        sb.Append(fragment);
    }
    return sb.ToString();
}
#endregion
#region 得到分页的连接地址
/// <summary>
/// Placeholder for building the address of a paging link.
/// Not implemented: both arguments are ignored.
/// </summary>
/// <param name="oldPageUrl">Original link address (currently unused).</param>
/// <param name="PageTags">Paging tag (currently unused).</param>
/// <returns>Always an empty string.</returns>
public static String GetPageUrl(string oldPageUrl, string PageTags)
{
    // No paging logic exists yet; the empty result is the whole contract.
    return string.Empty;
}
#endregion
#region 得到网页图片的地址[多个则用分割符隔开累加]
/// <summary>
/// Extracts the value of every src="..." attribute found in the given HTML.
/// When several are found they are joined with Common.CommonConst.GAP_CHAR1,
/// each later match being placed in front of the earlier ones.
/// </summary>
/// <param name="html">HTML fragment that contains image tags.</param>
/// <returns>The joined attribute values, or an empty string when none is found.</returns>
public static String GetHtmlImgUrl(string html)
{
    ArrayList sources = GetHtmlArrayByRegex(html, "src=\"", "\"", false);

    string joined = "";
    for (int index = 0; index < sources.Count; index++)
    {
        string current = sources[index].ToString();
        // The first hit starts the result; every further hit is prepended.
        joined = index == 0
            ? current
            : current + Common.CommonConst.GAP_CHAR1 + joined;
    }
    return joined;
}
#endregion
#region 得到有效的连接地址 (对不包含域名的地址加上域名)
/// <summary>
/// Turns a possibly relative link into a full address by putting the domain in front of it.
/// Links that already contain "http://" or "https://" are returned unchanged.
/// </summary>
/// <param name="oldUrl">Original link, absolute or relative. Must not be null.</param>
/// <param name="domainUrl">Domain address, for example http://www.baidu.com/ . When null or empty, oldUrl is returned unchanged.</param>
/// <returns>The link prefixed with the domain, with exactly one "/" at the join.</returns>
public static String GetValidUrl(string oldUrl, string domainUrl)
{
    bool hasScheme =
        oldUrl.IndexOf("http://", StringComparison.OrdinalIgnoreCase) >= 0 ||
        oldUrl.IndexOf("https://", StringComparison.OrdinalIgnoreCase) >= 0;
    if (hasScheme || string.IsNullOrEmpty(domainUrl))
    {
        return oldUrl;
    }

    bool urlHasSlash = oldUrl.StartsWith("/", StringComparison.Ordinal);
    bool domainHasSlash = domainUrl.EndsWith("/", StringComparison.Ordinal);

    if (urlHasSlash && domainHasSlash)
    {
        // Both sides bring a slash: drop the domain's one.
        return domainUrl.Substring(0, domainUrl.Length - 1) + oldUrl;
    }
    if (!urlHasSlash && !domainHasSlash)
    {
        // Neither side brings a slash: add the missing separator.
        return domainUrl + "/" + oldUrl;
    }
    // Exactly one side has the slash, so plain concatenation is already correct.
    return domainUrl + oldUrl;
}
#endregion
#region 获取文件后缀和文件名称
/// <summary>
/// Splits a file string at the last occurrence of the separator into a name part and a suffix part.
/// When the input includes a path, the path stays in the name part.
/// Both outputs are empty when the input is blank or does not contain the separator.
/// </summary>
/// <param name="fileStr">File string: name plus suffix, optionally with a path, for example txtName.txt.</param>
/// <param name="splitChr">Separator character, for example '.'.</param>
/// <param name="fileName">Receives everything before the last separator.</param>
/// <param name="suffix">Receives everything after the last separator.</param>
public static void GetFileNameAndSuffix(string fileStr, char splitChr, out string fileName, out string suffix)
{
    // A blank input is treated the same as "separator not found".
    int splitAt = fileStr.Trim().Length == 0 ? -1 : fileStr.LastIndexOf(splitChr);

    if (splitAt < 0)
    {
        fileName = "";
        suffix = "";
        return;
    }

    fileName = fileStr.Substring(0, splitAt);
    suffix = fileStr.Substring(splitAt + 1);
}
#endregion
#region 获取网络图片命名名称和后缀 [如:命名名称.jpg]
/// <summary>
/// Returns the file name (name plus suffix, for example name.jpg) to use for an image,
/// taken from the part of the address after the last "/".
/// </summary>
/// <param name="imgUrl">Address of the image, for example /images/bg7.jpg.</param>
/// <param name="isOverWriteName">
/// True to replace the name part with a timestamp (yyMMddHHmmssfff, local time) while keeping the suffix;
/// false to keep the original file name.
/// </param>
/// <returns>The file name with its suffix.</returns>
public static string GetImgNameAndSuffix(string imgUrl, bool isOverWriteName)
{
    int lastSlash = imgUrl.LastIndexOf('/');
    string imgName = lastSlash >= 0 ? imgUrl.Substring(lastSlash + 1) : imgUrl;

    if (!isOverWriteName)
    {
        return imgName;
    }

    // Take the suffix from the file name only: a dot in a directory part of the
    // address would otherwise put a "/" into the suffix.
    string fileName, suffix;
    GetFileNameAndSuffix(imgName, '.', out fileName, out suffix);

    // One clock read, 24-hour "HH" and fixed-width milliseconds, so a morning and an
    // afternoon download cannot end up with the same name.
    DateTime now = DateTime.Now;
    string stamp = now.ToString("yyMMddHHmmssfff", System.Globalization.CultureInfo.InvariantCulture);
    return stamp + "." + suffix;
}
#endregion
#region 从网络上下载图片到本地服务器
/// <summary>
/// Downloads an image from the network into a local file.
/// Failures are ignored: nothing is thrown and nothing is reported to the caller.
/// </summary>
/// <param name="imgUrl">Address of the image to download.</param>
/// <param name="imgSavePath">Name of the local file that receives the data.</param>
public static void DownLoadImgToLocal(string imgUrl, string imgSavePath)
{
    try
    {
        // "using" releases the client even when DownloadFile throws.
        using (WebClient client = new WebClient())
        {
            client.DownloadFile(imgUrl, imgSavePath);
        }
    }
    catch (Exception)
    {
        // Deliberately best-effort: a failed download must not abort the caller.
    }
}
#endregion
#region 重写显示图片的 HTML 代码 <img />
/// <summary>
/// Builds a replacement image tag that points at a new folder while keeping the
/// original image file name. The result is an img element with src and alt
/// attributes, followed by a br element and one trailing space.
/// </summary>
/// <param name="imgOldHtml">Original HTML of the image; its src value supplies the file name.</param>
/// <param name="imgNewSavePath">New folder for the image. A trailing "/" is added when missing.</param>
/// <param name="imgTitle">Text for the alt attribute. It is inserted as-is, without escaping.</param>
/// <returns>The new image markup.</returns>
public static string OverWriteImgUrlInHtml(string imgOldHtml, string imgNewSavePath, string imgTitle)
{
    // Only the file name of the original address is reused.
    string fileName = GetImgNameAndSuffix(GetHtmlImgUrl(imgOldHtml), false);

    string folder = imgNewSavePath.EndsWith("/") ? imgNewSavePath : imgNewSavePath + "/";

    StringBuilder tag = new StringBuilder();
    tag.Append("<img src=\"").Append(folder).Append(fileName);
    tag.Append("\" alt=\"").Append(imgTitle);
    tag.Append("\">").Append("<br/> ");
    return tag.ToString();
}
#endregion
#region 将 HTML 中的图片地址替换成本地地址 并将其下载到本地服务器中 返回改写图片地址后的 HTML 文本
/// <summary>
/// Downloads every image referenced by an img tag in the HTML and rewrites the tag
/// so that it points at the downloaded copy.
/// </summary>
/// <param name="htmlSource">Original HTML text. Null or empty is returned unchanged.</param>
/// <param name="domainUrl">Domain of the server that hosts the images; used to complete relative addresses.</param>
/// <returns>The HTML text with the rewritten image tags.</returns>
public static string ReplaceImgDirInHtml(string htmlSource, string domainUrl)
{
    if (string.IsNullOrEmpty(htmlSource))
    {
        return htmlSource;
    }

    // Each entry is the attribute text of one tag, i.e. what sits between "<img " and ">".
    ArrayList imgAttributeTexts = GetHtmlArrayByRegex(htmlSource, "<img ", ">", false);

    // The opening "<img" is dropped here; the replacement inserted below brings its own.
    string result = htmlSource.Replace("<img", "");

    foreach (object entry in imgAttributeTexts)
    {
        string imgOldHtml = entry.ToString();
        if (imgOldHtml.Length == 0)
        {
            // string.Replace throws on an empty search text; a tag without attributes has no image anyway.
            continue;
        }

        // Folder that receives the image.
        // NOTE(review): the original comment gives "/images/news/20081107/03/" as an example - confirm against IOFunction.CreateFolder.
        string imgSavePath = IOFunction.CreateFolder(Common.CommonConst.IMG_SAVE_DIR);

        // Address as written in the page, then completed with the domain when relative.
        string imgOldUrl = GetHtmlImgUrl(imgOldHtml);
        string imgValidUrl = GetValidUrl(imgOldUrl, domainUrl);

        // Generated local file name; the SAME name is used for the download and for the new tag,
        // otherwise the page would reference a file that was never written.
        string imgName = GetImgNameAndSuffix(imgValidUrl, true);
        DownLoadImgToLocal(imgValidUrl, imgSavePath + imgName);

        string imgFolder = imgSavePath.EndsWith("/") ? imgSavePath : imgSavePath + "/";
        string imgNewHtml = "<img src=\"" + imgFolder + imgName + "\" alt=\"十一宽频\">" + "<br/> ";

        result = result.Replace(imgOldHtml, imgNewHtml);
    }

    // NOTE(review): the closing ">" of each original tag stays in the text after the inserted markup;
    // this cleanup only catches it when it directly follows another ">" - confirm the intended output.
    result = result.Replace(">>", ">");
    return result;
}
#endregion
/// <summary>
/// Downloads a web page and decodes it with the system default encoding
/// (System.Text.Encoding.Default).
/// </summary>
/// <param name="url">Address of the page.</param>
/// <returns>The page content, or an empty string when the download fails.</returns>
public static string getWebHtmlCotent(string url)
{
    try
    {
        // "using" releases the client on success and on failure alike.
        using (System.Net.WebClient client = new System.Net.WebClient())
        {
            byte[] rawBytes = client.DownloadData(url);
            return System.Text.Encoding.Default.GetString(rawBytes);
        }
    }
    catch
    {
        // Best-effort contract: callers detect a failure through the empty result.
        return "";
    }
}
/// <summary>
/// Downloads a web page and decodes it with the given encoding.
/// </summary>
/// <param name="url">Address of the page.</param>
/// <param name="encode">Encoding used to turn the downloaded bytes into text.</param>
/// <returns>The page content, or an empty string when the download or the decoding fails.</returns>
public static string getWebHtmlCotent(string url, System.Text.Encoding encode)
{
    try
    {
        // "using" releases the client on success and on failure alike.
        using (System.Net.WebClient client = new System.Net.WebClient())
        {
            byte[] rawBytes = client.DownloadData(url);
            return encode.GetString(rawBytes);
        }
    }
    catch
    {
        // Best-effort contract: callers detect a failure through the empty result.
        return "";
    }
}
/// <summary>
/// Removes all HTML markup from the text: script blocks together with their content,
/// every remaining tag, and all spaces, tabs and CR/LF line breaks.
/// </summary>
/// <param name="HtmlContents">HTML text to clean. Null or empty yields an empty string.</param>
/// <returns>The text with markup and the listed whitespace removed.</returns>
public static string getClearHtmlCode(string HtmlContents)
{
    if (string.IsNullOrEmpty(HtmlContents))
    {
        return string.Empty;
    }

    // Scripts go first, while the tag names are still separated from their attributes.
    // Singleline lets the block span line breaks; otherwise the script source would
    // survive as plain text once only its tags are stripped.
    string text = Regex.Replace(
        HtmlContents,
        "<script\\b[^>]*>.*?</script\\s*>",
        "",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);

    text = text.Replace(" ", "").Replace("\t", "").Replace("\r\n", "");

    // Every remaining tag.
    text = Regex.Replace(text, "<.*?>", "");
    return text;
}
/// <summary>
/// Strips the HTML markup from the text but keeps images and line breaks:
/// every img tag that has a src attribute is reduced to an img tag with only that attribute,
/// br tags and closing p tags become a br element, script blocks are removed with their
/// content, and every other tag is dropped. One extra line break is appended to the end
/// of the text for each kept image, and runs of line breaks are collapsed.
/// </summary>
/// <param name="HtmlContents">HTML text to clean. Null or empty yields an empty string.</param>
/// <returns>The cleaned text, containing only img and br elements as markup.</returns>
public static string doClearHtmlTagSaveImg(string HtmlContents)
{
    if (string.IsNullOrEmpty(HtmlContents))
    {
        return string.Empty;
    }

    // Tags that must survive are written with these stand-ins for "<" and ">".
    // The stand-ins must not contain "<" or ">" themselves, or the tag stripper
    // further down would remove the very tags they are meant to protect.
    const string LtToken = "&ltt;";
    const string GtToken = "&rtt;";
    const string BrToken = LtToken + "br /" + GtToken;

    Regex imgTagRegex = new Regex("<img.*?>", RegexOptions.IgnoreCase);
    Regex srcRegex = new Regex("src=\".*?\"", RegexOptions.IgnoreCase);

    // Matches are taken from the untouched input while the working copy is rewritten.
    string original = HtmlContents;
    foreach (Match imgMatch in imgTagRegex.Matches(original))
    {
        string oldImgTag = imgMatch.Value;
        Match srcMatch = srcRegex.Match(oldImgTag);
        if (srcMatch.Success)
        {
            string protectedTag = LtToken + "img " + srcMatch.Value + GtToken;
            HtmlContents = HtmlContents.Replace(oldImgTag, protectedTag) + "<br>";
        }
    }

    HtmlContents = HtmlContents.Replace("\r\n", "");

    // Line-break-like tags become protected br elements.
    HtmlContents = Regex.Replace(HtmlContents, "<br>", BrToken, RegexOptions.IgnoreCase);
    HtmlContents = Regex.Replace(HtmlContents, "<br >", BrToken, RegexOptions.IgnoreCase);
    HtmlContents = Regex.Replace(HtmlContents, "<br />", BrToken, RegexOptions.IgnoreCase);
    HtmlContents = Regex.Replace(HtmlContents, "</p>", BrToken, RegexOptions.IgnoreCase);

    // Script blocks go with their content; Singleline lets a block span line breaks.
    HtmlContents = Regex.Replace(
        HtmlContents,
        "<script\\b[^>]*>.*?</script\\s*>",
        "",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // Every remaining real tag.
    HtmlContents = Regex.Replace(HtmlContents, "<.*?>", "");

    // Bring the protected tags back as real markup.
    HtmlContents = HtmlContents.Replace(LtToken, "<").Replace(GtToken, ">");
    HtmlContents = HtmlContents.Replace("\t", "");

    // Collapse runs of line breaks, directly adjacent first, then separated by one space.
    HtmlContents = HtmlContents.Replace("<br /><br /><br /><br />", "<br />").Replace("<br /><br /><br />", "<br />").Replace("<br /><br />", "<br />");
    HtmlContents = HtmlContents.Replace("<br /> <br />", "<br />").Replace("<br /> <br />", "<br />").Replace("<br /> <br />", "<br />");
    return HtmlContents;
}
}
}