根据当前页面url匹配出页面内链接地址的完整路径
该类的功能相当于浏览器对当前页面内链接地址的解析功能
主要方法是 字符串的操作,正则表达式的匹配和替换。
比如说当前页为:http://www.test.com/list/page.aspx?id=12,其页面内链接及解析结果为:
BranchUrl | Result |
/default.aspx?id=14 | http://www.test.com/default.aspx?id=14 |
../details.aspx?id=4 | http://www.test.com/details.aspx?id=4 |
dete.aspx | http://www.test.com/list/dete.aspx |
该类C#代码:
using System;
using System.Collections;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;

/// <summary>
/// Resolves a link found inside a page against the URL of that page, roughly the way a
/// browser does. The implementation is string based; it does not normalise or escape
/// the result.
/// </summary>
/// <remarks>
/// Heuristic kept from the original implementation: when the last path segment of the
/// page URL looks like <c>name.ext</c> it is treated as a file and dropped; any other
/// last segment (for example <c>/list</c>) is treated as a directory and kept.
/// </remarks>
public class Utility
{
    /// <summary>Matches a link that already starts with one of the supported schemes.</summary>
    private static readonly Regex AbsoluteUrlPattern =
        new Regex(@"^(http|https|ftp|rtsp|mms)://", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>Matches a trailing <c>/name.ext</c> path segment (a "file").</summary>
    private static readonly Regex TrailingFilePattern =
        new Regex(@"/[^/\.]+\.[^/]+$", RegexOptions.Compiled);

    /// <summary>
    /// Resolves <paramref name="BranchUrl"/> against <paramref name="BaseUrl"/>.
    /// An absolute link (http, https, ftp, rtsp, mms) is returned as-is (trimmed);
    /// anything else is attached to the appropriate part of the page URL.
    /// </summary>
    /// <param name="BaseUrl">URL of the current page, e.g. http://www.test.com/list/page.aspx </param>
    /// <param name="BranchUrl">Link found in the page, e.g. ../test.aspx</param>
    /// <returns>The resolved URL.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="BranchUrl"/> is null, or <paramref name="BaseUrl"/> is null while
    /// <paramref name="BranchUrl"/> is not an absolute URL.
    /// </exception>
    public static string StickUrl(string BaseUrl, string BranchUrl)
    {
        if (BranchUrl == null)
        {
            throw new ArgumentNullException("BranchUrl");
        }

        // Surrounding whitespace is not part of a link; trim once so every branch below
        // sees the same value.
        string branch = BranchUrl.Trim();
        if (AbsoluteUrlPattern.IsMatch(branch))
        {
            return branch;
        }

        if (BaseUrl == null)
        {
            throw new ArgumentNullException("BaseUrl");
        }

        // The page's own query string / fragment never takes part in resolution; dropping
        // it first keeps a '/' or '.' inside the query from being mistaken for path parts.
        string page = StripQueryAndFragment(BaseUrl.Trim().Replace("\\", "/"));
        branch = branch.Replace("\\", "/");

        // Protocol-relative link ("//host/path"): reuse the scheme of the page.
        if (branch.StartsWith("//", StringComparison.Ordinal))
        {
            int schemeLength = GetSchemeLength(page);
            if (schemeLength > 0)
            {
                return page.Substring(0, schemeLength) + branch;
            }
            // No scheme on the page URL: fall through to root-relative handling.
        }

        // Root-relative link, e.g. "/test.aspx".
        if (branch.StartsWith("/", StringComparison.Ordinal))
        {
            return GetLastUrl(page, branch);
        }

        // Plain relative link, e.g. "test.aspx".
        if (!branch.StartsWith("../", StringComparison.Ordinal))
        {
            return UrlPlus(page, branch);
        }

        // Parent-relative link: climb one directory per leading "../", never above the
        // site root (scheme + host).
        int rootLength = GetRootLength(page);
        string directory = GetDirectory(page);
        while (branch.StartsWith("../", StringComparison.Ordinal))
        {
            branch = branch.Substring(3);
            int cut = directory.LastIndexOf('/');
            if (directory.Length > rootLength && cut >= rootLength)
            {
                directory = directory.Substring(0, cut);
            }
        }

        return directory + "/" + branch;
    }

    /// <summary>
    /// Resolves a root-relative link such as "/test.aspx" against the site root of the page.
    /// </summary>
    /// <param name="BaseUrl">URL of the current page, e.g. http://www.test.com/list/page.aspx</param>
    /// <param name="BranchUrl">Root-relative link, e.g. "/test.aspx"</param>
    /// <returns>Site root + "/" + the link without its leading slashes.</returns>
    private static string GetLastUrl(string BaseUrl, string BranchUrl)
    {
        string root = BaseUrl.Substring(0, GetRootLength(BaseUrl));
        return root + "/" + BranchUrl.TrimStart('/');
    }

    /// <summary>
    /// Resolves a plain relative link such as "test.aspx" against the directory of the page.
    /// </summary>
    /// <param name="front">URL of the current page, e.g. http://www.test.com/list/page.aspx </param>
    /// <param name="tail">Relative link, e.g. "test.aspx"</param>
    /// <returns>The directory of <paramref name="front"/> followed by <paramref name="tail"/>.</returns>
    private static string UrlPlus(string front, string tail)
    {
        // A page URL that already ends with '/' is a directory: append directly.
        if (front.EndsWith("/", StringComparison.Ordinal))
        {
            return front + tail;
        }

        return GetDirectory(front) + "/" + tail;
    }

    /// <summary>Returns <paramref name="url"/> without its query string and fragment.</summary>
    private static string StripQueryAndFragment(string url)
    {
        int cut = url.IndexOfAny(new char[] { '?', '#' });
        return cut < 0 ? url : url.Substring(0, cut);
    }

    /// <summary>
    /// Returns the length of the leading "scheme:" part (e.g. 5 for "http:"), or 0 when
    /// <paramref name="url"/> does not start with "scheme://".
    /// </summary>
    private static int GetSchemeLength(string url)
    {
        int separator = url.IndexOf("://", StringComparison.Ordinal);
        // The "://" only counts as a scheme separator when it holds the first '/' of the URL.
        return (separator > 0 && url.IndexOf('/') == separator + 1) ? separator + 1 : 0;
    }

    /// <summary>
    /// Returns the length of the site root ("scheme://host", or just "host" when the URL
    /// has no scheme), i.e. the index at which the path starts.
    /// </summary>
    private static int GetRootLength(string url)
    {
        int schemeLength = GetSchemeLength(url);
        int hostStart = schemeLength > 0 ? schemeLength + 2 : 0;
        int pathStart = url.IndexOf('/', hostStart);
        return pathStart < 0 ? url.Length : pathStart;
    }

    /// <summary>
    /// Returns the directory of <paramref name="url"/> without a trailing slash. Only the
    /// path is inspected, so a dotted host name is never mistaken for a file name.
    /// </summary>
    private static string GetDirectory(string url)
    {
        int rootLength = GetRootLength(url);
        string path = url.Substring(rootLength);

        if (path.EndsWith("/", StringComparison.Ordinal))
        {
            path = path.TrimEnd('/');
        }
        else if (TrailingFilePattern.IsMatch(path))
        {
            // Last segment looks like "name.ext": drop the file, keep its directory.
            path = TrailingFilePattern.Replace(path, "");
        }

        return url.Substring(0, rootLength) + path;
    }
}
主要功能借鉴于:dotNETCMSv1.0sp5 CMS。源码下载地址:http://www.51aspx.com/CV/dotNETCMS10sp5
在数据采集的时候,在文章列表页中匹配出文章内容页的完整路径。