href= 正则表达式
获取网页的新闻链接:
代码
using System;
using System.Data;
using System.Configuration;
using System.Collections;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.Text.RegularExpressions;
using System.Text;
using System.Net;
namespace WebApplication4
{
public partial class _Default : System.Web.UI.Page
{
/// <summary>
/// Downloads the page configured under the "NewsUrl" app setting, optionally narrows it
/// to the fragment between the "BeginStr" and "EndStr" markers, extracts the news
/// anchors and writes them to the response as a scrolling marquee.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
protected void Page_Load(object sender, EventArgs e)
{
    string strNewsUrl = ConfigurationSettings.AppSettings["NewsUrl"];
    string strHTML = DownLoadHtml(strNewsUrl);
    int begin = 0;

    // AppSettings yields null for a missing key; treat that the same as an empty marker
    // instead of failing with a NullReferenceException on Trim().
    string strBegin = ConfigurationSettings.AppSettings["BeginStr"];
    string strEnd = ConfigurationSettings.AppSettings["EndStr"];
    bool hasMarkers = strBegin != null && strEnd != null
        && strBegin.Trim() != "" && strEnd.Trim() != "";

    string strContent;
    if (hasMarkers)
    {
        strContent = GetHTMLContent(strHTML, strBegin, strEnd, ref begin);
    }
    else
    {
        strContent = strHTML;
    }

    MatchCollection ms = GetUrlFromHtml(strContent);
    if (ms.Count == 0)
    {
        return;
    }

    Response.Write("<marquee onmouseover=this.stop() onmouseout=this.start() scrollDelay=110 class=a><font color=#ff0000>最新消息:</font>");
    foreach (Match m in ms)
    {
        // Group 1 is the href value, group 3 the title attribute; the title doubles as link text.
        string url = m.Groups[1].Value.Trim();
        string title = m.Groups[3].Value.Trim();
        if (url == "" || title == "")
        {
            continue;
        }

        // The captured values come from a remote page and must not be echoed verbatim.
        // Decode first so entities already present in the source (e.g. &amp;) are not
        // double-encoded, then re-encode for the context they are written into.
        string safeUrl = HttpUtility.HtmlAttributeEncode(HttpUtility.HtmlDecode(url));
        string safeTitle = HttpUtility.HtmlAttributeEncode(HttpUtility.HtmlDecode(title));
        string safeText = HttpUtility.HtmlEncode(HttpUtility.HtmlDecode(title));

        // Double-quoted attributes: HtmlAttributeEncode always escapes the double quote,
        // so the value cannot break out of the attribute.
        Response.Write(string.Format("<a href=\"{0}\" title=\"{2}\" target=\"_blank\">{1}</a> ", safeUrl, safeText, safeTitle));
    }
    Response.Write("</marquee>");
}
#region 获取网页内容
public static string DownLoadHtml(string url)
{
string output = "";
Encoding encode = Encoding.UTF8;
WebClient webclient = new WebClient();
//System.Net.GlobalProxySelection.Select = System.Net.GlobalProxySelection.GetEmptyWebProxy();
try
{
using System.Data;
using System.Configuration;
using System.Collections;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.Text.RegularExpressions;
using System.Text;
using System.Net;
namespace WebApplication4
{
public partial class _Default : System.Web.UI.Page
{
/// <summary>
/// Downloads the page configured under the "NewsUrl" app setting, optionally narrows it
/// to the fragment between the "BeginStr" and "EndStr" markers, extracts the news
/// anchors and writes them to the response as a scrolling marquee.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
protected void Page_Load(object sender, EventArgs e)
{
    string strNewsUrl = ConfigurationSettings.AppSettings["NewsUrl"];
    string strHTML = DownLoadHtml(strNewsUrl);
    int begin = 0;

    // AppSettings yields null for a missing key; treat that the same as an empty marker
    // instead of failing with a NullReferenceException on Trim().
    string strBegin = ConfigurationSettings.AppSettings["BeginStr"];
    string strEnd = ConfigurationSettings.AppSettings["EndStr"];
    bool hasMarkers = strBegin != null && strEnd != null
        && strBegin.Trim() != "" && strEnd.Trim() != "";

    string strContent;
    if (hasMarkers)
    {
        strContent = GetHTMLContent(strHTML, strBegin, strEnd, ref begin);
    }
    else
    {
        strContent = strHTML;
    }

    MatchCollection ms = GetUrlFromHtml(strContent);
    if (ms.Count == 0)
    {
        return;
    }

    Response.Write("<marquee onmouseover=this.stop() onmouseout=this.start() scrollDelay=110 class=a><font color=#ff0000>最新消息:</font>");
    foreach (Match m in ms)
    {
        // Group 1 is the href value, group 3 the title attribute; the title doubles as link text.
        string url = m.Groups[1].Value.Trim();
        string title = m.Groups[3].Value.Trim();
        if (url == "" || title == "")
        {
            continue;
        }

        // The captured values come from a remote page and must not be echoed verbatim.
        // Decode first so entities already present in the source (e.g. &amp;) are not
        // double-encoded, then re-encode for the context they are written into.
        string safeUrl = HttpUtility.HtmlAttributeEncode(HttpUtility.HtmlDecode(url));
        string safeTitle = HttpUtility.HtmlAttributeEncode(HttpUtility.HtmlDecode(title));
        string safeText = HttpUtility.HtmlEncode(HttpUtility.HtmlDecode(title));

        // Double-quoted attributes: HtmlAttributeEncode always escapes the double quote,
        // so the value cannot break out of the attribute.
        Response.Write(string.Format("<a href=\"{0}\" title=\"{2}\" target=\"_blank\">{1}</a> ", safeUrl, safeText, safeTitle));
    }
    Response.Write("</marquee>");
}
#region 获取网页内容
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes it as UTF-8 text.
/// This is best-effort: any failure is logged as a trace warning and an empty string
/// is returned instead of throwing.
/// </summary>
/// <param name="url">Address to download; also sent as the Referer header.</param>
/// <returns>The response body decoded as UTF-8, or "" when the url is null/empty or the download fails.</returns>
public static string DownLoadHtml(string url)
{
    if (string.IsNullOrEmpty(url))
    {
        return "";
    }

    try
    {
        // WebClient is IDisposable; release it deterministically.
        using (WebClient webclient = new WebClient())
        {
            // The default proxy settings are used. To go through an explicit proxy, set e.g.
            // webclient.Proxy = new WebProxy("192.168.8.1", 808);
            webclient.Headers.Add("Referer", url);
            byte[] buff = webclient.DownloadData(url);
            return Encoding.UTF8.GetString(buff);
        }
    }
    catch (Exception ex)
    {
        // Keep the original "never throw" contract, but leave a trace of what went wrong
        // rather than swallowing the failure silently.
        System.Diagnostics.Trace.TraceWarning("DownLoadHtml failed for '{0}': {1}", url, ex.Message);
        return "";
    }
}
/// <summary>
/// Returns the first fragment of <paramref name="strTarget"/>, searching from index
/// <paramref name="begin"/>, that starts with <paramref name="strBegin"/> and ends with
/// <paramref name="strEnd"/>. Both markers are included in the result.
/// </summary>
/// <param name="strTarget">Text to search.</param>
/// <param name="strBegin">Marker that opens the fragment.</param>
/// <param name="strEnd">Marker that closes the fragment; searched for after <paramref name="strBegin"/>.</param>
/// <param name="begin">
/// On entry, the index to start searching from. On return, the index just past the end
/// marker when a fragment was found, otherwise -1.
/// </param>
/// <returns>The fragment including both markers, or "" when nothing was found.</returns>
public static string GetHTMLContent(string strTarget, string strBegin, string strEnd, ref int begin)
{
    // A previous miss leaves begin at -1; feeding that (or any out-of-range index, or a
    // null argument) straight into IndexOf would throw, so report "not found" instead.
    if (strTarget == null || strBegin == null || strEnd == null
        || begin < 0 || begin > strTarget.Length)
    {
        begin = -1;
        return "";
    }

    // Markers are literal markup, so compare ordinally rather than by current culture.
    int posBegin = strTarget.IndexOf(strBegin, begin, StringComparison.Ordinal);
    if (posBegin != -1)
    {
        int posEnd = strTarget.IndexOf(strEnd, posBegin + strBegin.Length, StringComparison.Ordinal);
        if (posEnd > posBegin)
        {
            int endExclusive = posEnd + strEnd.Length;
            begin = endExclusive;
            return strTarget.Substring(posBegin, endExclusive - posBegin);
        }
    }

    begin = -1;
    return "";
}
/// <summary>
/// Strips special characters from <paramref name="str"/>: carriage returns, line feeds,
/// tabs, spaces, colons, double quotes and single quotes.
/// </summary>
/// <param name="str">Text to clean.</param>
/// <returns>The text with every listed character removed.</returns>
public static string RepalceStr(string str)
{
    // Tokens are removed in this order, one pass per token.
    string[] unwanted = { "\r\n", "\"", ":", "\t", " ", "'", "\r", "\n" };
    string cleaned = str;
    foreach (string token in unwanted)
    {
        cleaned = cleaned.Replace(token, "");
    }
    return cleaned;
}
/// <summary>
/// Finds every news anchor in <paramref name="strContent"/>. An anchor matches only when
/// it carries double-quoted href, target and title attributes in exactly that order,
/// separated by single spaces, and directly wraps a single span that has a class attribute.
/// Matching is case-insensitive.
/// </summary>
/// <param name="strContent">HTML text to scan.</param>
/// <returns>
/// The matches; capture groups are 1 = href, 2 = target, 3 = title, 4 = span class,
/// 5 = span text.
/// </returns>
public static MatchCollection GetUrlFromHtml(string strContent)
{
    const string anchorPattern = "<a[\\s]+href[\\s]*=[\\s]*\"([^<\"]+)\" target=\"([^<\"]+)\" title=\"([^<\"]+)\"><span class=\"([^<\"]+)\">([^<\"]+)</span></a>";
    return Regex.Matches(strContent, anchorPattern, RegexOptions.IgnoreCase);
}
/// <summary>
/// Returns the relative address of the news page when a "NewsUrl" app setting is
/// configured, otherwise an empty string.
/// </summary>
/// <returns>"../desktop/getnewsUrl.aspx" when "NewsUrl" has a value; "" when it is missing or empty.</returns>
public static string GetNewsUrl()
{
    // AppSettings yields null (not "") for a missing key, so test for both.
    string configured = ConfigurationSettings.AppSettings["NewsUrl"];
    return string.IsNullOrEmpty(configured) ? "" : "../desktop/getnewsUrl.aspx";
}
#endregion
}
}
webclient.Headers.Add("Referer", url);
byte[] buff = webclient.DownloadData(url);
output = encode.GetString(buff);
}
catch
{
}
return output;
}
/// <summary>
/// Returns the first fragment of <paramref name="strTarget"/>, searching from index
/// <paramref name="begin"/>, that starts with <paramref name="strBegin"/> and ends with
/// <paramref name="strEnd"/>. Both markers are included in the result.
/// </summary>
/// <param name="strTarget">Text to search.</param>
/// <param name="strBegin">Marker that opens the fragment.</param>
/// <param name="strEnd">Marker that closes the fragment; searched for after <paramref name="strBegin"/>.</param>
/// <param name="begin">
/// On entry, the index to start searching from. On return, the index just past the end
/// marker when a fragment was found, otherwise -1.
/// </param>
/// <returns>The fragment including both markers, or "" when nothing was found.</returns>
public static string GetHTMLContent(string strTarget, string strBegin, string strEnd, ref int begin)
{
    // A previous miss leaves begin at -1; feeding that (or any out-of-range index, or a
    // null argument) straight into IndexOf would throw, so report "not found" instead.
    if (strTarget == null || strBegin == null || strEnd == null
        || begin < 0 || begin > strTarget.Length)
    {
        begin = -1;
        return "";
    }

    // Markers are literal markup, so compare ordinally rather than by current culture.
    int posBegin = strTarget.IndexOf(strBegin, begin, StringComparison.Ordinal);
    if (posBegin != -1)
    {
        int posEnd = strTarget.IndexOf(strEnd, posBegin + strBegin.Length, StringComparison.Ordinal);
        if (posEnd > posBegin)
        {
            int endExclusive = posEnd + strEnd.Length;
            begin = endExclusive;
            return strTarget.Substring(posBegin, endExclusive - posBegin);
        }
    }

    begin = -1;
    return "";
}
/// <summary>
/// Strips special characters from <paramref name="str"/>: carriage returns, line feeds,
/// tabs, spaces, colons, double quotes and single quotes.
/// </summary>
/// <param name="str">Text to clean.</param>
/// <returns>The text with every listed character removed.</returns>
public static string RepalceStr(string str)
{
    // Tokens are removed in this order, one pass per token.
    string[] unwanted = { "\r\n", "\"", ":", "\t", " ", "'", "\r", "\n" };
    string cleaned = str;
    foreach (string token in unwanted)
    {
        cleaned = cleaned.Replace(token, "");
    }
    return cleaned;
}
/// <summary>
/// Finds every news anchor in <paramref name="strContent"/>. An anchor matches only when
/// it carries double-quoted href, target and title attributes in exactly that order,
/// separated by single spaces, and directly wraps a single span that has a class attribute.
/// Matching is case-insensitive.
/// </summary>
/// <param name="strContent">HTML text to scan.</param>
/// <returns>
/// The matches; capture groups are 1 = href, 2 = target, 3 = title, 4 = span class,
/// 5 = span text.
/// </returns>
public static MatchCollection GetUrlFromHtml(string strContent)
{
    const string anchorPattern = "<a[\\s]+href[\\s]*=[\\s]*\"([^<\"]+)\" target=\"([^<\"]+)\" title=\"([^<\"]+)\"><span class=\"([^<\"]+)\">([^<\"]+)</span></a>";
    return Regex.Matches(strContent, anchorPattern, RegexOptions.IgnoreCase);
}
/// <summary>
/// Returns the relative address of the news page when a "NewsUrl" app setting is
/// configured, otherwise an empty string.
/// </summary>
/// <returns>"../desktop/getnewsUrl.aspx" when "NewsUrl" has a value; "" when it is missing or empty.</returns>
public static string GetNewsUrl()
{
    // AppSettings yields null (not "") for a missing key, so test for both.
    string configured = ConfigurationSettings.AppSettings["NewsUrl"];
    return string.IsNullOrEmpty(configured) ? "" : "../desktop/getnewsUrl.aspx";
}
#endregion
}
}
web.config源码:
代码
<appSettings>
<!-- URL of the page the news links are downloaded from (read by Page_Load and GetNewsUrl) -->
<add key="NewsUrl" value="http://jyj.cixi.gov.cn/"/>
<!-- Marker text where the fragment to scan begins; when BeginStr or EndStr is blank the whole page is scanned -->
<add key="BeginStr" value="var teshu_shu = 1;"/>
<!-- Marker text where the fragment to scan ends -->
<add key="EndStr" value="/script"/>
<!-- NOTE(review): not read by the code-behind shown in this article; purpose unconfirmed -->
<add key="expressMatch"/>
</appSettings>
<!--新闻获取网址-->
<add key="NewsUrl" value="http://jyj.cixi.gov.cn/"/>
<add key="BeginStr" value="var teshu_shu = 1;"/>
<add key="EndStr" value="/script"/>
<add key="expressMatch"/>
</appSettings>
该文章经过本人整理所得,欢迎转载,转载时请加上本文地址;本文基于署名 2.5 中国大陆许可协议发布,请勿演绎或用于商业目的,但是必须保留本文的署名张志涛(包含链接)。如您有任何疑问或者授权方面的协商,请给我留言。