href= 正则表达式

获取网页的新闻链接:

 

代码
using System;
using System.Data;
using System.Configuration;
using System.Collections;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.Text.RegularExpressions;
using System.Text;
using System.Net;
namespace WebApplication4
{
    
public partial class _Default : System.Web.UI.Page
    {
        
/// <summary>
        /// Downloads the page configured under the "NewsUrl" app setting, optionally narrows it
        /// to the fragment delimited by the "BeginStr" and "EndStr" app settings, and writes
        /// every news link found in it to the response inside a scrolling marquee.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            string strNewsUrl = ConfigurationSettings.AppSettings["NewsUrl"];
            string strHTML = DownLoadHtml(strNewsUrl);

            // DownLoadHtml returns "" when the download fails; there is nothing to render then.
            if (string.IsNullOrEmpty(strHTML))
            {
                return;
            }

            // A missing key yields null, so both markers are null-checked before Trim().
            string strBegin = ConfigurationSettings.AppSettings["BeginStr"];
            string strEnd = ConfigurationSettings.AppSettings["EndStr"];
            bool hasMarkers = strBegin != null && strEnd != null
                && strBegin.Trim() != "" && strEnd.Trim() != "";

            // Without usable markers the whole page is scanned for links.
            int begin = 0;
            string strContent = hasMarkers
                ? GetHTMLContent(strHTML, strBegin, strEnd, ref begin)
                : strHTML;

            MatchCollection ms = GetUrlFromHtml(strContent);
            if (ms.Count == 0)
            {
                return;
            }

            Response.Write("<marquee onmouseover=this.stop() onmouseout=this.start() scrollDelay=110 class=a><font color=#ff0000>最新消息:</font>");

            foreach (Match m in ms)
            {
                // Group 1 is the captured href value, group 3 the captured title attribute.
                string url = m.Groups[1].Value.Trim();
                string title = m.Groups[3].Value.Trim();
                if (url == "" || title == "")
                {
                    continue;
                }

                // The captured text is raw markup from a remote site. Decode it first so
                // existing entities are not double-encoded, then encode it for the context
                // it is written into. Attributes are double-quoted so an apostrophe in the
                // scraped text cannot terminate them.
                string safeUrl = HttpUtility.HtmlAttributeEncode(HttpUtility.HtmlDecode(url));
                string safeTitleAttribute = HttpUtility.HtmlAttributeEncode(HttpUtility.HtmlDecode(title));
                string safeTitleText = HttpUtility.HtmlEncode(HttpUtility.HtmlDecode(title));

                Response.Write(string.Format(
                    "<a href=\"{0}\" title=\"{1}\"  target=\"_blank\">{2}</a>&nbsp;&nbsp;&nbsp;&nbsp;",
                    safeUrl,
                    safeTitleAttribute,
                    safeTitleText));
            }

            Response.Write("</marquee>");
        }
      
        
#region 获取网页内容

        
/// <summary>
        /// Downloads the resource at <paramref name="url"/> and returns its body decoded as UTF-8.
        /// This is a best-effort call: failures are traced and reported as an empty string.
        /// </summary>
        /// <param name="url">Address to fetch; it is also sent as the Referer header.</param>
        /// <returns>
        /// The decoded body, or an empty string when <paramref name="url"/> is null or empty
        /// or the download fails.
        /// </returns>
        public static string DownLoadHtml(string url)
        {
            if (string.IsNullOrEmpty(url))
            {
                return "";
            }

            string output = "";

            // NOTE(review): decoding is fixed to UTF-8; a page served in another charset
            // will come back garbled.
            Encoding encode = Encoding.UTF8;

            // WebClient is IDisposable; the using block releases it even when the download throws.
            using (WebClient webclient = new WebClient())
            {
                try
                {
                    // When going online through a proxy, set it as shown below; by default
                    // the IE proxy settings are used.
                    // webclient.Proxy = new WebProxy("192.168.8.1", 808);
                    webclient.Headers.Add("Referer", url);

                    byte[] buff = webclient.DownloadData(url);
                    output = encode.GetString(buff);
                }
                catch (WebException ex)
                {
                    // Network or HTTP failure.
                    LogDownloadFailure(url, ex);
                }
                catch (ArgumentException ex)
                {
                    // Address or header value rejected as malformed.
                    LogDownloadFailure(url, ex);
                }
                catch (NotSupportedException ex)
                {
                    // Unsupported scheme or operation.
                    LogDownloadFailure(url, ex);
                }
            }

            return output;
        }

        /// <summary>Writes a trace warning for a failed download instead of hiding the failure.</summary>
        private static void LogDownloadFailure(string url, Exception ex)
        {
            System.Diagnostics.Trace.TraceWarning("DownLoadHtml failed for '{0}': {1}", url, ex.Message);
        }

        
/// <summary>
        /// Extracts the first fragment of <paramref name="strTarget"/> that starts with
        /// <paramref name="strBegin"/> and ends with <paramref name="strEnd"/>, searching from
        /// offset <paramref name="begin"/>. Both markers are included in the returned text.
        /// </summary>
        /// <param name="strTarget">Text to search.</param>
        /// <param name="strBegin">Marker that opens the fragment.</param>
        /// <param name="strEnd">Marker that closes the fragment; searched for after the opening marker.</param>
        /// <param name="begin">
        /// On entry, the offset at which the search starts. On exit, the offset just past the
        /// closing marker, or -1 when no fragment was found.
        /// </param>
        /// <returns>The fragment including both markers, or an empty string when none was found.</returns>
        public static string GetHTMLContent(string strTarget, string strBegin, string strEnd, ref int begin)
        {
            // A null argument or an out-of-range offset is reported as "not found" rather than
            // thrown. The offset is -1 after an unsuccessful call, so a caller that loops would
            // otherwise hit ArgumentOutOfRangeException on the next call.
            bool searchable = strTarget != null && strBegin != null && strEnd != null
                && begin >= 0 && begin <= strTarget.Length;

            if (searchable)
            {
                // Markers are markup fragments, so they are compared ordinally, not by culture.
                int posBegin = strTarget.IndexOf(strBegin, begin, StringComparison.Ordinal);
                if (posBegin != -1)
                {
                    int posEnd = strTarget.IndexOf(strEnd, posBegin + strBegin.Length, StringComparison.Ordinal);
                    if (posEnd > posBegin)
                    {
                        int fragmentEnd = posEnd + strEnd.Length;
                        begin = fragmentEnd;
                        return strTarget.Substring(posBegin, fragmentEnd - posBegin);
                    }
                }
            }

            begin = -1;
            return "";
        }

        
/// <summary>
        /// Filters special characters: removes line breaks, tabs, quote characters and
        /// "&amp;nbsp;" entities from <paramref name="str"/>.
        /// </summary>
        /// <param name="str">Text to clean; may be null.</param>
        /// <returns>The cleaned text, or an empty string when <paramref name="str"/> is null or empty.</returns>
        public static string RepalceStr(string str)
        {
            if (string.IsNullOrEmpty(str))
            {
                return "";
            }

            // Removed in this order. "\r\n" comes first; the later "\r" and "\n" entries
            // then catch any unpaired line-break characters.
            // NOTE(review): the third entry was lost in the published listing, which reads
            // Replace(""""). A single space is assumed here — confirm against the original code.
            string[] unwanted = { "\r\n", "\"", " ", "\t", "&nbsp;", "'", "\r", "\n" };

            foreach (string token in unwanted)
            {
                str = str.Replace(token, "");
            }

            return str;
        }

        
/// <summary>
        /// Pattern for the news anchors. It matches only anchors written exactly as
        /// &lt;a href="…" target="…" title="…"&gt;&lt;span class="…"&gt;…&lt;/span&gt;&lt;/a&gt;.
        /// Capture groups: 1 = href, 2 = target, 3 = title, 4 = span class, 5 = span text.
        /// Built once and reused rather than rebuilt on every call.
        /// </summary>
        private static readonly Regex NewsLinkRegex = new Regex(
            "<a[\\s]+href[\\s]*=[\\s]*\"([^<\"]+)\" target=\"([^<\"]+)\" title=\"([^<\"]+)\"><span class=\"([^<\"]+)\">([^<\"]+)</span></a>",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Finds every news anchor in <paramref name="strContent"/>.
        /// </summary>
        /// <param name="strContent">HTML to scan; null is treated as empty.</param>
        /// <returns>
        /// The matches, case-insensitively, in document order; an empty collection when nothing
        /// matches. Group 1 of each match holds the href value and group 3 the title attribute.
        /// </returns>
        public static MatchCollection GetUrlFromHtml(string strContent)
        {
            // Regex.Matches throws ArgumentNullException for null input; treat null as "no links".
            return NewsLinkRegex.Matches(strContent ?? string.Empty);
        }

        
/// <summary>
        /// Returns the fixed path "../desktop/getnewsUrl.aspx" when the "NewsUrl" app setting
        /// has a value.
        /// </summary>
        /// <returns>
        /// "../desktop/getnewsUrl.aspx" when "NewsUrl" is configured; an empty string when the
        /// setting is missing or empty.
        /// </returns>
        public static string GetNewsUrl()
        {
            // A missing key yields null, which is treated the same as an empty value.
            string configuredUrl = ConfigurationSettings.AppSettings["NewsUrl"];

            if (string.IsNullOrEmpty(configuredUrl))
            {
                return "";
            }

            return "../desktop/getnewsUrl.aspx";
        }
        
#endregion
    }
}

 

web.config源码:

 

代码
    <appSettings>
        
<!-- News source URL: the page that Page_Load downloads and scans for links -->
        
<add key="NewsUrl" value="http://jyj.cixi.gov.cn/"/>
        
<!-- Text that opens the fragment of the page to scan; if BeginStr or EndStr is blank, the whole page is scanned -->
<add key="BeginStr" value="var teshu_shu = 1;"/>
        
<!-- Text that closes the fragment of the page to scan -->
<add key="EndStr" value="/script"/>
        
<!-- NOTE(review): has no value and is not read by the code shown in this post - confirm whether it is still needed -->
<add key="expressMatch"/>
       
</appSettings>

 

 

posted on 2010-11-22 15:25  不悔的青春  阅读(3700)  评论(0)  编辑  收藏  举报

导航