asp.net 抓取新闻

前台页面:

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head id="Head1" runat="server">
    <title></title>
    <!-- jQuery is loaded over HTTPS so the script is not blocked as mixed content on HTTPS pages. -->
    <script type="text/javascript" src="https://code.jquery.com/jquery-1.7.2.js"></script>
</head>
<body>
    <form id="form1" runat="server">
    <div>
        <!-- Input panel: the code-behind hides it after the fetched content has been written out. -->
        <asp:Panel ID="Panel1" runat="server">
            网址:<asp:TextBox ID="txtUrl" runat="server"></asp:TextBox>
            <asp:Button ID="btnGet" runat="server" Text="RSS" OnClick="btnGet_Click" />
        </asp:Panel>
    </div>
    </form>
</body>
</html>

后台代码:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Text;
using System.Net;
using System.Text.RegularExpressions;

/// <summary>
/// Code-behind for the news-scraping page: downloads the page whose address the user
/// typed into <c>txtUrl</c>, cuts out the news section with a regular expression and
/// writes it to the response.
/// </summary>
public partial class Default7 : System.Web.UI.Page
{
    /// <summary>
    /// Matches from <c>&lt;div class="mark"&gt;</c> up to <c>&lt;div id="ajax_page"&gt;</c>.
    /// Built once and shared: constructing a <see cref="RegexOptions.Compiled"/> regex on
    /// every click would recompile the pattern for each request.
    /// </summary>
    private static readonly Regex NewsSectionRegex = new Regex(
        "<div class=\"mark\">([\\s\\S]*)</div>([\\s\\S]*)<div id=\"ajax_page\">",
        RegexOptions.Compiled);

    protected void Page_Load(object sender, EventArgs e)
    {

    }

    /// <summary>
    /// Click handler of the "RSS" button: fetches the page at the entered URL and
    /// writes the extracted section (or the whole page when the pattern does not
    /// match) into the response, then hides the input panel.
    /// </summary>
    /// <param name="sender">The button that raised the event.</param>
    /// <param name="e">Event data (unused).</param>
    protected void btnGet_Click(object sender, EventArgs e)
    {
        // The address comes straight from the user, so accept only absolute
        // http/https URLs. WebClient would otherwise also serve file:// and ftp://
        // addresses, and a malformed address would surface as an unhandled exception.
        Uri pageUri;
        bool isWebUrl = Uri.TryCreate(txtUrl.Text.Trim(), UriKind.Absolute, out pageUri)
            && (pageUri.Scheme == Uri.UriSchemeHttp || pageUri.Scheme == Uri.UriSchemeHttps);
        if (!isWebUrl)
        {
            // Leave the form visible so the user can correct the address.
            return;
        }

        byte[] pagedata;
        // WebClient is IDisposable; release it as soon as the download is done.
        using (WebClient myWebClient = new WebClient())
        {
            // Credentials used to authenticate the request.
            // NOTE(review): this offers the application's default credentials to
            // whatever host the user typed in - confirm that this is intended.
            myWebClient.Credentials = CredentialCache.DefaultCredentials;
            // Download the resource as a raw byte array.
            pagedata = myWebClient.DownloadData(pageUri);
        }

        // The bytes are decoded as GB2312. For a page served as UTF-8 use
        // Encoding.UTF8.GetString(pagedata) instead.
        string result = Encoding.GetEncoding("GB2312").GetString(pagedata);

        Match match = NewsSectionRegex.Match(result);
        if (match.Success)
        {
            // Group 0 is the whole matched span, including the delimiting tags.
            result = match.Groups[0].Value;
        }

        // Show the fetched content in the page.
        // NOTE(review): remote HTML is written out unencoded, so any script in the
        // fetched page runs in this site's origin - sanitize if the source is untrusted.
        Response.Write(result);
        Panel1.Visible = false;

    }

    // Disabled earlier variant of GetData, kept for reference.
    // Returns the parsed data.
    //   regexString: regular expression
    //   html:        HTML
    //   group:       group length
    //   returns:     data
    //public List<string> GetData(string regexString, string html, int group)
    //{
    //    List<string> result = new List<string>();
    //    Regex regex = new Regex(regexString, RegexOptions.IgnoreCase);
    //    MatchCollection mc = regex.Matches(html);
    //    for (int count = 0; count < mc.Count; count++)
    //    {
    //        Match m = mc[count];
    //        for (int index = 0; m.Groups[index].Value != ""; index++)
    //        {
    //            string value = m.Groups[index].Value;
    //            if (count % group != 2)
    //                value = Regex.Replace(value, "&", "");
    //            if (value == "")
    //            {
    //                result.RemoveRange((result.Count / group) * group, result.Count % group);
    //                count = (count / group) * group + group - 1;
    //                break;
    //            }
    //            result.Add(value);
    //        }
    //    }
    //    return result;
    //}

    /// <summary>
    /// Runs a case-insensitive regular expression over the HTML and returns the
    /// group values of every match as one flat list.
    /// </summary>
    /// <remarks>
    /// For each match the values are added in group order starting with group 0
    /// (the whole match). Collection for a match stops at the first group whose
    /// value is empty, so later groups of that match are not included.
    /// </remarks>
    /// <param name="regexString">Regular expression pattern.</param>
    /// <param name="html">HTML text to search.</param>
    /// <returns>The collected group values; empty when nothing matches.</returns>
    public List<string> GetData(string regexString, string html)
    {
        List<string> result = new List<string>();
        Regex regex = new Regex(regexString, RegexOptions.IgnoreCase);
        foreach (Match m in regex.Matches(html))
        {
            // Explicit upper bound instead of relying on the indexer handing back
            // an empty group for an out-of-range index.
            for (int index = 0; index < m.Groups.Count && m.Groups[index].Value != ""; index++)
            {
                result.Add(m.Groups[index].Value);
            }
        }
        return result;
    }


}

  

posted on 2013-11-13 17:55  douqiumiao  阅读(482)  评论(0)  编辑  收藏  举报