ASP.NET 抓取新闻
前台页面:
<%--
    Front-end page for the news fetcher: a single panel holding a URL text box
    (txtUrl) and a button (btnGet) whose click is handled server-side by
    btnGet_Click in the code-behind.
    NOTE(review): no "@ Page" directive is visible in this snippet; the
    code-behind class is named Default7 - confirm the directive exists in the
    real .aspx file.
--%>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head id="Head1" runat="server">
    <title></title>
    <%-- Loaded over HTTPS so the script is not blocked as mixed content when the page itself is served over HTTPS. --%>
    <script type="text/javascript" src="https://code.jquery.com/jquery-1.7.2.js"></script>
</head>
<body>
    <form id="form1" runat="server">
    <div>
        <%-- The code-behind hides this panel after a successful fetch. --%>
        <asp:Panel ID="Panel1" runat="server">
            网址:<asp:TextBox ID="txtUrl" runat="server"></asp:TextBox>
            <asp:Button ID="btnGet" runat="server" Text="RSS" OnClick="btnGet_Click" />
        </asp:Panel>
    </div>
    </form>
</body>
</html>
后台代码:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Text;
using System.Net;
using System.Text.RegularExpressions;

/// <summary>
/// Code-behind for the news fetcher page: downloads the page at the URL typed
/// by the visitor, cuts out the news section with a regular expression and
/// writes it to the response.
/// </summary>
public partial class Default7 : System.Web.UI.Page
{
    /// <summary>
    /// Name of the encoding used to decode the downloaded bytes. Pages served
    /// in another charset (for example UTF-8) will decode incorrectly unless
    /// this is changed.
    /// </summary>
    private const string PageEncodingName = "GB2312";

    /// <summary>
    /// Matches from the first <c>&lt;div class="mark"&gt;</c> up to the last
    /// <c>&lt;div id="ajax_page"&gt;</c> (both quantifiers are greedy).
    /// Built once and shared, instead of being recompiled on every click.
    /// </summary>
    private static readonly Regex NewsSectionRegex = new Regex(
        "<div class=\"mark\">([\\s\\S]*)</div>([\\s\\S]*)<div id=\"ajax_page\">",
        RegexOptions.Compiled);

    protected void Page_Load(object sender, EventArgs e)
    {
    }

    /// <summary>
    /// Click handler for the fetch button: downloads the requested page,
    /// extracts the news section and writes it to the response.
    /// </summary>
    /// <param name="sender">The control that raised the event.</param>
    /// <param name="e">Event data (unused).</param>
    protected void btnGet_Click(object sender, EventArgs e)
    {
        string input = (txtUrl.Text ?? string.Empty).Trim();

        // The address comes straight from the visitor. WebClient also resolves
        // file:// and ftp:// URIs, so only absolute http/https addresses are
        // allowed through; anything else is rejected and the form stays visible.
        Uri target;
        bool isWebAddress = Uri.TryCreate(input, UriKind.Absolute, out target)
            && (target.Scheme == Uri.UriSchemeHttp || target.Scheme == Uri.UriSchemeHttps);
        if (!isWebAddress)
        {
            Response.Write("网址无效:请输入以 http:// 或 https:// 开头的完整地址。");
            return;
        }

        byte[] pageData;

        // WebClient is IDisposable; release it as soon as the download is done.
        // Download failures (WebException) are left to propagate, as before.
        using (WebClient client = new WebClient())
        {
            // NOTE(review): this offers the server process's own credentials to
            // whatever host the visitor names - confirm this is really needed.
            client.Credentials = CredentialCache.DefaultCredentials;
            pageData = client.DownloadData(target);
        }

        string result = Encoding.GetEncoding(PageEncodingName).GetString(pageData);

        // When the news section is found, show only that part (the whole match);
        // otherwise fall back to the complete page text.
        Match match = NewsSectionRegex.Match(result);
        if (match.Success)
        {
            result = match.Value;
        }

        // NOTE(review): the remote markup is written to the response unencoded
        // (the page exists to display it), so any script in it runs in this
        // site's origin.
        Response.Write(result);
        Panel1.Visible = false;
    }

    /// <summary>
    /// Runs a case-insensitive regular expression over the given text and
    /// returns the non-empty group values of every match as one flat list.
    /// </summary>
    /// <param name="regexString">Regular expression pattern.</param>
    /// <param name="html">Text to search.</param>
    /// <returns>
    /// For each match in order, the value of every group (group 0, the whole
    /// match, first) that is not empty. An empty list when nothing matches.
    /// </returns>
    public List<string> GetData(string regexString, string html)
    {
        List<string> result = new List<string>();
        Regex regex = new Regex(regexString, RegexOptions.IgnoreCase);

        foreach (Match match in regex.Matches(html))
        {
            // Visit every group. Stopping at the first empty value (as the
            // previous loop did) silently dropped any non-empty groups that
            // came after an empty or non-participating one.
            foreach (Group group in match.Groups)
            {
                if (group.Value.Length != 0)
                {
                    result.Add(group.Value);
                }
            }
        }

        return result;
    }
}