Code
<%-- Page form: a weather button that fills div_html, and a news button that binds rptNews. --%>
<form id="form1" runat="server">
    <div id="div_html" runat="server">
    </div>
    <input type="button" id="btnok" runat="server" value="获取温度" onserverclick="btnok_ServerClick"/>
    <asp:Button ID="btnNews" runat="server" OnClick="btnNews_Click" Text="新闻" />
    <%-- The caption row lives in HeaderTemplate so it is rendered once, not once per data row.
         NOTE(review): the Eval values below are written to the page without HTML encoding. --%>
    <table>
        <asp:Repeater ID="rptNews" runat="server">
            <HeaderTemplate>
                <tr>
                    <td>ID</td>
                    <td>Name</td>
                    <td>Pic</td>
                </tr>
            </HeaderTemplate>
            <ItemTemplate>
                <tr>
                    <td><%#Eval("ID") %></td>
                    <td><%#Eval("Name") %></td>
                    <td><%#Eval("Pic")%></td>
                </tr>
            </ItemTemplate>
        </asp:Repeater>
    </table>
</form>
<%-- Page form: a weather button that fills div_html, and a news button that binds rptNews. --%>
<form id="form1" runat="server">
    <div id="div_html" runat="server">
    </div>
    <input type="button" id="btnok" runat="server" value="获取温度" onserverclick="btnok_ServerClick"/>
    <asp:Button ID="btnNews" runat="server" OnClick="btnNews_Click" Text="新闻" />
    <%-- The caption row lives in HeaderTemplate so it is rendered once, not once per data row.
         NOTE(review): the Eval values below are written to the page without HTML encoding. --%>
    <table>
        <asp:Repeater ID="rptNews" runat="server">
            <HeaderTemplate>
                <tr>
                    <td>ID</td>
                    <td>Name</td>
                    <td>Pic</td>
                </tr>
            </HeaderTemplate>
            <ItemTemplate>
                <tr>
                    <td><%#Eval("ID") %></td>
                    <td><%#Eval("Name") %></td>
                    <td><%#Eval("Pic")%></td>
                </tr>
            </ItemTemplate>
        </asp:Repeater>
    </table>
</form>
Code
using System;
using System.Data;
using System.Configuration;
using System.Collections;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.Text.RegularExpressions;
using System.IO;
using System.Net;
using System.Text;
namespace sqlhelper
{
public partial class WebForm2 : System.Web.UI.Page
{
/// <summary>
/// Page load handler. It performs no work; the page's behaviour lives in the
/// button click handlers below.
/// </summary>
protected void Page_Load(object sender, EventArgs e)
{
}
#region 抓天气
/// <summary>
/// Click handler for the weather button: downloads the Sina weather search page
/// for Beijing, extracts one captured fragment with each of three regular
/// expressions (city, date, temperature range) and writes the three fragments,
/// concatenated in that order, into <c>div_html</c>.
/// </summary>
protected void btnok_ServerClick(object sender, EventArgs e)
{
    string pageHtml = _GetHtml(@"http://php.weather.sina.com.cn/search.php?city=北京");

    // The city and date patterns both start from the same opening tag.
    string cityDataOpen = "<div class=\"City_Data\">";
    string cityPattern = cityDataOpen + @"(?:.|\n)*?<h3>(?:[\u4e00-\u9fa5]+)(?:\s+)([\u4e00-\u9fa5]+)</h3>";
    string datePattern = cityDataOpen + @"(?:.|\n)*?<p>(?:\d+[\u4e00-\u9fa5]+)+?-(?:\d+[\u4e00-\u9fa5]+) ([\u4e00-\u9fa5]+)</p>";
    string weatherPattern = "<div class=\"Weather_TP\">" + @"[\u4e00-\u9fa5]+\s+(-?\d+℃~-?\d+℃)</div>";

    // Group 1 of each match holds the wanted text.
    Match cityMatch = Regex.Match(pageHtml, cityPattern);
    Match dateMatch = Regex.Match(pageHtml, datePattern);
    Match weatherMatch = Regex.Match(pageHtml, weatherPattern);

    div_html.InnerHtml = string.Concat(
        cityMatch.Groups[1].Value,
        dateMatch.Groups[1].Value,
        weatherMatch.Groups[1].Value);
}
/// <summary>
/// Downloads the body of <paramref name="Url"/> over HTTP and returns it as text.
/// </summary>
/// <param name="Url">Address to fetch; the request is cast to <see cref="HttpWebRequest"/>.</param>
/// <returns>
/// The response body decoded with <see cref="Encoding.Default"/>, or the literal
/// string "错误" when the request or the read fails for any reason.
/// </returns>
public static string _GetHtml(string Url)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);

        // The using blocks release the response, stream and reader even when the read throws.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream body = response.GetResponseStream())
        // NOTE(review): Encoding.Default depends on the machine configuration; confirm it
        // matches the charset actually served by the remote page.
        using (StreamReader reader = new StreamReader(body, Encoding.Default))
        {
            // ReadToEnd replaces the former 256-char read loop with repeated string concatenation.
            return reader.ReadToEnd();
        }
    }
    catch (Exception)
    {
        // Best-effort contract kept from the original: callers receive a marker string, never an exception.
        return "错误";
    }
}
#endregion
#region 抓新闻
/// <summary>
/// Click handler for the news button: scrapes up to seven thread links from the
/// forum listing page and binds the resulting table to <c>rptNews</c>.
/// </summary>
protected void btnNews_Click(object sender, EventArgs e)
{
    // An alternative source that was tried earlier: GetHtmlCode(@"http://news.sina.com.cn/china/", 3)
    DataTable threadLinks = GetHtmlCode(@"http://bbs.le8le.com/forumdisplay.php?fid=62", 7);

    rptNews.DataSource = threadLinks;
    rptNews.DataBind();
}
// Table that GetHtmlCode fills and returns; Setdt creates its columns.
DataTable dt = new DataTable();

/// <summary>
/// Ensures <c>dt</c> has the columns ID, Name, URL and Pic.
/// Safe to call repeatedly: columns that already exist are left alone, whereas
/// adding them again would throw <see cref="DuplicateNameException"/>.
/// </summary>
private void Setdt()
{
    string[] columnNames = { "ID", "Name", "URL", "Pic" };
    foreach (string columnName in columnNames)
    {
        if (!dt.Columns.Contains(columnName))
        {
            dt.Columns.Add(columnName);
        }
    }
}
/// <summary>
/// Downloads a forum listing page and collects the thread links found in it.
/// </summary>
/// <param name="Url">Address of the page to scrape.</param>
/// <param name="i">Maximum number of rows to collect; zero or a negative value yields no rows.</param>
/// <returns>
/// A table with the columns ID (1-based sequence number), Name (link text),
/// URL and Pic (both hold the link address produced by <c>GetBBSUrl</c>).
/// </returns>
/// <remarks>
/// Failures from the download or the parsing propagate unchanged instead of
/// being rewrapped, so their type and stack trace are kept.
/// </remarks>
public DataTable GetHtmlCode(string Url, int i)
{
    string strHtml = GetURlHTML(Url);

    // Start from an empty table on every call so that a second call on the same
    // instance neither re-adds the columns nor appends to earlier rows.
    dt = new DataTable();
    Setdt();

    // Matches markup such as
    //   <span id="thread_2224"><a href="viewthread.php?tid=2224&extra=page%3D1">title</a></span>
    // where [\S\s]*? lazily matches any characters, line breaks included.
    // RegexOptions.Compiled is omitted: the instance is used for one scan and then discarded.
    string TempRegex = "<span id=\"([\\S\\s]*?)\"><a href=\"([\\S\\s]*?)\">([\\S\\s]*?)</a></span>";
    Regex regex = new Regex(TempRegex, RegexOptions.IgnoreCase);

    // x is both the 1-based row id and the count guard; comparing with <= avoids
    // incrementing the caller's limit (which overflowed for int.MaxValue).
    int x = 1;
    for (Match match = regex.Match(strHtml); match.Success && x <= i; match = match.NextMatch())
    {
        string link = GetBBSUrl(Url, match.Groups[2].ToString());

        DataRow datarow = dt.NewRow();
        datarow["ID"] = x;
        datarow["Name"] = match.Groups[3].ToString();
        datarow["Url"] = link;
        datarow["Pic"] = link;
        dt.Rows.Add(datarow);
        x++;
    }

    // Committing once after the loop is equivalent to committing after every row.
    dt.AcceptChanges();
    return dt;
}
/// <summary>
/// Turns a link taken from a scraped page into an absolute address.
/// </summary>
/// <param name="Url">Address of the page the link was found on.</param>
/// <param name="BBSUrl">The link as written in the page (relative, root-relative or absolute).</param>
/// <returns>
/// <paramref name="BBSUrl"/> resolved against <paramref name="Url"/>. When
/// <paramref name="Url"/> is not a valid absolute URI, the text of <paramref name="Url"/>
/// up to and including its last '/' followed by <paramref name="BBSUrl"/>.
/// </returns>
private string GetBBSUrl(string Url, string BBSUrl)
{
    Uri pageUri;
    Uri resolved;

    // Standard URI resolution handles absolute and root-relative links, and is not
    // confused by '/' characters inside the page's query string.
    if (Uri.TryCreate(Url, UriKind.Absolute, out pageUri) && Uri.TryCreate(pageUri, BBSUrl, out resolved))
    {
        return resolved.AbsoluteUri;
    }

    // Fallback: the original plain concatenation.
    string TempUrl = Url.Substring(0, Url.LastIndexOf('/') + 1);
    return TempUrl + BBSUrl;
}
/// <summary>
/// Downloads the resource at <paramref name="Url"/> and returns its content as text.
/// </summary>
/// <param name="Url">Address to fetch.</param>
/// <returns>The response body decoded with <see cref="System.Text.Encoding.Default"/>.</returns>
/// <remarks>
/// Exceptions raised while creating the request, fetching the response or reading
/// it propagate unchanged, keeping their original type and stack trace.
/// </remarks>
private string GetURlHTML(string Url)
{
    WebRequest request = WebRequest.Create(Url);

    // The using blocks close the response and the reader even when reading fails.
    using (WebResponse response = request.GetResponse())
    // NOTE(review): Encoding.Default depends on the machine configuration; confirm it
    // matches the charset actually served by the remote page.
    using (StreamReader reader = new StreamReader(response.GetResponseStream(), System.Text.Encoding.Default))
    {
        return reader.ReadToEnd();
    }
}
/// <summary>
/// Downloads one fixed forum thread page, cuts out the block of one specific post
/// and collects the <c>src</c> value of every <c>img</c> tag inside that block.
/// </summary>
/// <param name="URl">
/// NOTE(review): currently unused — the page address is hard-coded below.
/// </param>
/// <returns>
/// NOTE(review): always an empty string; <c>PicUrl</c> is never assigned and the
/// collected <c>dtimg</c> rows are discarded. Confirm the intended result.
/// </returns>
private string SaveImgToLocal(string URl)
{
// Local table that receives one row per image address found.
DataTable dtimg = new DataTable();
dtimg.Columns.Add("imgpath");
// Disabled variant that used the parameter: GetURlHTML(URl)
string HTML = GetURlHTML(@"http://bbs.le8le.com/viewthread.php?tid=2223&extra=page%3D1");
string Temp = "<img.*?src=\"(.*?)\".*?/>";// image pattern, applied to htm (the fragment cut out of HTML)
// Fragment pattern: a (?=...) lookahead is used to decide where the post block ends.
// NOTE(review): the post ids (8769) are hard-coded, and the greedy (.|\n)* followed by a
// lookahead may backtrack heavily on large pages.
string Contm = "<div id=\"postmessage_8769\" class=\"t_msgfont\">(.|\\n)*</div>(?=(.|\\n)*?<div id=\"post_rate_div_8769\">)";
Regex regex = new Regex(Contm, RegexOptions.Compiled | RegexOptions.IgnoreCase);
// Whole match (group 0) is the post block.
string htm = regex.Match(HTML).Groups[0].Value;
string PicUrl = "";
regex = new Regex(Temp, RegexOptions.Compiled | RegexOptions.IgnoreCase);
// Disabled variant that returned only the first image: PicUrl = regex.Match(htm).Groups[1].Value
for (Match match = regex.Match(htm); match.Success; match = match.NextMatch())
{
// Group 1 is the value of the src attribute.
DataRow dtimgrow = dtimg.NewRow();
dtimgrow["imgpath"] = match.Groups[1].Value;
dtimg.Rows.Add(dtimgrow);
dtimg.AcceptChanges();
}
return PicUrl;
}
#endregion
}
}
using System;
using System.Data;
using System.Configuration;
using System.Collections;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.Text.RegularExpressions;
using System.IO;
using System.Net;
using System.Text;
namespace sqlhelper
{
public partial class WebForm2 : System.Web.UI.Page
{
/// <summary>
/// Page load handler. It performs no work; the page's behaviour lives in the
/// button click handlers below.
/// </summary>
protected void Page_Load(object sender, EventArgs e)
{
}
#region 抓天气
/// <summary>
/// Click handler for the weather button: downloads the Sina weather search page
/// for Beijing, extracts one captured fragment with each of three regular
/// expressions (city, date, temperature range) and writes the three fragments,
/// concatenated in that order, into <c>div_html</c>.
/// </summary>
protected void btnok_ServerClick(object sender, EventArgs e)
{
    string pageHtml = _GetHtml(@"http://php.weather.sina.com.cn/search.php?city=北京");

    // The city and date patterns both start from the same opening tag.
    string cityDataOpen = "<div class=\"City_Data\">";
    string cityPattern = cityDataOpen + @"(?:.|\n)*?<h3>(?:[\u4e00-\u9fa5]+)(?:\s+)([\u4e00-\u9fa5]+)</h3>";
    string datePattern = cityDataOpen + @"(?:.|\n)*?<p>(?:\d+[\u4e00-\u9fa5]+)+?-(?:\d+[\u4e00-\u9fa5]+) ([\u4e00-\u9fa5]+)</p>";
    string weatherPattern = "<div class=\"Weather_TP\">" + @"[\u4e00-\u9fa5]+\s+(-?\d+℃~-?\d+℃)</div>";

    // Group 1 of each match holds the wanted text.
    Match cityMatch = Regex.Match(pageHtml, cityPattern);
    Match dateMatch = Regex.Match(pageHtml, datePattern);
    Match weatherMatch = Regex.Match(pageHtml, weatherPattern);

    div_html.InnerHtml = string.Concat(
        cityMatch.Groups[1].Value,
        dateMatch.Groups[1].Value,
        weatherMatch.Groups[1].Value);
}
/// <summary>
/// Downloads the body of <paramref name="Url"/> over HTTP and returns it as text.
/// </summary>
/// <param name="Url">Address to fetch; the request is cast to <see cref="HttpWebRequest"/>.</param>
/// <returns>
/// The response body decoded with <see cref="Encoding.Default"/>, or the literal
/// string "错误" when the request or the read fails for any reason.
/// </returns>
public static string _GetHtml(string Url)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);

        // The using blocks release the response, stream and reader even when the read throws.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream body = response.GetResponseStream())
        // NOTE(review): Encoding.Default depends on the machine configuration; confirm it
        // matches the charset actually served by the remote page.
        using (StreamReader reader = new StreamReader(body, Encoding.Default))
        {
            // ReadToEnd replaces the former 256-char read loop with repeated string concatenation.
            return reader.ReadToEnd();
        }
    }
    catch (Exception)
    {
        // Best-effort contract kept from the original: callers receive a marker string, never an exception.
        return "错误";
    }
}
#endregion
#region 抓新闻
/// <summary>
/// Click handler for the news button: scrapes up to seven thread links from the
/// forum listing page and binds the resulting table to <c>rptNews</c>.
/// </summary>
protected void btnNews_Click(object sender, EventArgs e)
{
    // An alternative source that was tried earlier: GetHtmlCode(@"http://news.sina.com.cn/china/", 3)
    DataTable threadLinks = GetHtmlCode(@"http://bbs.le8le.com/forumdisplay.php?fid=62", 7);

    rptNews.DataSource = threadLinks;
    rptNews.DataBind();
}
// Table that GetHtmlCode fills and returns; Setdt creates its columns.
DataTable dt = new DataTable();

/// <summary>
/// Ensures <c>dt</c> has the columns ID, Name, URL and Pic.
/// Safe to call repeatedly: columns that already exist are left alone, whereas
/// adding them again would throw <see cref="DuplicateNameException"/>.
/// </summary>
private void Setdt()
{
    string[] columnNames = { "ID", "Name", "URL", "Pic" };
    foreach (string columnName in columnNames)
    {
        if (!dt.Columns.Contains(columnName))
        {
            dt.Columns.Add(columnName);
        }
    }
}
/// <summary>
/// Downloads a forum listing page and collects the thread links found in it.
/// </summary>
/// <param name="Url">Address of the page to scrape.</param>
/// <param name="i">Maximum number of rows to collect; zero or a negative value yields no rows.</param>
/// <returns>
/// A table with the columns ID (1-based sequence number), Name (link text),
/// URL and Pic (both hold the link address produced by <c>GetBBSUrl</c>).
/// </returns>
/// <remarks>
/// Failures from the download or the parsing propagate unchanged instead of
/// being rewrapped, so their type and stack trace are kept.
/// </remarks>
public DataTable GetHtmlCode(string Url, int i)
{
    string strHtml = GetURlHTML(Url);

    // Start from an empty table on every call so that a second call on the same
    // instance neither re-adds the columns nor appends to earlier rows.
    dt = new DataTable();
    Setdt();

    // Matches markup such as
    //   <span id="thread_2224"><a href="viewthread.php?tid=2224&extra=page%3D1">title</a></span>
    // where [\S\s]*? lazily matches any characters, line breaks included.
    // RegexOptions.Compiled is omitted: the instance is used for one scan and then discarded.
    string TempRegex = "<span id=\"([\\S\\s]*?)\"><a href=\"([\\S\\s]*?)\">([\\S\\s]*?)</a></span>";
    Regex regex = new Regex(TempRegex, RegexOptions.IgnoreCase);

    // x is both the 1-based row id and the count guard; comparing with <= avoids
    // incrementing the caller's limit (which overflowed for int.MaxValue).
    int x = 1;
    for (Match match = regex.Match(strHtml); match.Success && x <= i; match = match.NextMatch())
    {
        string link = GetBBSUrl(Url, match.Groups[2].ToString());

        DataRow datarow = dt.NewRow();
        datarow["ID"] = x;
        datarow["Name"] = match.Groups[3].ToString();
        datarow["Url"] = link;
        datarow["Pic"] = link;
        dt.Rows.Add(datarow);
        x++;
    }

    // Committing once after the loop is equivalent to committing after every row.
    dt.AcceptChanges();
    return dt;
}
/// <summary>
/// Turns a link taken from a scraped page into an absolute address.
/// </summary>
/// <param name="Url">Address of the page the link was found on.</param>
/// <param name="BBSUrl">The link as written in the page (relative, root-relative or absolute).</param>
/// <returns>
/// <paramref name="BBSUrl"/> resolved against <paramref name="Url"/>. When
/// <paramref name="Url"/> is not a valid absolute URI, the text of <paramref name="Url"/>
/// up to and including its last '/' followed by <paramref name="BBSUrl"/>.
/// </returns>
private string GetBBSUrl(string Url, string BBSUrl)
{
    Uri pageUri;
    Uri resolved;

    // Standard URI resolution handles absolute and root-relative links, and is not
    // confused by '/' characters inside the page's query string.
    if (Uri.TryCreate(Url, UriKind.Absolute, out pageUri) && Uri.TryCreate(pageUri, BBSUrl, out resolved))
    {
        return resolved.AbsoluteUri;
    }

    // Fallback: the original plain concatenation.
    string TempUrl = Url.Substring(0, Url.LastIndexOf('/') + 1);
    return TempUrl + BBSUrl;
}
/// <summary>
/// Downloads the resource at <paramref name="Url"/> and returns its content as text.
/// </summary>
/// <param name="Url">Address to fetch.</param>
/// <returns>The response body decoded with <see cref="System.Text.Encoding.Default"/>.</returns>
/// <remarks>
/// Exceptions raised while creating the request, fetching the response or reading
/// it propagate unchanged, keeping their original type and stack trace.
/// </remarks>
private string GetURlHTML(string Url)
{
    WebRequest request = WebRequest.Create(Url);

    // The using blocks close the response and the reader even when reading fails.
    using (WebResponse response = request.GetResponse())
    // NOTE(review): Encoding.Default depends on the machine configuration; confirm it
    // matches the charset actually served by the remote page.
    using (StreamReader reader = new StreamReader(response.GetResponseStream(), System.Text.Encoding.Default))
    {
        return reader.ReadToEnd();
    }
}
/// <summary>
/// Downloads one fixed forum thread page, cuts out the block of one specific post
/// and collects the <c>src</c> value of every <c>img</c> tag inside that block.
/// </summary>
/// <param name="URl">
/// NOTE(review): currently unused — the page address is hard-coded below.
/// </param>
/// <returns>
/// NOTE(review): always an empty string; <c>PicUrl</c> is never assigned and the
/// collected <c>dtimg</c> rows are discarded. Confirm the intended result.
/// </returns>
private string SaveImgToLocal(string URl)
{
// Local table that receives one row per image address found.
DataTable dtimg = new DataTable();
dtimg.Columns.Add("imgpath");
// Disabled variant that used the parameter: GetURlHTML(URl)
string HTML = GetURlHTML(@"http://bbs.le8le.com/viewthread.php?tid=2223&extra=page%3D1");
string Temp = "<img.*?src=\"(.*?)\".*?/>";// image pattern, applied to htm (the fragment cut out of HTML)
// Fragment pattern: a (?=...) lookahead is used to decide where the post block ends.
// NOTE(review): the post ids (8769) are hard-coded, and the greedy (.|\n)* followed by a
// lookahead may backtrack heavily on large pages.
string Contm = "<div id=\"postmessage_8769\" class=\"t_msgfont\">(.|\\n)*</div>(?=(.|\\n)*?<div id=\"post_rate_div_8769\">)";
Regex regex = new Regex(Contm, RegexOptions.Compiled | RegexOptions.IgnoreCase);
// Whole match (group 0) is the post block.
string htm = regex.Match(HTML).Groups[0].Value;
string PicUrl = "";
regex = new Regex(Temp, RegexOptions.Compiled | RegexOptions.IgnoreCase);
// Disabled variant that returned only the first image: PicUrl = regex.Match(htm).Groups[1].Value
for (Match match = regex.Match(htm); match.Success; match = match.NextMatch())
{
// Group 1 is the value of the src attribute.
DataRow dtimgrow = dtimg.NewRow();
dtimgrow["imgpath"] = match.Groups[1].Value;
dtimg.Rows.Add(dtimgrow);
dtimg.AcceptChanges();
}
return PicUrl;
}
#endregion
}
}