最近写的一个抓取网页内容(抓百度的)
找了个兼职开发的工作,这算是其中的一个小任务。之前没有实际做过数据采集,所以这段代码效率不是很高,不过勉强算是写出来了。
代码如下:
<%@ Page Language="C#" Debug="true" %>
<%@ Import Namespace="System.Net" %>
<%@ Import Namespace="System.IO" %>
<%@ Import Namespace="System.Text.RegularExpressions" %>
<%@ Import Namespace="System.Data.OleDb" %>
<%@ Import Namespace="System.Web" %>
<%@ Import Namespace="System.Web.UI" %>
<%@ Import Namespace="System.Web.UI.WebControls" %>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<script runat="server">
/// <summary>
/// Subscribes <c>Page_Load</c> to this page's <c>Load</c> event.
/// </summary>
/// <remarks>
/// NOTE(review): the @ Page directive of this file does not set
/// AutoEventWireup; if ASP.NET's automatic wiring of Page_Load is active,
/// this explicit subscription may make the handler run twice - confirm.
/// </remarks>
void InitializeCommapent()
{
    // Method-group conversion creates the same EventHandler delegate.
    Page.Load += Page_Load;
}
/// <summary>
/// Wires up the page's event handlers and then runs the base class
/// initialization.
/// </summary>
/// <param name="e">Event data forwarded unchanged to the base implementation.</param>
protected override void OnInit(EventArgs e)
{
    // Handlers are attached first, then the base class gets its turn.
    this.InitializeCommapent();

    base.OnInit(e);
}
/// <summary>
/// On the initial (non-postback) request, downloads ten Baidu yellow-pages
/// result pages, extracts name / province / address / postcode / phone for
/// each listed entry, stores every entry in the tbResult table of
/// CatchResult.mdb and shows the extracted values in lblHTML.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
void Page_Load(object sender, EventArgs e)
{
    if (IsPostBack)
    {
        return;
    }

    // Result offsets sent as the "pn" query parameter, one per page.
    // NOTE(review): offsets of 100 and above are divided by 10
    // (100 -> 10, 120 -> 12, ...), exactly as the original loop did;
    // confirm that this is what the remote site expects.
    int[] pn = new int[10];
    int pnn = 0;
    for (int i = 0; i < pn.Length; i++)
    {
        pn[i] = pnn >= 100 ? pnn / 10 : pnn;
        pnn += 20;
    }

    // The trailing "<br><p" anchor means the last entry of a page is not matched.
    Regex itemPattern = new Regex(@"style=margin-left:18px>([\w\W]*?)<br><p");

    // Collect the output once instead of repeatedly re-assigning lblHTML.Text.
    System.Text.StringBuilder output = new System.Text.StringBuilder();

    // One connection for the whole run; disposed even if an insert throws.
    using (OleDbConnection db = new OleDbConnection("Provider = Microsoft.Jet.OLEDB.4.0;Data Source=" + Server.MapPath("CatchResult.mdb")))
    {
        db.Open();

        for (int t = 0; t < pn.Length; t++)
        {
            string pageHtml = DownloadPage("http://yp.baidu.com/m?word=%BE%C6%B5%EA&tn=baiduyp&ct=553648128&rn=20&lm=-1&z=-1&pn=" + pn[t]);
            if (pageHtml == null)
            {
                // Report the failure and move on to the next page instead of
                // parsing an empty document (which used to throw).
                output.Append("检索页时出错");
            }
            else
            {
                // Match against the raw markup: HTML-encoding it first would
                // remove every '<' and '>' the patterns depend on.
                foreach (Match match in itemPattern.Matches(ExtractListing(pageHtml)))
                {
                    string item = match.Value;

                    // Name: text of the first style11 font element, plus the keyword suffix.
                    string cn = CaptureText(item, @"<font class=style11>([\w\W]*?)<") + "酒店";

                    // Space-separated location text: token 0 is the province, token 3 the address.
                    string[] p = CaptureText(item, @"<font class=f14>([\w\W]*?)<").Split(' ');
                    string prov = p[0];
                    string address = p.Length > 3 ? p[3] : "";

                    // Space-separated contact text: last token is the postcode,
                    // third from last is the phone number.
                    string[] ss = Regex.Match(item, @"<br>([\w\W]*?)</font>").ToString().Split(' ');
                    string postcode = ss[ss.Length - 1];
                    string phone = ss.Length >= 3 ? ss[ss.Length - 3] : "";

                    // Scraped text is untrusted, so encode it before it reaches the page.
                    output.Append(Server.HtmlEncode(cn)).Append("<br/>")
                          .Append(Server.HtmlEncode(prov)).Append("<br/>")
                          .Append(Server.HtmlEncode(address)).Append("<br/>")
                          .Append(Server.HtmlEncode(postcode)).Append("<br/>")
                          .Append(Server.HtmlEncode(phone)).Append("<br/>");

                    SaveEntry(db, cn, phone, prov, address, postcode);
                }
            }

            // Parentheses force numeric addition; without them "0" and "1"
            // were concatenated and the first page was labelled "01".
            output.Append("<br/><br/>第" + (t + 1) + "页<br/><br/>");
        }
    }

    lblHTML.Text = output.ToString();
}

/// <summary>
/// Downloads <paramref name="url"/> and decodes the body as gb2312.
/// Returns null when the request or the read fails.
/// </summary>
private static string DownloadPage(string url)
{
    try
    {
        WebRequest req = WebRequest.Create(url);
        using (WebResponse response = req.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, System.Text.Encoding.GetEncoding("gb2312")))
        {
            // The reader decodes across buffer boundaries, so double-byte
            // characters are never split the way fixed 512-byte chunks did.
            return reader.ReadToEnd();
        }
    }
    catch (WebException)
    {
        return null;
    }
    catch (IOException)
    {
        return null;
    }
}

/// <summary>
/// Returns the part of the page from the first "&lt;p&gt;" up to 20 characters
/// past the start of the following "&lt;/p&gt;" (clamped to the end of the
/// page), or an empty string when either marker is missing.
/// </summary>
private static string ExtractListing(string html)
{
    int start = html.IndexOf("<p>", StringComparison.Ordinal);
    if (start < 0)
    {
        return "";
    }

    int end = html.IndexOf("</p>", start, StringComparison.Ordinal);
    if (end < 0)
    {
        return "";
    }

    int length = Math.Min(end - start + 20, html.Length - start);
    return html.Substring(start, length);
}

/// <summary>
/// Returns the first capture group of <paramref name="pattern"/> (matched
/// case-insensitively) with any angle brackets removed, or an empty string
/// when the pattern does not match.
/// </summary>
private static string CaptureText(string html, string pattern)
{
    Match m = Regex.Match(html, pattern, RegexOptions.IgnoreCase);
    return m.Success ? m.Groups[1].Value.Replace(">", "").Replace("<", "") : "";
}

/// <summary>
/// Inserts one scraped entry into tbResult using positional parameters,
/// so quotes in the scraped text cannot alter the SQL statement.
/// </summary>
private static void SaveEntry(OleDbConnection db, string cn, string phone, string prov, string address, string postcode)
{
    using (OleDbCommand cmd = new OleDbCommand("INSERT INTO tbResult (CN,PHONE,PROV,ADDRESS,POSTCODE) VALUES(?,?,?,?,?)", db))
    {
        // OLE DB binds parameters by position, so the order must follow the column list.
        cmd.Parameters.AddWithValue("@cn", cn);
        cmd.Parameters.AddWithValue("@phone", phone);
        cmd.Parameters.AddWithValue("@prov", prov);
        cmd.Parameters.AddWithValue("@address", address);
        cmd.Parameters.AddWithValue("@postcode", postcode);
        cmd.ExecuteNonQuery();
    }
}
</script>
<html xmlns="http://www.w3.org/1999/xhtml" >
<head runat="server">
<title>内容抓取</title>
</head>
<body>
<form id="form1" runat="server">
<div>
<asp:Label runat=server ID="lblHTML" Rows="30" Cols="80" EnableViewState="false" Wrap="True"></asp:Label>
<br />
<br />
</div>
</form>
</body>
</html>
代码如下:
<%@ Page Language="C#" Debug="true" %>
<%@ Import Namespace="System.Net" %>
<%@ Import Namespace="System.IO" %>
<%@ Import Namespace="System.Text.RegularExpressions" %>
<%@ Import Namespace="System.Data.OleDb" %>
<%@ Import Namespace="System.Web" %>
<%@ Import Namespace="System.Web.UI" %>
<%@ Import Namespace="System.Web.UI.WebControls" %>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<script runat="server">
/// <summary>
/// Subscribes <c>Page_Load</c> to this page's <c>Load</c> event.
/// </summary>
/// <remarks>
/// NOTE(review): the @ Page directive of this file does not set
/// AutoEventWireup; if ASP.NET's automatic wiring of Page_Load is active,
/// this explicit subscription may make the handler run twice - confirm.
/// </remarks>
void InitializeCommapent()
{
    // Method-group conversion creates the same EventHandler delegate.
    Page.Load += Page_Load;
}
/// <summary>
/// Wires up the page's event handlers and then runs the base class
/// initialization.
/// </summary>
/// <param name="e">Event data forwarded unchanged to the base implementation.</param>
protected override void OnInit(EventArgs e)
{
    // Handlers are attached first, then the base class gets its turn.
    this.InitializeCommapent();

    base.OnInit(e);
}
/// <summary>
/// On the initial (non-postback) request, downloads ten Baidu yellow-pages
/// result pages, extracts name / province / address / postcode / phone for
/// each listed entry, stores every entry in the tbResult table of
/// CatchResult.mdb and shows the extracted values in lblHTML.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
void Page_Load(object sender, EventArgs e)
{
    if (IsPostBack)
    {
        return;
    }

    // Result offsets sent as the "pn" query parameter, one per page.
    // NOTE(review): offsets of 100 and above are divided by 10
    // (100 -> 10, 120 -> 12, ...), exactly as the original loop did;
    // confirm that this is what the remote site expects.
    int[] pn = new int[10];
    int pnn = 0;
    for (int i = 0; i < pn.Length; i++)
    {
        pn[i] = pnn >= 100 ? pnn / 10 : pnn;
        pnn += 20;
    }

    // The trailing "<br><p" anchor means the last entry of a page is not matched.
    Regex itemPattern = new Regex(@"style=margin-left:18px>([\w\W]*?)<br><p");

    // Collect the output once instead of repeatedly re-assigning lblHTML.Text.
    System.Text.StringBuilder output = new System.Text.StringBuilder();

    // One connection for the whole run; disposed even if an insert throws.
    using (OleDbConnection db = new OleDbConnection("Provider = Microsoft.Jet.OLEDB.4.0;Data Source=" + Server.MapPath("CatchResult.mdb")))
    {
        db.Open();

        for (int t = 0; t < pn.Length; t++)
        {
            string pageHtml = DownloadPage("http://yp.baidu.com/m?word=%BE%C6%B5%EA&tn=baiduyp&ct=553648128&rn=20&lm=-1&z=-1&pn=" + pn[t]);
            if (pageHtml == null)
            {
                // Report the failure and move on to the next page instead of
                // parsing an empty document (which used to throw).
                output.Append("检索页时出错");
            }
            else
            {
                // Match against the raw markup: HTML-encoding it first would
                // remove every '<' and '>' the patterns depend on.
                foreach (Match match in itemPattern.Matches(ExtractListing(pageHtml)))
                {
                    string item = match.Value;

                    // Name: text of the first style11 font element, plus the keyword suffix.
                    string cn = CaptureText(item, @"<font class=style11>([\w\W]*?)<") + "酒店";

                    // Space-separated location text: token 0 is the province, token 3 the address.
                    string[] p = CaptureText(item, @"<font class=f14>([\w\W]*?)<").Split(' ');
                    string prov = p[0];
                    string address = p.Length > 3 ? p[3] : "";

                    // Space-separated contact text: last token is the postcode,
                    // third from last is the phone number.
                    string[] ss = Regex.Match(item, @"<br>([\w\W]*?)</font>").ToString().Split(' ');
                    string postcode = ss[ss.Length - 1];
                    string phone = ss.Length >= 3 ? ss[ss.Length - 3] : "";

                    // Scraped text is untrusted, so encode it before it reaches the page.
                    output.Append(Server.HtmlEncode(cn)).Append("<br/>")
                          .Append(Server.HtmlEncode(prov)).Append("<br/>")
                          .Append(Server.HtmlEncode(address)).Append("<br/>")
                          .Append(Server.HtmlEncode(postcode)).Append("<br/>")
                          .Append(Server.HtmlEncode(phone)).Append("<br/>");

                    SaveEntry(db, cn, phone, prov, address, postcode);
                }
            }

            // Parentheses force numeric addition; without them "0" and "1"
            // were concatenated and the first page was labelled "01".
            output.Append("<br/><br/>第" + (t + 1) + "页<br/><br/>");
        }
    }

    lblHTML.Text = output.ToString();
}

/// <summary>
/// Downloads <paramref name="url"/> and decodes the body as gb2312.
/// Returns null when the request or the read fails.
/// </summary>
private static string DownloadPage(string url)
{
    try
    {
        WebRequest req = WebRequest.Create(url);
        using (WebResponse response = req.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, System.Text.Encoding.GetEncoding("gb2312")))
        {
            // The reader decodes across buffer boundaries, so double-byte
            // characters are never split the way fixed 512-byte chunks did.
            return reader.ReadToEnd();
        }
    }
    catch (WebException)
    {
        return null;
    }
    catch (IOException)
    {
        return null;
    }
}

/// <summary>
/// Returns the part of the page from the first "&lt;p&gt;" up to 20 characters
/// past the start of the following "&lt;/p&gt;" (clamped to the end of the
/// page), or an empty string when either marker is missing.
/// </summary>
private static string ExtractListing(string html)
{
    int start = html.IndexOf("<p>", StringComparison.Ordinal);
    if (start < 0)
    {
        return "";
    }

    int end = html.IndexOf("</p>", start, StringComparison.Ordinal);
    if (end < 0)
    {
        return "";
    }

    int length = Math.Min(end - start + 20, html.Length - start);
    return html.Substring(start, length);
}

/// <summary>
/// Returns the first capture group of <paramref name="pattern"/> (matched
/// case-insensitively) with any angle brackets removed, or an empty string
/// when the pattern does not match.
/// </summary>
private static string CaptureText(string html, string pattern)
{
    Match m = Regex.Match(html, pattern, RegexOptions.IgnoreCase);
    return m.Success ? m.Groups[1].Value.Replace(">", "").Replace("<", "") : "";
}

/// <summary>
/// Inserts one scraped entry into tbResult using positional parameters,
/// so quotes in the scraped text cannot alter the SQL statement.
/// </summary>
private static void SaveEntry(OleDbConnection db, string cn, string phone, string prov, string address, string postcode)
{
    using (OleDbCommand cmd = new OleDbCommand("INSERT INTO tbResult (CN,PHONE,PROV,ADDRESS,POSTCODE) VALUES(?,?,?,?,?)", db))
    {
        // OLE DB binds parameters by position, so the order must follow the column list.
        cmd.Parameters.AddWithValue("@cn", cn);
        cmd.Parameters.AddWithValue("@phone", phone);
        cmd.Parameters.AddWithValue("@prov", prov);
        cmd.Parameters.AddWithValue("@address", address);
        cmd.Parameters.AddWithValue("@postcode", postcode);
        cmd.ExecuteNonQuery();
    }
}
</script>
<html xmlns="http://www.w3.org/1999/xhtml" >
<head runat="server">
<title>内容抓取</title>
</head>
<body>
<form id="form1" runat="server">
<div>
<asp:Label runat=server ID="lblHTML" Rows="30" Cols="80" EnableViewState="false" Wrap="True"></asp:Label>
<br />
<br />
</div>
</form>
</body>
</html>