wsky's blog, Record my technical life

just coding

导航

最近写的一个抓取网页内容(抓百度的)

找了个兼职开发的工作,这个算是个小任务,数据采集之前没有实际做过,这段代码效率不是很高..不过勉强算是写出来撒 汗~

代码如下:
<%@ Page Language="C#" Debug="true" %>
<%@ Import Namespace="System.Net" %>
<%@ Import Namespace="System.IO" %>
<%@ Import Namespace="System.Text.RegularExpressions" %>
<%@ Import Namespace="System.Data.OleDb" %>
<%@ Import Namespace="System.Web" %>
<%@ Import Namespace="System.Web.UI" %>
<%@ Import Namespace="System.Web.UI.WebControls" %>

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<script runat="server">
    void InitializeCommapent()
    {
        this.Page.Load += new EventHandler(Page_Load);
    }
    protected override void OnInit(EventArgs e)
    {
        InitializeCommapent();
        base.OnInit(e);
    }
   
    void Page_Load(object sender, EventArgs e)
    {
        if (!IsPostBack)
        {
           
         

            int[] pn=new int[10];//页标签
            int pnn = 0;
            for (int i = 0; i < 10; i++)
            {
                if (pnn >= 100)
                {
                    int pnn2 = pnn;
                    pn[i] = pnn2/10;
                }
                else
                    pn[i] = pnn;
                pnn += 20;
            }
           
            lblHTML.Text = "";
           
            for (int t = 0; t < pn.Length; t++)
            {
                WebRequest req = WebRequest.Create("http://yp.baidu.com/m?word=%BE%C6%B5%EA&tn=baiduyp&ct=553648128&rn=20&lm=-1&z=-1&pn="+pn[t]);
                string strResult = "";

                try
                {
                    WebResponse result = req.GetResponse();
                    Stream ReceiveStream = result.GetResponseStream();
                    //StreamReader sr=new StreamReader()
                    Byte[] read = new Byte[512];
                    int bytes = ReceiveStream.Read(read, 0, 512);

                   
                  
                    while (bytes > 0)
                    {

                   
                        Encoding encode = System.Text.Encoding.GetEncoding("gb2312");
                        //lblHTML.Text = lblHTML.Text + encode.GetString(read, 0, bytes);
                        strResult += encode.GetString(read, 0, bytes);
                        bytes = ReceiveStream.Read(read, 0, 512);

                    }
                }
                catch (Exception)
                {
                    lblHTML.Text = "检索页时出错";
                }
               
                int istart = strResult.IndexOf("<p>", 0);
                int iend = strResult.IndexOf("</p>", istart);
                strResult = strResult.Substring(istart, iend - istart + 20);
                Regex re = new Regex(@"style=margin-left:18px&gt;([\w\W]*?)&lt;br&gt;&lt;p");//遗漏最后一项
                MatchCollection Results = re.Matches(Server.HtmlEncode(strResult));



                foreach (Match match in Results)
                {
                    string cn = Regex.Match(match.ToString(), @"&lt;font class=style11&gt;([\w\W]*?)&lt;", RegexOptions.IgnoreCase).ToString();
                    cn = Regex.Match(cn, @"&gt;([\w\W]*?)&lt;", RegexOptions.IgnoreCase).ToString().Replace("&gt;", "").Replace("&lt;", "") + "酒店";//名称
                  
                    string prov = Regex.Match(match.ToString(), @"&lt;font class=f14&gt;([\w\W]*?)&lt;", RegexOptions.IgnoreCase).ToString();
                    prov = Regex.Match(prov, @"&gt;([\w\W]*?)&lt;", RegexOptions.IgnoreCase).ToString().Replace("&gt;", "").Replace("&lt;", "");
                    string[] p = prov.Split(' ');
                    prov = p[0].ToString();//省
                    string address = p[3].ToString();//地址

                    string postcode = Regex.Match(match.ToString(), @"&lt;br&gt;([\w\W]*?)&lt;/font&gt;").ToString();
                    string[] ss = postcode.Split(' ');
                    string phone = "";

                    postcode = ss[ss.Length - 1].ToString();//邮编
                    phone = ss[ss.Length - 3].ToString();//电话

                    //string cn = match.ToString();
                    lblHTML.Text += cn + "<br/>" + prov + "<br/>" + address + "<br/>" + postcode + "<br/>" + phone + "<br/>";
                    //存储
                    OleDbConnection db = new OleDbConnection("Provider = Microsoft.Jet.OLEDB.4.0;Data Source=" + Server.MapPath("CatchResult.mdb"));

                    db.Open();
                    string str = "INSERT INTO tbResult (CN,PHONE,PROV,ADDRESS,POSTCODE) VALUES('"
                        + cn + "','" + phone + "','" + prov + "','" + address + "','" + postcode + "')";
                    OleDbCommand cmd = new OleDbCommand(str, db);
                    cmd.ExecuteNonQuery();
                    cmd = null;
                    db.Close();

                }

                lblHTML.Text += "<br/><br/>第" + t + 1 + "页<br/><br/>";
            }
          
        }
    }
</script>

<html xmlns="http://www.w3.org/1999/xhtml" >
<head runat="server">
    <title>内容抓取</title>
</head>
<body>
    <form id="form1" runat="server">
    <div>
    <asp:Label runat=server ID="lblHTML" Rows="30" Cols="80" EnableViewState="false"  Wrap="True"></asp:Label>&nbsp;
        <br />
        <br />
      </div>
    </form>
</body>
</html>


posted on 2007-10-08 19:08  wsky  阅读(2551)  评论(2)  编辑  收藏  举报