页面抓取

        protected void Button1_Click(object sender, EventArgs e)
        {

            byte[] b = new byte[39824];
            string url = this.TextBox1.Text.Trim();
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            HttpWebResponse response = (HttpWebResponse)request.GetResponse();

            Stream streamReceive = response.GetResponseStream();  
           
            Encoding encoding = Encoding.GetEncoding("UTF-8");  
            StreamReader streamReader = new StreamReader(streamReceive, encoding);  
            string strResult = streamReader.ReadToEnd();  
            streamReader.Close();

            string aa= stripHtml(strResult).Replace("\n","");           
            string strNO = cutstr(aa, "注册号:", "企业类型");
            string strType = cutstr(aa, "企业类型:", "企业分类");
            string strMain = cutstr(aa, "主体名称:", "法定代表人");
            string strPople = cutstr(aa, "法定代表人/负责人:", "行政区划");
            string strArea = cutstr(aa, "行政区划:", "成立日期");
            string strDate = cutstr(aa, "成立日期:", "注册资本");
            string strMoney = cutstr(aa, "注册资本:", "经营期限自");
            string strStart = cutstr(aa, "经营期限自:", "经营期限至");
            string strOver = cutstr(aa, "经营期限至:", "登记机关");
            string strRegistration = cutstr(aa, "登记机关:", "企业状态");
            string strStates = cutstr(aa, "企业状态:", "地址/住所");
            string strAddress = cutstr(aa, "地址/住所:", "经营范围");
            string strRange = cutstr(aa, "经营范围:", "许可经营范围");
            string strCheck = cutstr(aa, "年检年度:", "年检结果");
            string straaa = cutstr(aa, "年检结果:", "公示信息");
            
            //导出EXCEL
            string fPath = @"E:\website\模板.xlsx";
            string mystring = "Provider=Microsoft.ACE.OLEDB.12.0;Extended Properties='Excel 12.0;HDR=YES';data source=" + fPath;//读EXCEL2003/excel2007
            OleDbConnection cnnxls = new OleDbConnection(mystring);
            cnnxls.Open();
            string sql = "insert into [Sheet1$]  (功能区编号,街道编号,企业名称,法人代表,工商注册号,企业性质,注册资金(万元),币种,注册时间,所属行业)values('1','1','" + strMain + "','" + strPople + "','" + strNO + "','" + strType + "','" + strMoney + "','" + "人民币" + "','" + strDate + "','" + strRange + "')";
            OleDbCommand command = new OleDbCommand(sql, cnnxls);
            command.ExecuteNonQuery();
            cnnxls.Close();
        }
        //取出所有html标签
        private string stripHtml(string strHtml)
        {
            Regex objRegExp = new Regex("<(.|\n)+?>");
            string strOutput = objRegExp.Replace(strHtml, "");
            strOutput = strOutput.Replace("<", "&lt;");
            strOutput = strOutput.Replace(">", "&gt;");
            return strOutput;

        }
        //采集字符
        protected string cutstr(string str, string bs, string es)
        {
            //函数cutstr
            //功能切割字符串
            //参数说明str,需要切割的字符串,bs,开始字符串,es,结束字符串
            //由于我们要使用正则,所以要添加引用using System.Text.RegularExpressions;
            Match tempstr = Regex.Match(str, "(" + bs + ").+?(" + es + ")");
            //这是很普通的正则,我也解释不明白....
            string temp2 = tempstr.ToString();
            if (temp2.Length != 0)
            {
                string temp3 = temp2.Substring(bs.Length, temp2.Length - bs.Length - es.Length);
                //此次将取得的字符串的开始结束字符串去掉
                return temp3;
            }
            else
            {
                return "采集失败!!!";
            }
            //OK,此段看完可以返回page_load继续了...
        }
posted @ 2009-08-27 13:12  清凉的风/2.0  阅读(261)  评论(0)  编辑  收藏  举报