.NET 网站数据抓取

最近的项目需要抓取人民币汇率中间价的数据，所以就写了个简单的爬虫来抓取数据。抓取的网站为：http://www.safe.gov.cn/wps/portal/sy/tjsj_hlzjj_inquire

#region 执行数据抓取(人民币汇率中间价)
/// <summary>
/// Scrapes the RMB central parity exchange rates and stores every row whose
/// date is not yet present in the RMB_EXCHANGERATE table.
/// </summary>
/// <remarks>
/// When <c>executeType</c> is "true" only today's date is queried; otherwise the
/// range <c>startTime</c>..<c>endTime</c> is queried. Progress and failures are
/// reported through <c>SetLogging</c>; exceptions are logged, not rethrown.
/// A request that times out is retried, up to a fixed number of attempts.
/// <c>isExecuting</c> is true for the duration of the call and is always reset,
/// even when logging or the rollback itself fails.
/// </remarks>
public void CaptureData()
{
    // Upper bound on attempts so that a site which keeps timing out cannot keep
    // this method (and the isExecuting flag) busy forever.
    const int maxAttempts = 3;

    isExecuting = true;
    try
    {
        for (int attempt = 1; attempt <= maxAttempts; attempt++)
        {
            bool timedOut = CaptureDataOnce();
            if (!timedOut)
            {
                break;
            }
        }
    }
    finally
    {
        // Reset the flag on every exit path, successful or not.
        isExecuting = false;
    }
}

/// <summary>
/// Runs a single download / parse / store pass and logs its outcome.
/// </summary>
/// <returns>true when the pass failed because of a timeout and may be retried; otherwise false.</returns>
private bool CaptureDataOnce()
{
    StringBuilder startMsg = new StringBuilder();
    startMsg.AppendFormat("执行时间:{0}\r\n", DateTime.Now);
    startMsg.Append("开始抓取人民币汇率中间价...\r\n\r\n");
    SetLogging(startMsg.ToString());

    const string basePath = "http://www.safe.gov.cn/AppStructured/view/project_RMBQuery.action";

    // Sample the clock once so the log title and the query always name the same day.
    string today = DateTime.Now.ToString("yyyy-MM-dd");
    string title;
    string url;
    if (executeType == "true")
    {
        title = today + "的数据抓取";
        url = basePath + "?projectBean.startDate=" + today + "&projectBean.endDate=" + today + "&queryYN=true";
    }
    else
    {
        title = "时间范围为:开始时间为" + startTime + ",结束时间为" + endTime + "的数据抓取";
        url = basePath + "?projectBean.startDate=" + startTime + "&projectBean.endDate=" + endTime + "&queryYN=true";
    }

    try
    {
        // Download first; the database transaction is opened only once there is
        // something to store, so it is never held across the network call.
        string html = FetchRatePageHtml(url);

        StringBuilder doneMsg = new StringBuilder();
        doneMsg.AppendFormat("执行时间:{0}\r\n", DateTime.Now);

        if (string.IsNullOrEmpty(html))
        {
            doneMsg.AppendFormat("{0}为空\r\n\r\n\r\n", title);
            SetLogging(doneMsg.ToString());
            return false;
        }

        SaveRateRows(ParseRateRows(html));

        doneMsg.AppendFormat("{0}成功\r\n\r\n", title);
        SetLogging(doneMsg.ToString());
        return false;
    }
    catch (Exception ex)
    {
        var message = logTemplate2(ex, title + "失败");
        SetLogging(message);
        return IsTimeoutException(ex);
    }
}

/// <summary>
/// Issues the GET request and returns the response body decoded as UTF-8.
/// </summary>
/// <param name="url">Fully built query URL.</param>
/// <returns>The response body; may be empty.</returns>
private static string FetchRatePageHtml(string url)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.Method = "GET";
    request.ProtocolVersion = HttpVersion.Version11;
    request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36";
    request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
    request.Timeout = 100000;
    // Let the framework advertise gzip/deflate AND decompress the reply. Sending the
    // Accept-Encoding header by hand without this would hand compressed bytes to the
    // StreamReader below.
    request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
    request.Headers.Add("Accept-Language", "zh-CN,zh;q=0.8");
    request.Headers.Add("Accept-Charset", "GBK,utf-8;q=0.7,*;q=0.3");
    request.CookieContainer = new CookieContainer();

    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream stream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(stream, Encoding.UTF8))
    {
        return reader.ReadToEnd();
    }
}

/// <summary>
/// Extracts the text of every table cell from each data row of the rate table.
/// </summary>
/// <param name="html">Raw page HTML.</param>
/// <returns>One list of cell strings per matched row, in document order.</returns>
private static List<List<string>> ParseRateRows(string html)
{
    // Flatten the markup so the single-line regular expressions below can match.
    string result = html.Replace("\n", "")
                        .Replace("\r", "")
                        .Replace("\t", "")
                        .Replace("\\", "");
    result = Regex.Replace(result, @"<!--(?s).*?-->", "", RegexOptions.IgnoreCase);   // strip HTML comments
    result = result.Replace("&nbsp;", "");

    string tableHtml = Regex.Match(result, "<table class=\"list\" id=\"InfoTable\".*>.*</table>").ToString();
    MatchCollection rowMatches = Regex.Matches(tableHtml, "<tr class=\"first\".*?>(.*?)</tr>");

    // Built once instead of once per row.
    Regex cellRegex = new Regex("<td.*?>(.*?)</td>");

    List<List<string>> rows = new List<List<string>>();
    foreach (Match row in rowMatches)
    {
        List<string> cells = new List<string>();
        foreach (Match cell in cellRegex.Matches(row.Value))
        {
            cells.Add(cell.Groups[1].Value);
        }
        rows.Add(cells);
    }
    return rows;
}

/// <summary>
/// Inserts every parsed row whose date is not already stored, inside one transaction.
/// </summary>
/// <param name="rows">Rows as returned by <c>ParseRateRows</c>: date first, then 23 rates.</param>
/// <exception cref="FormatException">A row has too few cells or a cell cannot be parsed.</exception>
private void SaveRateRows(List<List<string>> rows)
{
    // Date column plus 23 currency columns (indices 0..23 are read below).
    const int expectedColumnCount = 24;

    // The page is machine-formatted; parse it independently of the server's culture.
    IFormatProvider invariant = System.Globalization.CultureInfo.InvariantCulture;

    using (var db = new dbContext())
    using (var trans = db.Database.BeginTransaction())
    {
        try
        {
            foreach (List<string> dataList in rows)
            {
                if (dataList.Count < expectedColumnCount)
                {
                    throw new FormatException(
                        "Exchange-rate row has " + dataList.Count + " cells; expected at least " + expectedColumnCount + ".");
                }

                DateTime rateDate = Convert.ToDateTime(dataList[0], invariant);
                var item = db.RMB_EXCHANGERATE.Where(p => p.TIME == rateDate).FirstOrDefault();
                if (item != null)
                {
                    continue;   // this date is already stored
                }

                // NOTE(review): an empty cell makes Convert.ToDecimal throw and fails the whole
                // batch (same as before) — confirm whether blank rates can occur on this page.
                RMB_EXCHANGERATE obj = new RMB_EXCHANGERATE();
                obj.ID = Guid.NewGuid().ToString();
                obj.TIME = rateDate;
                obj.USD = Convert.ToDecimal(dataList[1], invariant);
                obj.EUR = Convert.ToDecimal(dataList[2], invariant);
                obj.JPY = Convert.ToDecimal(dataList[3], invariant);
                obj.HKD = Convert.ToDecimal(dataList[4], invariant);
                obj.GBP = Convert.ToDecimal(dataList[5], invariant);
                obj.MYR = Convert.ToDecimal(dataList[6], invariant);
                obj.SUR = Convert.ToDecimal(dataList[7], invariant);
                obj.ZAR = Convert.ToDecimal(dataList[8], invariant);
                obj.KRW = Convert.ToDecimal(dataList[9], invariant);
                obj.AED = Convert.ToDecimal(dataList[10], invariant);
                obj.SR = Convert.ToDecimal(dataList[11], invariant);
                obj.HUF = Convert.ToDecimal(dataList[12], invariant);
                obj.PLN = Convert.ToDecimal(dataList[13], invariant);
                obj.DKK = Convert.ToDecimal(dataList[14], invariant);
                obj.SEK = Convert.ToDecimal(dataList[15], invariant);
                obj.NOK = Convert.ToDecimal(dataList[16], invariant);
                obj.ITL = Convert.ToDecimal(dataList[17], invariant);
                obj.PHP = Convert.ToDecimal(dataList[18], invariant);
                obj.AUD = Convert.ToDecimal(dataList[19], invariant);
                obj.CAD = Convert.ToDecimal(dataList[20], invariant);
                obj.NZD = Convert.ToDecimal(dataList[21], invariant);
                obj.SGD = Convert.ToDecimal(dataList[22], invariant);
                obj.CHF = Convert.ToDecimal(dataList[23], invariant);
                obj.CREATETIME = DateTime.Now;
                db.RMB_EXCHANGERATE.Add(obj);
            }

            db.SaveChanges();
            trans.Commit();
        }
        catch
        {
            trans.Rollback();
            throw;   // let the caller log the failure
        }
    }
}

/// <summary>
/// Tells whether an exception represents a request timeout.
/// </summary>
/// <param name="ex">The exception caught during a capture pass.</param>
/// <returns>true for a timeout; otherwise false.</returns>
private static bool IsTimeoutException(Exception ex)
{
    // Prefer the status code: the message text depends on the installed language pack.
    WebException webEx = ex as WebException;
    if (webEx != null && webEx.Status == WebExceptionStatus.Timeout)
    {
        return true;
    }

    // Kept for backward compatibility with the original message-based check.
    return ex.Message == "请求超时";
}
        #endregion

 

 

posted @ 2017-09-11 10:38  潇洒飘过  阅读(760)  评论(0)  编辑  收藏  举报