.NET 网站数据抓取
最新项目需要抓取人民币汇率中间价的数据,所以写了个简单的爬虫来抓取。抓取的网站为:http://www.safe.gov.cn/wps/portal/sy/tjsj_hlzjj_inquire
#region 执行数据抓取(人民币汇率中间价) /// <summary> /// 执行数据抓取(人民币汇率中间价) /// </summary> public void CaptureData() { isExecuting = true; StringBuilder msg = new StringBuilder(); msg.AppendFormat("执行时间:{0}\r\n", DateTime.Now); msg.Append("开始抓取人民币汇率中间价...\r\n\r\n"); SetLogging(msg.ToString()); var db = new dbContext(); var trans = db.Database.BeginTransaction(); string title = ""; DateTime dt = DateTime.Now; if (executeType == "true") { title += dt.ToString("yyyy-MM-dd") + "的数据抓取"; } else { title += "时间范围为:开始时间为" + startTime + ",结束时间为" + endTime + "的数据抓取"; } try { string url = ""; string basePath = "http://www.safe.gov.cn/AppStructured/view/project_RMBQuery.action"; if(executeType == "true") { var date = DateTime.Now.ToString("yyyy-MM-dd"); url = basePath + "?projectBean.startDate=" + date + "&projectBean.endDate=" + date + "&queryYN=true"; } else { url = basePath + "?projectBean.startDate=" + startTime + "&projectBean.endDate=" + endTime + "&queryYN=true"; } //string url = "http://www.safe.gov.cn/AppStructured/view/project_RMBQuery.action?projectBean.startDate=2017-03-15&projectBean.endDate=2017-03-15&queryYN=true"; //发送请求 HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url); request.Method = "GET"; request.ProtocolVersion = HttpVersion.Version11; request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"; request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"; request.Timeout = 100000; request.Headers.Add("Accept-Encoding", "gzip, deflate"); request.Headers.Add("Accept-Language", "zh-CN,zh;q=0.8"); request.Headers.Add("Accept-Charset", "GBK,utf-8;q=0.7,*;q=0.3"); request.CookieContainer = new CookieContainer(); //接收请求 HttpWebResponse response = (HttpWebResponse)request.GetResponse(); Stream stream = response.GetResponseStream(); string resultStr = ""; //返回字符串,若翻译失败则返回空字符串 using (StreamReader reader = new StreamReader(stream, 
Encoding.GetEncoding("utf-8"))) { //开始解释结果 string result = reader.ReadToEnd(); if(!string.IsNullOrEmpty(result)) { result = result.Replace("\n",""); //过滤\n转换为空 result = result.Replace("\r", ""); //过滤\r转换为空 result = result.Replace("\t", ""); //过滤\t转换为空 result = result.Replace("\\", ""); //过滤\转换为空 result = Regex.Replace(result, @"<!--(?s).*?-->", "", RegexOptions.IgnoreCase); //过滤注释 result = result.Replace(" ", ""); //过滤nbsp标签 string tableHtml = Regex.Match(result, "<table class=\"list\" id=\"InfoTable\".*>.*</table>").ToString(); MatchCollection trHtmls = Regex.Matches(tableHtml, "<tr class=\"first\".*?>(.*?)</tr>"); foreach (Match tr in trHtmls) { #region 插入一条信息 Regex reg2 = new Regex("<td.*?>(.*?)</td>"); MatchCollection mc2 = reg2.Matches(tr.Value); List<string> dataList = new List<string>(); Match[] marr = mc2.OfType<Match>().ToArray(); for(int i=0;i<marr.Length;i++) { var value = marr[i].Groups[1].Value; dataList.Add(value); } var dateTime = Convert.ToDateTime(dataList[0]); var item = db.RMB_EXCHANGERATE.Where(p => p.TIME == dateTime).FirstOrDefault(); if(item == null) { RMB_EXCHANGERATE obj = new RMB_EXCHANGERATE(); obj.ID = Guid.NewGuid().ToString(); obj.TIME = Convert.ToDateTime(dataList[0]); obj.USD = Convert.ToDecimal(dataList[1]); obj.EUR = Convert.ToDecimal(dataList[2]); obj.JPY = Convert.ToDecimal(dataList[3]); obj.HKD = Convert.ToDecimal(dataList[4]); obj.GBP = Convert.ToDecimal(dataList[5]); obj.MYR = Convert.ToDecimal(dataList[6]); obj.SUR = Convert.ToDecimal(dataList[7]); obj.ZAR = Convert.ToDecimal(dataList[8]); obj.KRW = Convert.ToDecimal(dataList[9]); obj.AED = Convert.ToDecimal(dataList[10]); obj.SR = Convert.ToDecimal(dataList[11]); obj.HUF = Convert.ToDecimal(dataList[12]); obj.PLN = Convert.ToDecimal(dataList[13]); obj.DKK = Convert.ToDecimal(dataList[14]); obj.SEK = Convert.ToDecimal(dataList[15]); obj.NOK = Convert.ToDecimal(dataList[16]); obj.ITL = Convert.ToDecimal(dataList[17]); obj.PHP = Convert.ToDecimal(dataList[18]); obj.AUD = 
Convert.ToDecimal(dataList[19]); obj.CAD = Convert.ToDecimal(dataList[20]); obj.NZD = Convert.ToDecimal(dataList[21]); obj.SGD = Convert.ToDecimal(dataList[22]); obj.CHF = Convert.ToDecimal(dataList[23]); obj.CREATETIME = DateTime.Now; db.RMB_EXCHANGERATE.Add(obj); } #endregion } db.SaveChanges(); trans.Commit(); StringBuilder msg2 = new StringBuilder(); msg2.AppendFormat("执行时间:{0}\r\n", DateTime.Now); msg2.AppendFormat("{0}成功\r\n\r\n",title); SetLogging(msg2.ToString()); } else { StringBuilder msg2 = new StringBuilder(); msg2.AppendFormat("执行时间:{0}\r\n", DateTime.Now); msg2.AppendFormat("{0}为空\r\n\r\n\r\n",title); SetLogging(msg2.ToString()); } } isExecuting = false; //无论执行成功还是失败,完成后都要恢复状态 } catch (Exception ex) { trans.Rollback(); var message = logTemplate2(ex, title+"失败"); SetLogging(message); if (ex.Message == "请求超时") { //循环抓取 CaptureData(); } isExecuting = false; //无论执行成功还是失败,完成后都要恢复状态 } } #endregion