点滴积累【C#】---抓取页面中想要的数据
效果:
描述:此功能用于抓取国外一个监测PM2.5的网站上的数据:实时读取该网站的数据,然后保存到数据库里面,每隔一小时刷新一次。
地址为:http://beijing.usembassy-china.org.cn/070109air.html
筛选后的地址为:http://utils.usembassy.gov/feed2js/feed2js.php?src=http%3A%2F%2Fwww.stateair.net%2Fweb%2Frss%2F1%2F1.xml&desc=1&num=7&targ=y&utf=y&pc=y&words=40&
思路:先抓取页面的所有数据,保存到txt文件里面;再一行一行地读取txt,用Split、Substring截取出自己想要的数据;最后保存到数据库。插入数据库时先检查该条记录是否已经存在,如果不存在则插入。
代码:
using System;
using System.Collections.Generic;
using System.Configuration;
using System.Data;
using System.Data.SqlClient;
using System.Globalization;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;

/********************************
 * Author:      Qing Pingguo (青苹果)
 * Created:     2015-12-28
 * Description: Fetches the PM2.5 data published by the United States.
 ********************************/
namespace GetUSAData
{
    /// <summary>
    /// Console job that downloads a feed page, saves it to a temporary text file,
    /// extracts the readings found in its <c>title</c> attributes and inserts the
    /// ones that are not yet stored into the <c>T_twitter</c> table.
    /// </summary>
    class Program
    {
        /// <summary>Address the data is downloaded from.</summary>
        public static string GetURL = "http://utils.usembassy.gov/feed2js/feed2js.php?src=http%3A%2F%2Fwww.stateair.net%2Fweb%2Frss%2F1%2F1.xml&desc=1&num=7&targ=y&utf=y&pc=y&words=40&";

        /// <summary>Path of the temporary txt file the page is saved to (appSettings key "txtURL").</summary>
        // ConfigurationManager replaces the obsolete ConfigurationSettings.AppSettings accessor.
        public static string txtURL = ConfigurationManager.AppSettings["txtURL"];

        /// <summary>Connection string named "ConnectionString" from the configuration file.</summary>
        public static string conn = ConfigurationManager.ConnectionStrings["ConnectionString"].ToString();

        // LoadGO reads elements [0], [2], [3] and [4] of every parsed entry.
        private const int MinReadingFields = 5;

        // Number of characters Read skips from the start of "title" to reach the value;
        // this equals the length of title=" (presumably the attribute opener — confirm against the feed).
        private const int TitleValueOffset = 7;

        static void Main(string[] args)
        {
            LoadGO();
        }

        /// <summary>
        /// Downloads the page, parses it, deletes the temporary file and stores every
        /// reading that is not yet in the database, together with a running daily average.
        /// </summary>
        /// <remarks>
        /// Entries are processed from the last parsed line to the first.
        /// Entries that cannot be parsed as a reading are reported on standard error and skipped.
        /// </remarks>
        public static void LoadGO()
        {
            GetUSA();

            List<string[]> getlist;
            try
            {
                getlist = Read(txtURL);
            }
            finally
            {
                // Remove the temporary file even when parsing fails.
                if (File.Exists(txtURL))
                {
                    File.Delete(txtURL);
                }
            }

            for (int i = getlist.Count - 1; i > -1; i--)
            {
                string[] fields = getlist[i];
                DateTime dtime;
                float LatestHourdata1;
                int LatestHourdata2;

                // The feed is machine-readable text, so parse it with the invariant culture
                // instead of whatever culture the host machine happens to use.
                // NOTE(review): assumes the feed timestamp is parseable as an invariant-culture
                // (month-first) date — confirm against a live sample.
                if (fields.Length < MinReadingFields
                    || !DateTime.TryParse(fields[0], CultureInfo.InvariantCulture, DateTimeStyles.None, out dtime)
                    || !float.TryParse(fields[2], NumberStyles.Float | NumberStyles.AllowThousands, CultureInfo.InvariantCulture, out LatestHourdata1)
                    || !int.TryParse(fields[3], NumberStyles.Integer, CultureInfo.InvariantCulture, out LatestHourdata2))
                {
                    Console.Error.WriteLine("Skipping unrecognized feed entry: " + string.Join(";", fields));
                    continue;
                }

                string getTime = dtime.ToString("yyyy-MM-dd HH:mm", CultureInfo.InvariantCulture);
                string controlTime = dtime.ToString("yyyy-MM-dd", CultureInfo.InvariantCulture);

                // Until the database says otherwise, the daily average is the reading itself.
                float Avgdata1 = LatestHourdata1;
                int Avgdata2 = LatestHourdata2;
                string Avgdata3 = fields[4];

                List<SqlParameter> listWhere = new List<SqlParameter>();
                listWhere.Add(new SqlParameter("@strDatetime", controlTime));
                string sqlSelect = @"SELECT count(*) as allcount,sum(LatestHourdata1) as LatestHourdata1_avg, sum(LatestHourdata2) as LatestHourdata2_avg
                                     FROM T_twitter
                                     where ([LatestHourdata1] is not null or [LatestHourdata2] is not null or [Avgdata1] is not null or [AvgData2] is not null)
                                       and CONVERT(varchar(100), [datetime], 23)=@strDatetime";

                // Query the per-day totals that feed the daily average.
                DataTable sumDT = ControlDB(sqlSelect, listWhere, "select");
                if (sumDT != null)
                {
                    foreach (DataRow itemDR in sumDT.Rows)
                    {
                        // Number of rows already stored for this date.
                        int allcount = Convert.ToInt32(itemDR["allcount"], CultureInfo.InvariantCulture);
                        if (allcount <= 0)
                        {
                            continue;
                        }

                        // A SQL NULL arrives as DBNull.Value, never as a C# null, so test with IsNull.
                        // (stored sum + newest reading) / (stored row count + 1) = daily average.
                        if (!itemDR.IsNull("LatestHourdata1_avg"))
                        {
                            float storedSum1 = Convert.ToSingle(itemDR["LatestHourdata1_avg"], CultureInfo.InvariantCulture);
                            Avgdata1 = (storedSum1 + LatestHourdata1) / (allcount + 1);
                        }

                        if (!itemDR.IsNull("LatestHourdata2_avg"))
                        {
                            int storedSum2 = Convert.ToInt32(itemDR["LatestHourdata2_avg"], CultureInfo.InvariantCulture);
                            // Integer division: the fractional part of the average is dropped.
                            Avgdata2 = (storedSum2 + LatestHourdata2) / (allcount + 1);
                        }

                        Avgdata3 = DescribeSeverity(Avgdata2);
                    }
                }

                List<SqlParameter> pars = new List<SqlParameter>();
                pars.Add(new SqlParameter("@whereDatetime", getTime));
                pars.Add(new SqlParameter("@datetime", getTime));
                pars.Add(new SqlParameter("@LatestHourdata1", LatestHourdata1));
                pars.Add(new SqlParameter("@LatestHourdata2", LatestHourdata2));
                pars.Add(new SqlParameter("@LatestHourdata3", fields[4]));
                pars.Add(new SqlParameter("@Avgdata1", Avgdata1));
                pars.Add(new SqlParameter("@Avgdata2", Avgdata2));
                pars.Add(new SqlParameter("@Avgdata3", Avgdata3));

                // Insert only when no row with the same timestamp exists yet.
                string sql = @"if not exists(select * from dbo.T_twitter where datetime=@whereDatetime)
                               begin
                                   insert T_twitter (datetime,LatestHourdata1,LatestHourdata2,LatestHourdata3,Avgdata1,AvgData2,AvgData3)
                                   VALUES(@datetime,@LatestHourdata1,@LatestHourdata2,@LatestHourdata3,@Avgdata1,@Avgdata2,@Avgdata3)
                               end";
                ControlDB(sql, pars, "");
            }
        }

        /// <summary>
        /// Maps an averaged value to the severity text stored in the AvgData3 column,
        /// following the bands used by the source website.
        /// </summary>
        /// <param name="average">Averaged value to classify.</param>
        /// <returns>Severity text; it starts with a space, like the descriptions parsed from the feed.</returns>
        private static string DescribeSeverity(int average)
        {
            if (average >= 0 && average <= 50)
            {
                return " Good (at 24-hour exposure at this level)";
            }
            if (average >= 51 && average <= 100)
            {
                return " Moderate (at 24-hour exposure at this level)";
            }
            if (average >= 101 && average <= 150)
            {
                return " Unhealthy for Sensitive Groups (at 24-hour exposure at this level)";
            }
            if (average >= 151 && average <= 200)
            {
                return " Unhealthy (at 24-hour exposure at this level)";
            }
            if (average >= 201 && average <= 300)
            {
                return " Very Unhealthy (at 24-hour exposure at this level)";
            }

            // Everything else, including negative values, falls in the last band.
            return " Hazardous (at 24-hour exposure at this level)";
        }

        /// <summary>
        /// Downloads the page at <see cref="GetURL"/> and saves it to <see cref="txtURL"/>.
        /// </summary>
        public static void GetUSA()
        {
            WebRequest request = WebRequest.Create(GetURL);

            // NOTE(review): the response is decoded as gb2312 although the request URL carries
            // "utf=y"; kept as-is because only the decoded text's ASCII parts are parsed — confirm.
            using (WebResponse response = request.GetResponse())
            using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("gb2312")))
            using (FileStream fs = new FileStream(txtURL, FileMode.Create))
            {
                // reader.ReadToEnd() yields the whole page source.
                byte[] data = Encoding.Default.GetBytes(reader.ReadToEnd());
                fs.Write(data, 0, data.Length);
                fs.Flush();
            }
        }

        /// <summary>
        /// Reads the txt file line by line and extracts one field array per line that
        /// contains "title".
        /// </summary>
        /// <param name="path">Path of the txt file.</param>
        /// <returns>
        /// For each matching line: the text after "title" (skipping 7 characters), cut at the
        /// first double quote, then at the first ampersand, then split on semicolons with empty
        /// parts removed.
        /// </returns>
        public static List<string[]> Read(string path)
        {
            List<string[]> list = new List<string[]>();
            using (StreamReader sr = new StreamReader(path, Encoding.Default))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    int i = line.IndexOf("title", StringComparison.Ordinal);

                    // NOTE(review): "> 0" also ignores a match at column 0; kept from the original.
                    // The second test avoids Substring running past the end of a short line.
                    if (i > 0 && i + TitleValueOffset <= line.Length)
                    {
                        string titleStr = line.Substring(i + TitleValueOffset); // text following title
                        string titledata = titleStr.Split('"')[0];              // up to the first "
                        string data = titledata.Split('&')[0];                  // up to the first &
                        string[] datastrlist = data.Split(new char[] { ';' }, StringSplitOptions.RemoveEmptyEntries);
                        list.Add(datastrlist);
                    }
                }
            }
            return list;
        }

        /// <summary>
        /// Runs a query or a non-query statement through <c>DataAccess</c>.
        /// </summary>
        /// <param name="sql">SQL text to run.</param>
        /// <param name="par">Parameters for the statement; null is treated as no parameters.</param>
        /// <param name="type">"select" to return the result set; any other value executes the statement.</param>
        /// <returns>The query result for "select"; otherwise an empty <see cref="DataTable"/>.</returns>
        public static DataTable ControlDB(string sql, List<SqlParameter> par, string type)
        {
            DataAccess controData = new DataAccess();
            SqlParameter[] parameters = par == null ? new SqlParameter[0] : par.ToArray();

            if (type == "select")
            {
                return controData.GetDataTable(sql, parameters);
            }

            controData.ExecuteSql(sql, parameters);
            return new DataTable();
        }
    }
}
Demo下载: