我的新浪博客 我的视频制作室 我的QQ空间

点滴积累【C#】---抓取页面中想要的数据

效果

描述:此功能是抓取外国的一个检测PM2.5的网站。实时读取网站的数据,然后保存到数据库里面。每隔一小时刷新一次。

地址为:http://beijing.usembassy-china.org.cn/070109air.html

筛选后的地址为:http://utils.usembassy.gov/feed2js/feed2js.php?src=http%3A%2F%2Fwww.stateair.net%2Fweb%2Frss%2F1%2F1.xml&desc=1&num=7&targ=y&utf=y&pc=y&words=40&

思路:先抓取到页面的所有数据,保存到txt里面,再一行一行的读取txt,然后用split,substring截取到自己想要的数据,最后保存到数据库,在进行插入数据库的时候查看一下是否已经存在,如果不存在则插入。

代码

using System;
using System.Collections.Generic;
using System.Configuration;
using System.Data;
using System.Data.SqlClient;
using System.IO;
//using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
//using System.Threading.Tasks;

/********************************
 * 创建人:青苹果
 * 创建时间:2015-12-28
 * 描述:获取美利坚合众国的 PM2.5
 * ******************************/

namespace GetUSAData
{
    class Program
    {
        //public static string GetURL = System.Configuration.ConfigurationSettings.AppSettings["GetURL"];//获取数据的地址
        public static string GetURL = "http://utils.usembassy.gov/feed2js/feed2js.php?src=http%3A%2F%2Fwww.stateair.net%2Fweb%2Frss%2F1%2F1.xml&desc=1&num=7&targ=y&utf=y&pc=y&words=40&";
        public static string txtURL = System.Configuration.ConfigurationSettings.AppSettings["txtURL"];//保存为txt文件的路径
        public static string conn = ConfigurationManager.ConnectionStrings["ConnectionString"].ToString();

        static void Main(string[] args)
        {

            LoadGO();
        }

        public static void LoadGO()
        {
            GetUSA();
            List<string[]> getlist = Read(txtURL);
            //删除txt
            if (File.Exists(txtURL))
            {
                //如果存在则删除
                File.Delete(txtURL);
            }
            if (getlist.Count > 0)
            {
                for (int i = getlist.Count-1; i >-1; i--)
                {
                    DateTime dtime = DateTime.Parse(getlist[i][0].ToString());
                    string getTime = dtime.ToString("yyyy-MM-dd HH:mm");
                    string controlTime = dtime.ToString("yyyy-MM-dd");
                    float LatestHourdata1 = float.Parse(getlist[i][2]);
                    int LatestHourdata2 = Convert.ToInt32(getlist[i][3]);
                    float Avgdata1 = 0;
                    int Avgdata2 = 0;
                    string Avgdata3 = getlist[i][4].ToString();

                    List<SqlParameter> listWhere = new List<SqlParameter>();
                    listWhere.Add(new SqlParameter("@strDatetime", controlTime));
                    string sqlSelect = @"SELECT count(*) as allcount,sum(LatestHourdata1) as LatestHourdata1_avg, sum(LatestHourdata2) as LatestHourdata2_avg
 FROM T_twitter  where ([LatestHourdata1] is not null
 or [LatestHourdata2] is not null or [Avgdata1] is not null
  or [AvgData2] is not null) and   CONVERT(varchar(100), [datetime], 23)=@strDatetime";

                    DataTable sumDT = ControlDB(sqlSelect, listWhere, "select");    //查询总和用于计算日均值
                    if (sumDT.Rows.Count > 0)
                    {
                        foreach (DataRow itemDR in sumDT.Rows)
                        {
                            int allcount = Convert.ToInt32(itemDR["allcount"].ToString());    //数据库中当前日期数量总和
                            if (allcount > 0)
                            {
                                if (itemDR["LatestHourdata1_avg"] != null)
                                {
                                    Avgdata1 = float.Parse(itemDR["LatestHourdata1_avg"].ToString());   //数据库中LatestHourdata1_avg总和
                                    Avgdata1 = (Avgdata1 + LatestHourdata1) / (allcount + 1);//(数据库的总和+最新的一条)/(数据库的总和数量+1)=日平均值
                                }
                                if (itemDR["LatestHourdata2_avg"] != null)
                                {
                                    Avgdata2 = Convert.ToInt32(itemDR["LatestHourdata2_avg"].ToString());   //数据库中LatestHourdata2_avg总和
                                    Avgdata2 = (Avgdata2 + LatestHourdata2) / (allcount + 1);//(数据库的总和+最新的一条)/(数据库的总和数量+1)=日平均值
                                }
                                //根据网站规则判断PM2.5的平均严重性

                                if (Avgdata2 >= 0 && Avgdata2 <= 50)
                                {
                                    Avgdata3 = " Good (at 24-hour exposure at this level)";
                                }
                                else if (Avgdata2 >= 51 && Avgdata2 <= 100)
                                {
                                    Avgdata3 = " Moderate (at 24-hour exposure at this level)";
                                }
                                else if (Avgdata2 >= 101 && Avgdata2 <= 150)
                                {
                                    Avgdata3 = " Unhealthy for Sensitive Groups (at 24-hour exposure at this level)";
                                }
                                else if (Avgdata2 >= 151 && Avgdata2 <= 200)
                                {
                                    Avgdata3 = " Unhealthy (at 24-hour exposure at this level)";
                                }
                                else if (Avgdata2 >= 201 && Avgdata2 <= 300)
                                {
                                    Avgdata3 = " Very Unhealthy (at 24-hour exposure at this level)";
                                }
                                else
                                {
                                    Avgdata3 = " Hazardous (at 24-hour exposure at this level)";
                                }
                            }
                            else
                            {
                                Avgdata1 = LatestHourdata1;
                                Avgdata2 = LatestHourdata2;
                            }
                        }
                    }

                    List<SqlParameter> pars = new List<SqlParameter>();
                    pars.Add(new SqlParameter("@whereDatetime", getTime));
                    pars.Add(new SqlParameter("@datetime", getTime));
                    pars.Add(new SqlParameter("@LatestHourdata1", LatestHourdata1));
                    pars.Add(new SqlParameter("@LatestHourdata2", LatestHourdata2));
                    pars.Add(new SqlParameter("@LatestHourdata3", getlist[i][4].ToString()));
                    pars.Add(new SqlParameter("@Avgdata1", Avgdata1));
                    pars.Add(new SqlParameter("@Avgdata2", Avgdata2));
                    pars.Add(new SqlParameter("@Avgdata3", Avgdata3));

                    string sql = @"if not exists(select * from  dbo.T_twitter where  datetime=@whereDatetime) begin
insert T_twitter (datetime,LatestHourdata1,LatestHourdata2,LatestHourdata3,Avgdata1,AvgData2,AvgData3)
VALUES(@datetime,@LatestHourdata1,@LatestHourdata2,@LatestHourdata3,@Avgdata1,@Avgdata2,@Avgdata3) end";
                    ControlDB(sql, pars, "");//插入数据
                }
            }
        }

        /// <summary>
        /// 获取页面数据保存至txt
        /// </summary>
        public static void GetUSA()
        {
            WebRequest request = WebRequest.Create(GetURL);
            WebResponse response = request.GetResponse();
            StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("gb2312"));
            //reader.ReadToEnd() 表示取得网页的源码

            FileStream fs = new FileStream(txtURL, FileMode.Create);
            byte[] data = System.Text.Encoding.Default.GetBytes(reader.ReadToEnd());
            //开始写入
            fs.Write(data, 0, data.Length);
            //清空缓冲区、关闭流
            fs.Flush();
            fs.Close();
        }

        /// <summary>
        /// 根据路径读取txt文件
        /// </summary>
        /// <param name="path">txt路径</param>
        /// <returns></returns>
        public static List<string[]> Read(string path)
        {
            List<string[]> list = new List<string[]>();
            StreamReader sr = new StreamReader(path, Encoding.Default);
            String line;
            while ((line = sr.ReadLine()) != null)
            {
                int i = line.ToString().IndexOf("title");
                if (i > 0)
                {
                    string titleStr = line.ToString().Substring(i + 7); //截取到title后面的值
                    string[] titlelist = titleStr.Split('"');        //以"  截取
                    string titledata = titlelist[0];
                    string[] datalist = titledata.Split('&');  //以& 截取
                    string data = datalist[0];
                    string[] datastrlist = data.Split(new char[] { ';' }, StringSplitOptions.RemoveEmptyEntries);//以; 截取
                    list.Add(datastrlist);
                }
            }
            sr.Close();
            return list;
        }

        /// <summary>
        /// 增查表
        /// </summary>
        /// <returns></returns>
        public static DataTable ControlDB(string sql, List<SqlParameter> par, string type)
        {
            DataAccess controData = new DataAccess();
            DataTable resultDT = new DataTable();
            if (type == "select")
            {
                resultDT = controData.GetDataTable(sql, par.ToArray());
            }
            else
            {
                int result = controData.ExecuteSql(sql, par.ToArray());
            }
            return resultDT;
        }
    }
}

 Demo下载:

 https://files.cnblogs.com/files/xinchun/GetUSAData.zip

posted @ 2015-12-30 22:57  青苹果  阅读(944)  评论(0编辑  收藏  举报