SAS | 数据EDA及代码

先构造一个数据集,代码如下:

x libname zdata "D:\data\python_file\test";*先构造一个数据集;
data Transactions;
informat TransDate yymmdd10.;
format TransDate yymmdd10.;
input CustomerID  TransDate  Amount  AccountType  $;
datalines;
1    2008-01-01     114.56      Savings  
1	 2008-02-01	 	56.78	 	Checking 
1	 2008-03-01	 	359.31	 	Savings 
1	 2008-01-04	 	89.56		Checking 
1	 2008-05-01	 	1000.00	 	Savings  
1	 2008-08-01	 	1200.00 	Checking 
2	 2008-01-02	 	122.51	 	Savings  
2	 2008-12-01	 	42.07		Checking 
2	 2008-11-01	 	146.30	 	Savings  
2	 2008-07-13	 	1254.48 	Checking 
2	 2008-02-11	 	400.00	 	Savings	  
2	 2008-01-01	 	500.00	 	Checking 
;
run;

然后对数据集进行EDA分析,过程和代码如下:

/*1.全表统计*/
proc contents data=zdata.Transactions;
run;

/*2.数据量统计*/
/*常用于比较总记录数和去重主键数,看主键是否唯一*/
proc sql;
create table a as select count(*) as count1,count(distinct CustomerID) as count2 
from zdata.Transactions;
quit;

/*3.名义变量统计*/
/*该语句用于对数据集中名义变量进行频数统计*/
proc freq data=zdata.Transactions;
tables 
	CustomerID 
	AccountType 
	TransDate 
	CustomerID*AccountType			/*交叉分布*/
;
format 
	TransDate yyq6.				/*yyq6.指按季度统计、yymm6.指按月统计*/
;
run;

/*4.数值变量统计*/
proc means data=zdata.Transactions N NMISS P1 P10 P25 P50 P75 P90 MIN MAX MEAN STD;
var Amount;
by CustomerID;
run;

proc means data=zdata.Transactions noprint ;
putput out=d SUM(Amount)=Amount1 MIN(Amount)=Amount2;
BY CustomerID;
run;

/*5.排序*/
proc sort data=zdata.Transactions;
by CustomerID;				    /*按照CustomerID排序,默认升序*/
run;	 

proc sort data=zdata.Transactions;
by Amount descending CustomerID;	    /*按照Amount升序,CustomerID降序排序*/
run;

/*6.set:多个数据集纵向连接*/
data zdata.Transactions_11;
set zdata.Transactions;
keep CustomerID Amount;
run;

/*7.merge:多个数据集横向连接*/
data h1;
set b1;
if CustomerID=1;
run;

proc sort data=Transactions;
by CustomerID;
run;

proc sort data=h1;
by CustomerID;
run;

data h;
merge  Transactions h1 ;		/*数据集横向连接*/
by CustomerID;				/*merge只要再BY某个变量连接时,才需要先排序*/	
run;

*注意:merge只有在限定条件时,才等价于SQL中的left join/right join;
data i;
merge Transactions(in=a)  h1(in=b);
by CustomerID;
if b;		                        /*仅保留h1数据集中有的id*/
run;

*等价于上面限定条件的merge;
proc sql;
create table i as 
select * from h1 as tb1 left join Transactions as tb2 on tb1.CustomerID=tb2.CustomerID;
quit;

/*8.数据集导出*/
proc export data=i outfile='路径\test.csv' dbms=csv replace;run;

posted @ 2020-03-23 11:23  1k-yang  阅读(755)  评论(0编辑  收藏  举报