SAS | 数据EDA及代码
先构造一个数据集,代码如下:
x libname zdata "D:\data\python_file\test";*先构造一个数据集;
data Transactions;
informat TransDate yymmdd10.;
format TransDate yymmdd10.;
input CustomerID TransDate Amount AccountType $;
datalines;
1 2008-01-01 114.56 Savings
1 2008-02-01 56.78 Checking
1 2008-03-01 359.31 Savings
1 2008-01-04 89.56 Checking
1 2008-05-01 1000.00 Savings
1 2008-08-01 1200.00 Checking
2 2008-01-02 122.51 Savings
2 2008-12-01 42.07 Checking
2 2008-11-01 146.30 Savings
2 2008-07-13 1254.48 Checking
2 2008-02-11 400.00 Savings
2 2008-01-01 500.00 Checking
;
run;
然后对数据集进行EDA分析,过程和代码如下:
/*1.全表统计*/
proc contents data=zdata.Transactions;
run;
/*2.数据量统计*/
/*常用于比较总记录数和去重主键数,看主键是否唯一*/
proc sql;
create table a as select count(*) as count1,count(distinct CustomerID) as count2
from zdata.Transactions;
quit;
/*3.名义变量统计*/
/*该语句用于对数据集中名义变量进行频数统计*/
proc freq data=zdata.Transactions;
tables
CustomerID
AccountType
TransDate
CustomerID*AccountType /*交叉分布*/
;
format
TransDate yyq6. /*yyq6.指按季度统计、yymm6.指按月统计*/
;
run;
/*4.数值变量统计*/
proc means data=zdata.Transactions N NMISS P1 P10 P25 P50 P75 P90 MIN MAX MEAN STD;
var Amount;
by CustomerID;
run;
proc means data=zdata.Transactions noprint ;
putput out=d SUM(Amount)=Amount1 MIN(Amount)=Amount2;
BY CustomerID;
run;
/*5.排序*/
proc sort data=zdata.Transactions;
by CustomerID; /*按照CustomerID排序,默认升序*/
run;
proc sort data=zdata.Transactions;
by Amount descending CustomerID; /*按照Amount升序,CustomerID降序排序*/
run;
/*6.set:多个数据集纵向连接*/
data zdata.Transactions_11;
set zdata.Transactions;
keep CustomerID Amount;
run;
/*7.merge:多个数据集横向连接*/
data h1;
set b1;
if CustomerID=1;
run;
proc sort data=Transactions;
by CustomerID;
run;
proc sort data=h1;
by CustomerID;
run;
data h;
merge Transactions h1 ; /*数据集横向连接*/
by CustomerID; /*merge只要再BY某个变量连接时,才需要先排序*/
run;
*注意:merge只有在限定条件时,才等价于SQL中的left join/right join;
data i;
merge Transactions(in=a) h1(in=b);
by CustomerID;
if b; /*仅保留h1数据集中有的id*/
run;
*等价于上面限定条件的merge;
proc sql;
create table i as
select * from h1 as tb1 left join Transactions as tb2 on tb1.CustomerID=tb2.CustomerID;
quit;
/*8.数据集导出*/
proc export data=i outfile='路径\test.csv' dbms=csv replace;run;