R语言-变量聚类
> ######变量聚类
> setwd("/Users/yaozhilin/Downloads/R_edu/data")
> accepts<-read.csv("accepts.csv")
> #导入ClustOfVar包——>用hclustvar对变量进行聚类及用stability确定聚类数量——>用
> #cutreevar把hclustvar的树参照stability数据进行切割
> library(ClustOfVar)
> orgData<-accepts[,c(-1,-2,-3,-5,-6)]
> tree<-hclustvar(orgData)
> plot(tree)
> st<-stability(tree,B=30)
> part <- cutreevar(tree,7,matsim = TRUE)#聚成7类查看
> print(part)

Call:
cutreevar(obj = tree, k = 7, matsim = TRUE)

 name       description
 "$var"     "list of variables in each cluster"
 "$sim"     "similarity matrix in each cluster"
 "$cluster" "cluster memberships"
 "$wss"     "within-cluster sum of squares"
 "$E"       "gain in cohesion (in %)"
 "$size"    "size of each cluster"
 "$scores"  "synthetic score of each cluster"
 "$coef"    "coef of the linear combinations defining the synthetic scores of each cluster"
> summary(part)

Call:
cutreevar(obj = tree, k = 7, matsim = TRUE)

Data:
   number of observations: 5845
   number of variables: 19
   number of clusters: 7

Cluster 1 :
             squared loading correlation
purch_price             0.90       -0.95
loan_amt                0.90       -0.95
msrp                    0.90       -0.95
vehicle_year            0.16       -0.40

Cluster 2 :
           squared loading correlation
fico_score            0.69        0.84
tot_derog             0.60       -0.78
rev_util              0.31       -0.56

Cluster 3 :
              squared loading correlation
tot_rev_tr               0.70        0.84
tot_rev_line             0.61        0.79
tot_rev_debt             0.61        0.79
tot_open_tr              0.61        0.83
tot_tr                   0.55        0.74
age_oldest_tr            0.26        0.51

Cluster 4 :
         squared loading correlation
down_pyt            0.73        0.85
ltv                 0.73       -0.85

Cluster 5 :
squared loading     correlation
              1               1

Cluster 6 :
squared loading     correlation
              1               1

Cluster 7 :
            squared loading correlation
used_ind               0.79       -0.89
veh_mileage            0.79       -0.89

Gain in cohesion (in %): 59.07
> part$sim #查看聚类后同类的相关性矩阵 $cluster1 vehicle_year purch_price msrp loan_amt vehicle_year 1.00000000 0.05961112 0.08725898 0.05578607 purch_price 0.05961112 1.00000000 0.74949104 0.80971082 msrp 0.08725898 0.74949104 1.00000000 0.75508338 loan_amt 0.05578607 0.80971082 0.75508338 1.00000000 $cluster2 tot_derog rev_util fico_score tot_derog 1.00000000 0.02360717 0.21185851 rev_util 0.02360717 1.00000000 0.06931344 fico_score 0.21185851 0.06931344 1.00000000 $cluster3 tot_tr age_oldest_tr tot_open_tr tot_rev_tr tot_rev_debt tot_rev_line tot_tr 1.0000000 0.22550267 0.2348436 0.19762456 0.14830133 0.3122191 age_oldest_tr 0.2255027 1.00000000 0.0301182 0.04633906 0.05585033 0.1761412 tot_open_tr 0.2348436 0.03011820 1.0000000 0.59139623 0.28310367 0.1651928 tot_rev_tr 0.1976246 0.04633906 0.5913962 1.00000000 0.43259981 0.2627265 tot_rev_debt 0.1483013 0.05585033 0.2831037 0.43259981 1.00000000 0.3411766 tot_rev_line 0.3122191 0.17614119 0.1651928 0.26272651 0.34117657 1.0000000 $cluster4 down_pyt ltv down_pyt 1.0000000 0.2086998 ltv 0.2086998 1.0000000 $cluster5 loan_term loan_term 1 $cluster6 tot_income tot_income 1 $cluster7 veh_mileage used_ind veh_mileage 1.0000000 0.3313023 used_ind 0.3313023 1.0000000 > part$var $cluster1 squared loading correlation purch_price 0.9047017 -0.9511581 loan_amt 0.9046381 -0.9511247 msrp 0.8983588 -0.9478394 vehicle_year 0.1562745 -0.3953246 $cluster2 squared loading correlation fico_score 0.6927142 0.8380766 tot_derog 0.6003257 -0.7763639 rev_util 0.3121919 -0.5587414 $cluster3 squared loading correlation tot_rev_tr 0.6994471 0.8409839 tot_rev_line 0.6125771 0.7859612 tot_rev_debt 0.6116821 0.7853868 tot_open_tr 0.6094718 0.8271140 tot_tr 0.5475506 0.7399666 age_oldest_tr 0.2552606 0.5052449 $cluster4 squared loading correlation down_pyt 0.7284184 0.8534743 ltv 0.7284184 -0.8534817 $cluster5 squared loading correlation 1 1 $cluster6 squared loading correlation 1 1 $cluster7 squared loading correlation used_ind 0.7877943 -0.8875778 
veh_mileage 0.7877943 -0.8876091
> part <- cutreevar(tree,16,matsim = TRUE)#聚成16类查看 > print(part) Call: cutreevar(obj = tree, k = 16, matsim = TRUE) name description "$var" "list of variables in each cluster" "$sim" "similarity matrix in each cluster" "$cluster" "cluster memberships" "$wss" "within-cluster sum of squares" "$E" "gain in cohesion (in %)" "$size" "size of each cluster" "$scores" "synthetic score of each cluster" "$coef" "coef of the linear combinations defining the synthetic scores of each cluster" > summary(part) Call: cutreevar(obj = tree, k = 16, matsim = TRUE) Data: number of observations: 5845 number of variables: 19 number of clusters: 16 Cluster 1 : squared loading correlation 1 1 Cluster 2 : squared loading correlation 1 1 Cluster 3 : squared loading correlation 1 1 Cluster 4 : squared loading correlation 1 1 Cluster 5 : squared loading correlation tot_open_tr 0.88 0.96 tot_rev_tr 0.88 0.94 Cluster 6 : squared loading correlation 1 1 Cluster 7 : squared loading correlation 1 1 Cluster 8 : squared loading correlation 1 1 Cluster 9 : squared loading correlation 1 1 Cluster 10 : squared loading correlation loan_amt 0.93 0.96 purch_price 0.93 0.96 msrp 0.90 0.95 Cluster 11 : squared loading correlation 1 1 Cluster 12 : squared loading correlation 1 1 Cluster 13 : squared loading correlation 1 1 Cluster 14 : squared loading correlation 1 1 Cluster 15 : squared loading correlation 1 1 Cluster 16 : squared loading correlation 1 1 Gain in cohesion (in %): 96.85 > part$sim #查看相关性矩阵 $cluster1 vehicle_year vehicle_year 1 $cluster2 tot_derog tot_derog 1 $cluster3 tot_tr tot_tr 1 $cluster4 age_oldest_tr age_oldest_tr 1 $cluster5 tot_open_tr tot_rev_tr tot_open_tr 1.0000000 0.5913962 tot_rev_tr 0.5913962 1.0000000 $cluster6 tot_rev_debt tot_rev_debt 1 $cluster7 tot_rev_line tot_rev_line 1 $cluster8 rev_util rev_util 1 $cluster9 fico_score fico_score 1 $cluster10 purch_price msrp loan_amt purch_price 1.0000000 0.7494910 0.8097108 msrp 0.7494910 1.0000000 0.7550834 loan_amt 0.8097108 
0.7550834 1.0000000 $cluster11 down_pyt down_pyt 1 $cluster12 loan_term loan_term 1 $cluster13 ltv ltv 1 $cluster14 tot_income tot_income 1 $cluster15 veh_mileage veh_mileage 1 $cluster16 used_ind used_ind 1 > part$var $cluster1 squared loading correlation 1 1 $cluster2 squared loading correlation 1 1 $cluster3 squared loading correlation 1 1 $cluster4 squared loading correlation 1 1 $cluster5 squared loading correlation tot_open_tr 0.8845115 0.9597241 tot_rev_tr 0.8845115 0.9404847 $cluster6 squared loading correlation 1 1 $cluster7 squared loading correlation 1 1 $cluster8 squared loading correlation 1 1 $cluster9 squared loading correlation 1 1 $cluster10 squared loading correlation loan_amt 0.9275256 0.9630813 purch_price 0.9253048 0.9619276 msrp 0.9036108 0.9506082 $cluster11 squared loading correlation 1 1 $cluster12 squared loading correlation 1 1 $cluster13 squared loading correlation 1 1 $cluster14 squared loading correlation 1 1 $cluster15 squared loading correlation 1 1 $cluster16 squared loading correlation 1 1