R语言-变量聚类

> ######变量聚类
> setwd("/Users/yaozhilin/Downloads/R_edu/data")
> accepts<-read.csv("accepts.csv")
> #流程：导入ClustOfVar包 ——> 用hclustvar对变量进行层次聚类，并用stability评估聚类数量 ——>
> #用cutreevar参照stability的结果对hclustvar生成的树进行切割
> library(ClustOfVar)
> orgData<-accepts[,c(-1,-2,-3,-5,-6)]
> tree<-hclustvar(orgData)
> plot(tree)

 

 

 

> st<-stability(tree,B=30)#做30次自助抽样(bootstrap)，评估不同聚类数下划分的稳定性，用于选择聚类数

 

 

> part <- cutreevar(tree,7,matsim = TRUE)#聚成7类查看
> print(part)

Call:
cutreevar(obj = tree, k = 7, matsim = TRUE)



 name       description                                                                    
 "$var"     "list of variables in each cluster"                                            
 "$sim"     "similarity matrix in each cluster"                                            
 "$cluster" "cluster memberships"                                                          
 "$wss"     "within-cluster sum of squares"                                                
 "$E"       "gain in cohesion (in %)"                                                      
 "$size"    "size of each cluster"                                                         
 "$scores"  "synthetic score of each cluster"                                              
 "$coef"    "coef of the linear combinations defining the synthetic scores of each cluster"
> summary(part)

Call:
cutreevar(obj = tree, k = 7, matsim = TRUE)



Data: 
   number of observations:  5845
   number of variables:  19
   number of clusters:  7

Cluster  1 : 
             squared loading correlation
purch_price             0.90       -0.95
loan_amt                0.90       -0.95
msrp                    0.90       -0.95
vehicle_year            0.16       -0.40


Cluster  2 : 
           squared loading correlation
fico_score            0.69        0.84
tot_derog             0.60       -0.78
rev_util              0.31       -0.56


Cluster  3 : 
              squared loading correlation
tot_rev_tr               0.70        0.84
tot_rev_line             0.61        0.79
tot_rev_debt             0.61        0.79
tot_open_tr              0.61        0.83
tot_tr                   0.55        0.74
age_oldest_tr            0.26        0.51


Cluster  4 : 
         squared loading correlation
down_pyt            0.73        0.85
ltv                 0.73       -0.85


Cluster  5 : 
squared loading     correlation 
              1               1 


Cluster  6 : 
squared loading     correlation 
              1               1 


Cluster  7 : 
            squared loading correlation
used_ind               0.79       -0.89
veh_mileage            0.79       -0.89


Gain in cohesion (in %):  59.07

 

> part$sim                            #查看每个类内变量间的相似度矩阵（数值变量之间为相关系数的平方，而非相关系数本身）
$cluster1
             vehicle_year purch_price       msrp   loan_amt
vehicle_year   1.00000000  0.05961112 0.08725898 0.05578607
purch_price    0.05961112  1.00000000 0.74949104 0.80971082
msrp           0.08725898  0.74949104 1.00000000 0.75508338
loan_amt       0.05578607  0.80971082 0.75508338 1.00000000

$cluster2
            tot_derog   rev_util fico_score
tot_derog  1.00000000 0.02360717 0.21185851
rev_util   0.02360717 1.00000000 0.06931344
fico_score 0.21185851 0.06931344 1.00000000

$cluster3
                 tot_tr age_oldest_tr tot_open_tr tot_rev_tr tot_rev_debt tot_rev_line
tot_tr        1.0000000    0.22550267   0.2348436 0.19762456   0.14830133    0.3122191
age_oldest_tr 0.2255027    1.00000000   0.0301182 0.04633906   0.05585033    0.1761412
tot_open_tr   0.2348436    0.03011820   1.0000000 0.59139623   0.28310367    0.1651928
tot_rev_tr    0.1976246    0.04633906   0.5913962 1.00000000   0.43259981    0.2627265
tot_rev_debt  0.1483013    0.05585033   0.2831037 0.43259981   1.00000000    0.3411766
tot_rev_line  0.3122191    0.17614119   0.1651928 0.26272651   0.34117657    1.0000000

$cluster4
          down_pyt       ltv
down_pyt 1.0000000 0.2086998
ltv      0.2086998 1.0000000

$cluster5
          loan_term
loan_term         1

$cluster6
           tot_income
tot_income          1

$cluster7
            veh_mileage  used_ind
veh_mileage   1.0000000 0.3313023
used_ind      0.3313023 1.0000000

> part$var   
$cluster1
             squared loading correlation
purch_price        0.9047017  -0.9511581
loan_amt           0.9046381  -0.9511247
msrp               0.8983588  -0.9478394
vehicle_year       0.1562745  -0.3953246

$cluster2
           squared loading correlation
fico_score       0.6927142   0.8380766
tot_derog        0.6003257  -0.7763639
rev_util         0.3121919  -0.5587414

$cluster3
              squared loading correlation
tot_rev_tr          0.6994471   0.8409839
tot_rev_line        0.6125771   0.7859612
tot_rev_debt        0.6116821   0.7853868
tot_open_tr         0.6094718   0.8271140
tot_tr              0.5475506   0.7399666
age_oldest_tr       0.2552606   0.5052449

$cluster4
         squared loading correlation
down_pyt       0.7284184   0.8534743
ltv            0.7284184  -0.8534817

$cluster5
squared loading     correlation 
              1               1 

$cluster6
squared loading     correlation 
              1               1 

$cluster7
            squared loading correlation
used_ind          0.7877943  -0.8875778
veh_mileage       0.7877943  -0.8876091

 

> part <- cutreevar(tree,16,matsim = TRUE)#聚成16类查看
> print(part)

Call:
cutreevar(obj = tree, k = 16, matsim = TRUE)



 name       description                                                                    
 "$var"     "list of variables in each cluster"                                            
 "$sim"     "similarity matrix in each cluster"                                            
 "$cluster" "cluster memberships"                                                          
 "$wss"     "within-cluster sum of squares"                                                
 "$E"       "gain in cohesion (in %)"                                                      
 "$size"    "size of each cluster"                                                         
 "$scores"  "synthetic score of each cluster"                                              
 "$coef"    "coef of the linear combinations defining the synthetic scores of each cluster"
> summary(part)

Call:
cutreevar(obj = tree, k = 16, matsim = TRUE)



Data: 
   number of observations:  5845
   number of variables:  19
   number of clusters:  16

Cluster  1 : 
squared loading     correlation 
              1               1 


Cluster  2 : 
squared loading     correlation 
              1               1 


Cluster  3 : 
squared loading     correlation 
              1               1 


Cluster  4 : 
squared loading     correlation 
              1               1 


Cluster  5 : 
            squared loading correlation
tot_open_tr            0.88        0.96
tot_rev_tr             0.88        0.94


Cluster  6 : 
squared loading     correlation 
              1               1 


Cluster  7 : 
squared loading     correlation 
              1               1 


Cluster  8 : 
squared loading     correlation 
              1               1 


Cluster  9 : 
squared loading     correlation 
              1               1 


Cluster  10 : 
            squared loading correlation
loan_amt               0.93        0.96
purch_price            0.93        0.96
msrp                   0.90        0.95


Cluster  11 : 
squared loading     correlation 
              1               1 


Cluster  12 : 
squared loading     correlation 
              1               1 


Cluster  13 : 
squared loading     correlation 
              1               1 


Cluster  14 : 
squared loading     correlation 
              1               1 


Cluster  15 : 
squared loading     correlation 
              1               1 


Cluster  16 : 
squared loading     correlation 
              1               1 


Gain in cohesion (in %):  96.85
> part$sim                            #查看每个类内变量间的相似度矩阵（数值变量之间为相关系数的平方）
$cluster1
             vehicle_year
vehicle_year            1

$cluster2
          tot_derog
tot_derog         1

$cluster3
       tot_tr
tot_tr      1

$cluster4
              age_oldest_tr
age_oldest_tr             1

$cluster5
            tot_open_tr tot_rev_tr
tot_open_tr   1.0000000  0.5913962
tot_rev_tr    0.5913962  1.0000000

$cluster6
             tot_rev_debt
tot_rev_debt            1

$cluster7
             tot_rev_line
tot_rev_line            1

$cluster8
         rev_util
rev_util        1

$cluster9
           fico_score
fico_score          1

$cluster10
            purch_price      msrp  loan_amt
purch_price   1.0000000 0.7494910 0.8097108
msrp          0.7494910 1.0000000 0.7550834
loan_amt      0.8097108 0.7550834 1.0000000

$cluster11
         down_pyt
down_pyt        1

$cluster12
          loan_term
loan_term         1

$cluster13
    ltv
ltv   1

$cluster14
           tot_income
tot_income          1

$cluster15
            veh_mileage
veh_mileage           1

$cluster16
         used_ind
used_ind        1

> part$var   
$cluster1
squared loading     correlation 
              1               1 

$cluster2
squared loading     correlation 
              1               1 

$cluster3
squared loading     correlation 
              1               1 

$cluster4
squared loading     correlation 
              1               1 

$cluster5
            squared loading correlation
tot_open_tr       0.8845115   0.9597241
tot_rev_tr        0.8845115   0.9404847

$cluster6
squared loading     correlation 
              1               1 

$cluster7
squared loading     correlation 
              1               1 

$cluster8
squared loading     correlation 
              1               1 

$cluster9
squared loading     correlation 
              1               1 

$cluster10
            squared loading correlation
loan_amt          0.9275256   0.9630813
purch_price       0.9253048   0.9619276
msrp              0.9036108   0.9506082

$cluster11
squared loading     correlation 
              1               1 

$cluster12
squared loading     correlation 
              1               1 

$cluster13
squared loading     correlation 
              1               1 

$cluster14
squared loading     correlation 
              1               1 

$cluster15
squared loading     correlation 
              1               1 

$cluster16
squared loading     correlation 
              1               1 

 

posted @ 2020-11-01 19:23  瑶池里  阅读(978)  评论(1)  编辑  收藏  举报