聚类——高斯混合模型(Gaussian Mixture Model, GMM)

package Spark_MLlib

import org.apache.spark.ml.clustering.GaussianMixture
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.linalg.{Vector, Vectors}

/** Row schema for the clustering input: a single `features` column holding one sample's feature vector. */
case class GMM_Schema(
  features: Vector
)
/**
 * Gaussian mixture model (GMM) clustering example built on Spark ML.
 *
 * Reads comma-separated rows whose first four columns are numeric, fits a
 * three-component GMM on a random split of the data, prints and saves the
 * predictions for the held-out split, and finally prints the parameters of
 * every mixture component.
 */
object 聚类__高斯混合 {
  val spark: SparkSession = SparkSession.builder().master("local[2]").getOrCreate()
  import spark.implicits._

  /**
   * Program entry point.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    try {
      // Parse each line into a 4-dimensional dense feature vector.
      val data = spark.sparkContext
        .textFile("file:///home/soyo/桌面/spark编程测试数据/soyo2.txt")
        .map(_.split(","))
        .map(x => GMM_Schema(Vectors.dense(x(0).toDouble, x(1).toDouble, x(2).toDouble, x(3).toDouble)))
        .toDF()
      data.show()

      // Split with weights 0.7 / 0.3 into a training set and a held-out test set.
      val Array(trainData, testData) = data.randomSplit(Array(0.7, 0.3))

      val gmm = new GaussianMixture()
        .setK(3)
        .setProbabilityCol("Probability")
        .setPredictionCol("Prediction")
      val model = gmm.fit(trainData)

      // "Probability" holds the probability of the sample belonging to each of the
      // clusters 0, 1 and 2; "Prediction" holds the predicted cluster for the sample.
      val result = model.transform(testData)
      result.show(false)

      // The original referenced an undefined `extracte_data`; the transformed
      // DataFrame `result` is the data being saved.
      // NOTE(review): saveAsTextFile is expected to fail if the target path already
      // exists -- confirm the paths are cleaned between runs.
      // Save only the selected column.
      result.select("features").rdd
        .saveAsTextFile("file:///home/soyo/桌面/spark编程测试数据/89987863.txt")
      // Save every column of the DataFrame.
      result.rdd
        .saveAsTextFile("file:///home/soyo/桌面/spark编程测试数据/89987862.txt")

      // A GMM does not expose cluster centres directly; it exposes the parameters of
      // each mixture component (a multivariate Gaussian) instead. `weights` gives the
      // weight of each component, `gaussians` gives its mean vector and covariance matrix.
      for (i <- 0 until model.getK) {
        println("Component %d : weight is %f\n mu vector is %s\n sigma matrix is %s".format(i, model.weights(i), model.gaussians(i).mean, model.gaussians(i).cov))
      }
    } finally {
      // Release the local Spark resources even when the job fails part-way.
      spark.stop()
    }
  }
}

结果:

+-----------------+
|         features|
+-----------------+
|[5.1,3.5,1.4,0.2]|
|[4.9,3.0,1.4,0.2]|
|[4.7,3.2,1.3,0.2]|
|[4.6,3.1,1.5,0.2]|
|[5.0,3.6,1.4,0.2]|
|[5.4,3.9,1.7,0.4]|
|[4.6,3.4,1.4,0.3]|
|[5.0,3.4,1.5,0.2]|
|[4.4,2.9,1.4,0.2]|
|[4.9,3.1,1.5,0.1]|
|[5.4,3.7,1.5,0.2]|
|[4.8,3.4,1.6,0.2]|
|[4.8,3.0,1.4,0.1]|
|[4.3,3.0,1.1,0.1]|
|[5.8,4.0,1.2,0.2]|
|[5.7,4.4,1.5,0.4]|
|[5.4,3.9,1.3,0.4]|
|[5.1,3.5,1.4,0.3]|
|[5.7,3.8,1.7,0.3]|
|[5.1,3.8,1.5,0.3]|
+-----------------+
only showing top 20 rows

+-----------------+----------+------------------------------------------------------------------+
|features         |Prediction|Probability                                                       |
+-----------------+----------+------------------------------------------------------------------+
|[4.5,2.3,1.3,0.3]|1         |[1.9460993789131094E-10,0.6613517186358104,0.33864828116957957]   |
|[4.6,3.2,1.4,0.2]|1         |[2.7145624349052503E-15,0.9999999999625855,3.74117982257895E-11]  |
|[4.6,3.6,1.0,0.2]|1         |[8.857071769427218E-14,0.9999999995762952,4.2361636256573393E-10] |
|[4.7,3.2,1.3,0.2]|1         |[2.8280610349168036E-16,0.9999999997787022,2.2129751536671591E-10]|
|[4.7,3.2,1.6,0.2]|1         |[2.2246229283736778E-13,0.9999999999924508,7.326722006026576E-12] |
|[4.8,3.0,1.4,0.3]|1         |[1.5570916910918913E-14,0.999999515511457,4.844885274590749E-7]   |
|[4.8,3.1,1.6,0.2]|1         |[5.416790617095303E-13,0.9999999998341826,1.652755935543129E-10]  |
|[4.8,3.4,1.9,0.2]|1         |[1.1345882300938586E-9,0.9999999988651723,2.394366594145129E-13]  |
|[5.0,3.0,1.6,0.2]|1         |[2.174953081273265E-12,0.9999999896690439,1.0328781109739351E-8]  |
|[5.0,3.3,1.4,0.2]|1         |[9.157665389080891E-17,0.9999999999852162,1.4783767398737057E-11] |
|[5.0,3.4,1.6,0.4]|1         |[1.1903839950520247E-15,0.9999999903439921,9.656006771864786E-9]  |
|[5.1,3.4,1.5,0.2]|1         |[1.0337982164910104E-16,0.9999999999990304,9.69373338566045E-13]  |
|[5.1,3.7,1.5,0.4]|1         |[8.255687030250876E-17,0.9999999999716316,2.8368326155201214E-11] |
|[5.1,3.8,1.9,0.4]|1         |[6.664693730316072E-14,0.9999999999870913,1.2842010001494045E-11] |
|[5.2,3.5,1.5,0.2]|1         |[5.519983601218073E-17,0.9999999999998658,1.3428034253525153E-13] |
|[5.2,4.1,1.5,0.1]|1         |[9.520667996704964E-15,0.9999999999999809,9.520667236660166E-15]  |
|[5.4,3.4,1.7,0.2]|1         |[1.5037240722382337E-14,0.9999999999919748,8.01024821156934E-12]  |
|[5.5,2.3,4.0,1.3]|2         |[0.12790930371263204,9.702982800125614E-16,0.8720906962873669]    |
|[5.5,3.5,1.3,0.2]|1         |[7.495980013661814E-16,0.9999999999984027,1.5966275150172275E-12] |
|[5.7,2.8,4.5,1.3]|0         |[0.9627079132449172,6.238048595532193E-16,0.03729208675508215]    |
+-----------------+----------+------------------------------------------------------------------+
only showing top 20 rows

Component 0 : weight is 0.410444
 mu vector is [6.229890449633949,2.9365709066142216,5.119923101567097,1.875036196728866]
 sigma matrix is 0.20106023109900695   0.061614617783844784  0.18615301343118426  0.12419808100465818  
0.061614617783844784  0.08755324619180453   0.07600720617502173  0.07092234972645906  
0.18615301343118426   0.07600720617502173   0.3190627635701519   0.2005911061068125   
0.12419808100465818   0.07092234972645906   0.2005911061068125   0.18570154951117543  
Component 1 : weight is 0.313131
 mu vector is [5.016129011272877,3.4516129375085347,1.4354837490859222,0.24516112308927235]
 sigma matrix is 0.13554637908317346   0.11368373014150743   0.011685744418836571  0.015723208692113348  
0.11368373014150743   0.1399168236701324    0.009136331548362716  0.018314273558132532  
0.011685744418836571  0.009136331548362716  0.018418296421506785  0.006461987075877571  
0.015723208692113348  0.018314273558132532  0.006461987075877571  0.014089446355933986  
Component 2 : weight is 0.276425
 mu vector is [6.268895982220705,2.8164655748162817,4.672112358525022,1.4474090607905907]
 sigma matrix is 0.7598437470441146   0.25905873612421504  0.8900973784969908   0.2622593125673333   
0.25905873612421504  0.18040816341127572  0.2866224809874453   0.10562720921328374  
0.8900973784969908   0.2866224809874453   1.1853685078670806   0.35992793746391144  
0.2622593125673333   0.10562720921328374  0.35992793746391144  0.12216925649336183  

posted @ 2017-11-10 14:59  soyosuyang  阅读(481)  评论(0编辑  收藏  举报