聚类-----高斯混合模型
package Spark_MLlib

import org.apache.spark.ml.clustering.GaussianMixture
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.linalg.{Vector, Vectors}

/** Single-column row schema wrapping the feature vector fed to the clustering model.
  *
  * @param features dense vector built from the first four comma-separated fields of an input line
  */
case class GMM_Schema(features: Vector)

/** Clustering example: fits a 3-component Gaussian mixture model with Spark ML,
  * prints the per-sample predictions, saves them to disk, and prints each
  * mixture component's parameters.
  */
object 聚类__高斯混合 {

  /** Local Spark session (two worker threads) shared by this example. */
  val spark = SparkSession.builder().master("local[2]").getOrCreate()

  // Needed for the RDD -> DataFrame conversion (`toDF`) below.
  import spark.implicits._

  /** Runs the example end to end.
    *
    * Reads a comma-separated text file, converts each line into a 4-dimensional
    * feature vector, splits the data 70/30, fits the mixture model on the larger
    * split and transforms the smaller one.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    try {
      // Each input line must contain at least four comma-separated numeric fields;
      // `toDouble` throws on anything that is not a number.
      val data = spark.sparkContext
        .textFile("file:///home/soyo/桌面/spark编程测试数据/soyo2.txt")
        .map(_.split(","))
        .map { x =>
          GMM_Schema(Vectors.dense(x(0).toDouble, x(1).toDouble, x(2).toDouble, x(3).toDouble))
        }
        .toDF()
      data.show()

      // Random 70% / 30% split: the first part is used for fitting, the second for prediction.
      val Array(trainData, testData) = data.randomSplit(Array(0.7, 0.3))

      val gmm = new GaussianMixture()
        .setK(3)
        .setProbabilityCol("Probability")
        .setPredictionCol("Prediction")
      val model = gmm.fit(trainData)

      // "Probability" holds the probability of the sample belonging to each of the
      // clusters (0, 1, 2); "Prediction" is the predicted cluster for the sample.
      val result = model.transform(testData)
      result.show(false)

      // Save only the selected column.
      // The original referenced an undefined `extracte_data`; `result` is the
      // DataFrame that is actually in scope here.
      result.select("features").rdd
        .saveAsTextFile("file:///home/soyo/桌面/spark编程测试数据/89987863.txt")
      // Save every column of the DataFrame.
      result.rdd
        .saveAsTextFile("file:///home/soyo/桌面/spark编程测试数据/89987862.txt")

      // A GMM does not expose cluster centres directly. Instead it exposes the
      // parameters of each mixture component (a multivariate Gaussian):
      // `weights` gives each component's weight, and `gaussians` gives each
      // component's mean vector and covariance matrix.
      for (i <- 0 until model.getK) {
        println(
          "Component %d : weight is %f\n mu vector is %s\n sigma matrix is %s".format(
            i,
            model.weights(i),
            model.gaussians(i).mean,
            model.gaussians(i).cov
          )
        )
      }
    } finally {
      // Release the local Spark resources even when a step above fails.
      spark.stop()
    }
  }
}
结果:
+-----------------+
| features|
+-----------------+
|[5.1,3.5,1.4,0.2]|
|[4.9,3.0,1.4,0.2]|
|[4.7,3.2,1.3,0.2]|
|[4.6,3.1,1.5,0.2]|
|[5.0,3.6,1.4,0.2]|
|[5.4,3.9,1.7,0.4]|
|[4.6,3.4,1.4,0.3]|
|[5.0,3.4,1.5,0.2]|
|[4.4,2.9,1.4,0.2]|
|[4.9,3.1,1.5,0.1]|
|[5.4,3.7,1.5,0.2]|
|[4.8,3.4,1.6,0.2]|
|[4.8,3.0,1.4,0.1]|
|[4.3,3.0,1.1,0.1]|
|[5.8,4.0,1.2,0.2]|
|[5.7,4.4,1.5,0.4]|
|[5.4,3.9,1.3,0.4]|
|[5.1,3.5,1.4,0.3]|
|[5.7,3.8,1.7,0.3]|
|[5.1,3.8,1.5,0.3]|
+-----------------+
only showing top 20 rows
+-----------------+----------+------------------------------------------------------------------+
|features |Prediction|Probability |
+-----------------+----------+------------------------------------------------------------------+
|[4.5,2.3,1.3,0.3]|1 |[1.9460993789131094E-10,0.6613517186358104,0.33864828116957957] |
|[4.6,3.2,1.4,0.2]|1 |[2.7145624349052503E-15,0.9999999999625855,3.74117982257895E-11] |
|[4.6,3.6,1.0,0.2]|1 |[8.857071769427218E-14,0.9999999995762952,4.2361636256573393E-10] |
|[4.7,3.2,1.3,0.2]|1 |[2.8280610349168036E-16,0.9999999997787022,2.2129751536671591E-10]|
|[4.7,3.2,1.6,0.2]|1 |[2.2246229283736778E-13,0.9999999999924508,7.326722006026576E-12] |
|[4.8,3.0,1.4,0.3]|1 |[1.5570916910918913E-14,0.999999515511457,4.844885274590749E-7] |
|[4.8,3.1,1.6,0.2]|1 |[5.416790617095303E-13,0.9999999998341826,1.652755935543129E-10] |
|[4.8,3.4,1.9,0.2]|1 |[1.1345882300938586E-9,0.9999999988651723,2.394366594145129E-13] |
|[5.0,3.0,1.6,0.2]|1 |[2.174953081273265E-12,0.9999999896690439,1.0328781109739351E-8] |
|[5.0,3.3,1.4,0.2]|1 |[9.157665389080891E-17,0.9999999999852162,1.4783767398737057E-11] |
|[5.0,3.4,1.6,0.4]|1 |[1.1903839950520247E-15,0.9999999903439921,9.656006771864786E-9] |
|[5.1,3.4,1.5,0.2]|1 |[1.0337982164910104E-16,0.9999999999990304,9.69373338566045E-13] |
|[5.1,3.7,1.5,0.4]|1 |[8.255687030250876E-17,0.9999999999716316,2.8368326155201214E-11] |
|[5.1,3.8,1.9,0.4]|1 |[6.664693730316072E-14,0.9999999999870913,1.2842010001494045E-11] |
|[5.2,3.5,1.5,0.2]|1 |[5.519983601218073E-17,0.9999999999998658,1.3428034253525153E-13] |
|[5.2,4.1,1.5,0.1]|1 |[9.520667996704964E-15,0.9999999999999809,9.520667236660166E-15] |
|[5.4,3.4,1.7,0.2]|1 |[1.5037240722382337E-14,0.9999999999919748,8.01024821156934E-12] |
|[5.5,2.3,4.0,1.3]|2 |[0.12790930371263204,9.702982800125614E-16,0.8720906962873669] |
|[5.5,3.5,1.3,0.2]|1 |[7.495980013661814E-16,0.9999999999984027,1.5966275150172275E-12] |
|[5.7,2.8,4.5,1.3]|0 |[0.9627079132449172,6.238048595532193E-16,0.03729208675508215] |
+-----------------+----------+------------------------------------------------------------------+
only showing top 20 rows
Component 0 : weight is 0.410444
mu vector is [6.229890449633949,2.9365709066142216,5.119923101567097,1.875036196728866]
sigma matrix is 0.20106023109900695 0.061614617783844784 0.18615301343118426 0.12419808100465818
0.061614617783844784 0.08755324619180453 0.07600720617502173 0.07092234972645906
0.18615301343118426 0.07600720617502173 0.3190627635701519 0.2005911061068125
0.12419808100465818 0.07092234972645906 0.2005911061068125 0.18570154951117543
Component 1 : weight is 0.313131
mu vector is [5.016129011272877,3.4516129375085347,1.4354837490859222,0.24516112308927235]
sigma matrix is 0.13554637908317346 0.11368373014150743 0.011685744418836571 0.015723208692113348
0.11368373014150743 0.1399168236701324 0.009136331548362716 0.018314273558132532
0.011685744418836571 0.009136331548362716 0.018418296421506785 0.006461987075877571
0.015723208692113348 0.018314273558132532 0.006461987075877571 0.014089446355933986
Component 2 : weight is 0.276425
mu vector is [6.268895982220705,2.8164655748162817,4.672112358525022,1.4474090607905907]
sigma matrix is 0.7598437470441146 0.25905873612421504 0.8900973784969908 0.2622593125673333
0.25905873612421504 0.18040816341127572 0.2866224809874453 0.10562720921328374
0.8900973784969908 0.2866224809874453 1.1853685078670806 0.35992793746391144
0.2622593125673333 0.10562720921328374 0.35992793746391144 0.12216925649336183