http://f.dataguru.cn/spark-751832-1-1.html

 

我们可以利用PCA算法降低向量的维数，从而实现特征转换。
具体原理在《机器学习》课程中有详细的讲述，故此处直接介绍如何利用MLlib中的PCA算法进行特征转换。下面的示例演示了如何计算主成分并对向量进行降维，同时保留标签以便进行线性回归计算。

import org.apache.spark.mllib.regression.LinearRegressionWithSGD
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.feature.PCA

// Example: reduce feature dimensionality with PCA while keeping the labels,
// then compare linear regression trained on the raw features against linear
// regression trained on the PCA-projected features.
//
// Assumes `sc` is an existing SparkContext (e.g. provided by spark-shell).

// Each input line is parsed as "<label>,<f1> <f2> ... <fn>": the label comes
// before the comma, the features after it are separated by single spaces.
val data = sc.textFile("data/mllib/ridge-data/lpsa.data").map { line =>
  val parts = line.split(',')
  LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
}.cache()

// 60/40 train/test split with a fixed seed.
val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
val training = splits(0).cache()
val test = splits(1)

// Keep half as many principal components as there are original features.
// NOTE(review): the PCA model is fit on the full data set (training + test),
// as in the original snippet; fit on `training` only if leaking test-set
// statistics into the projection is a concern.
val pca = new PCA(training.first().features.size / 2).fit(data.map(_.features))

// Project the features of both splits; `copy` leaves the labels untouched.
val training_pca = training.map(p => p.copy(features = pca.transform(p.features)))
val test_pca = test.map(p => p.copy(features = pca.transform(p.features)))

val numIterations = 100

val model = LinearRegressionWithSGD.train(training, numIterations)
val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations)

// (prediction, label) pairs for the model trained on the raw features.
val valuesAndPreds = test.map { point =>
  val score = model.predict(point.features)
  (score, point.label)
}

// (prediction, label) pairs for the model trained on the projected features;
// it must be evaluated on the projected test set.
val valuesAndPreds_pca = test_pca.map { point =>
  val score = model_pca.predict(point.features)
  (score, point.label)
}

// Mean of the squared differences between prediction and label.
val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean()
val MSE_pca = valuesAndPreds_pca.map { case (v, p) => math.pow((v - p), 2) }.mean()

println("Means Squared Error="+MSE)
println("PCA Means Squared Error="+MSE_pca)