
RDD-based Dimensionality Reduction APIs

Overview

Dimensionality reduction is the process of reducing the number of variables under consideration. It extracts latent, higher-value features from the raw, noisy features, so the data can be compressed while affecting the algorithm's results as little as possible.

Singular Value Decomposition (SVD)

A good blog post on the underlying math: https://www.cnblogs.com/pinard/p/6251584.html
In Spark MLlib, RowMatrix supports computing the SVD directly, so you can simply call RowMatrix.computeSVD(k: Int, computeU: Boolean = false, rCond: Double = 1e-9).

package com.SVDTest

import org.apache.spark.mllib.linalg
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.{Matrix, SingularValueDecomposition, Vectors}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.rdd.RDD

object Main {
  def main(args: Array[String]): Unit = {
    //TODO set up the Spark environment
    val conf: SparkConf = new SparkConf().setAppName("test").setMaster("local[*]")
    val sc = new SparkContext(conf)
    //TODO data operations
    //input data
    val dataArr: Array[linalg.Vector] = Array(
      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
    )
    val data: RDD[linalg.Vector] = sc.makeRDD(dataArr)
    val mat = new RowMatrix(data)

    //compute the top 5 singular values and the corresponding singular vectors
    val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(5, computeU = true)
    val U: RowMatrix = svd.U
    val s: linalg.Vector = svd.s
    val V: Matrix = svd.V
    //TODO stop the Spark environment
    sc.stop()
  }
}
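
The example above only computes the factorization. To inspect the result, you can print the singular values, the right singular vectors V, and the rows of U. A minimal sketch of lines that could be placed before sc.stop() in the main method above:

    //singular values, in descending order
    println("Singular values: " + s)
    //V is a small local matrix of right singular vectors
    println("Right singular vectors V:\n" + V)
    //U is a distributed RowMatrix of left singular vectors
    U.rows.foreach(println)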

PCA

In Spark MLlib, RowMatrix also supports computing PCA via RowMatrix.computePrincipalComponents.

package com.PCATest01


import org.apache.spark.mllib.linalg
import org.apache.spark.mllib.linalg.{Matrix, Vectors}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Main {
  def main(args: Array[String]): Unit = {
    //TODO set up the Spark environment
    val conf: SparkConf = new SparkConf().setAppName("test").setMaster("local[*]")
    val sc = new SparkContext(conf)
    //TODO data operations
    //input data
    val dataArr: Array[linalg.Vector] = Array(
      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
    )
    val dataRdd: RDD[linalg.Vector] = sc.makeRDD(dataArr)
    val mat = new RowMatrix(dataRdd)
    //compute the top 4 principal components
    val pc: Matrix = mat.computePrincipalComponents(4)
    //project the data into the lower-dimensional space
    val y: RowMatrix = mat.multiply(pc)
    //print the results
    println("Projection matrix: " + pc.toString())
    println("Projected rows:")
    y.rows.foreach(println)
    //TODO stop the Spark environment
    sc.stop()
  }
}
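
If you also want to know how much variance each principal component explains, RowMatrix additionally offers computePrincipalComponentsAndExplainedVariance (available since Spark 1.6). A minimal sketch reusing the mat matrix from the example above; the variable names are illustrative:

    //compute the top 4 principal components together with the fraction of
    //total variance explained by each component
    val (pcMatrix, explainedVariance) = mat.computePrincipalComponentsAndExplainedVariance(4)
    println("Explained variance per component: " + explainedVariance)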

The code below computes PCA on a feature matrix with labels, projecting the features while keeping each point's label.

package com.PCATest02

import org.apache.spark.mllib.feature.{PCA, PCAModel}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Main {
  def main(args: Array[String]): Unit = {
    //TODO set up the Spark environment
    val conf: SparkConf = new SparkConf().setAppName("test").setMaster("local[*]")
    val sc = new SparkContext(conf)
    //TODO data operations
    val data: RDD[LabeledPoint] = sc.makeRDD(Seq(
      new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 1)),
      new LabeledPoint(1, Vectors.dense(1, 1, 0, 1, 0)),
      new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0)),
      new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 0)),
      new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0))))

    //fit a PCA model that keeps the top 3 principal components
    val pc: PCAModel = new PCA(3).fit(data.map(_.features))
    //project each feature vector and keep its label
    val newData: RDD[LabeledPoint] = data.map(p => p.copy(features = pc.transform(p.features)))
    //print the results
    newData.foreach(println)
    //TODO stop the Spark environment
    sc.stop()
  }
}
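
Because PCAModel is a VectorTransformer, the fitted model can also be applied to an RDD of vectors in a single call rather than row by row, which is convenient when there are no labels to carry along. A minimal sketch reusing data and pc from the example above:

    //transform the whole RDD of feature vectors at once
    val projected = pc.transform(data.map(_.features))
    projected.foreach(println)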