Spark RowMatrix 分布式矩阵
// RowMatrix walkthrough: DataFrame -> RDD[Vector] -> RowMatrix -> DataFrame.
// NOTE(review): this looks like a spark-shell session (`toDF` is used without an explicit
// implicits import) — outside the shell, add `import spark.implicits._` first.
// Lines starting with "// >" are the output recorded in the original session;
// RDD ids, `resN` names and object hashes will differ between runs.
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix

val df1 = Seq(
  (1.0, 2.0, 3.0),
  (1.1, 2.1, 3.1),
  (1.2, 2.2, 3.2)).toDF("c1", "c2", "c3")
// > df1: org.apache.spark.sql.DataFrame = [c1: double, c2: double ... 1 more field]

df1.show()
// > +---+---+---+
// > | c1| c2| c3|
// > +---+---+---+
// > |1.0|2.0|3.0|
// > |1.1|2.1|3.1|
// > |1.2|2.2|3.2|
// > +---+---+---+

// Convert the DataFrame to an RDD[Vector].
// All three columns are typed double, so read them with the typed getter instead of
// round-tripping each cell through a String.
val rowsVector = df1.rdd.map { row =>
  Vectors.dense(row.getDouble(0), row.getDouble(1), row.getDouble(2))
}
// > rowsVector: org.apache.spark.rdd.RDD[org.apache.spark.mllib.linalg.Vector] = MapPartitionsRDD[4] at map

// Create a RowMatrix from an RDD[Vector].
val mat1: RowMatrix = new RowMatrix(rowsVector)
// > mat1: org.apache.spark.mllib.linalg.distributed.RowMatrix = org.apache.spark.mllib.linalg.distributed.RowMatrix@7ba821ef

// Get its size.
val m = mat1.numRows()
// > m: Long = 3
val n = mat1.numCols()
// > n: Long = 3

// Convert the RowMatrix back to a DataFrame.
// Vector elements are already Double, so no further conversion is needed.
val resDF = mat1.rows.map { v => (v(0), v(1), v(2)) }.toDF("c1", "c2", "c3")
// > resDF: org.apache.spark.sql.DataFrame = [c1: double, c2: double ... 1 more field]

resDF.show()
// > +---+---+---+
// > | c1| c2| c3|
// > +---+---+---+
// > |1.0|2.0|3.0|
// > |1.1|2.1|3.1|
// > |1.2|2.2|3.2|
// > +---+---+---+

// Peek at the first rows. `take(10)` fetches only what is needed, whereas
// `collect().take(10)` would first ship the entire matrix to the driver.
mat1.rows.take(10)
// > res3: Array[org.apache.spark.mllib.linalg.Vector] = Array([1.0,2.0,3.0], [1.1,2.1,3.1], [1.2,2.2,3.2])
CoordinateMatrix坐标矩阵
// CoordinateMatrix walkthrough: DataFrame of (row, col, value) -> RDD[MatrixEntry]
// -> CoordinateMatrix -> DataFrame (with and without the row index).
// NOTE(review): this looks like a spark-shell session (`toDF` is used without an explicit
// implicits import) — outside the shell, add `import spark.implicits._` first.
// Lines starting with "// >" are the output recorded in the original session;
// RDD ids, `resN` names and object hashes will differ between runs.
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry}

// Column 1: row coordinate; column 2: column coordinate; column 3: matrix element.
val df = Seq(
  (0, 0, 1.1), (0, 1, 1.2), (0, 2, 1.3),
  (1, 0, 2.1), (1, 1, 2.2), (1, 2, 2.3),
  (2, 0, 3.1), (2, 1, 3.2), (2, 2, 3.3),
  (3, 0, 4.1), (3, 1, 4.2), (3, 2, 4.3)).toDF("row", "col", "value")
// > df: org.apache.spark.sql.DataFrame = [row: int, col: int ... 1 more field]

df.show()
// > +---+---+-----+
// > |row|col|value|
// > +---+---+-----+
// > |  0|  0|  1.1|
// > |  0|  1|  1.2|
// > |  0|  2|  1.3|
// > |  1|  0|  2.1|
// > |  1|  1|  2.2|
// > |  1|  2|  2.3|
// > |  2|  0|  3.1|
// > |  2|  1|  3.2|
// > |  2|  2|  3.3|
// > |  3|  0|  4.1|
// > |  3|  1|  4.2|
// > |  3|  2|  4.3|
// > +---+---+-----+

// Build the matrix entries.
// The columns are typed (int, int, double), so use the typed getters rather than
// converting each cell to a String and parsing it back.
val entr = df.rdd.map { row =>
  MatrixEntry(row.getInt(0).toLong, row.getInt(1).toLong, row.getDouble(2))
}
// > entr: org.apache.spark.rdd.RDD[org.apache.spark.mllib.linalg.distributed.MatrixEntry] = MapPartitionsRDD[20] at map

// Build the coordinate matrix.
val mat: CoordinateMatrix = new CoordinateMatrix(entr)
// > mat: org.apache.spark.mllib.linalg.distributed.CoordinateMatrix = org.apache.spark.mllib.linalg.distributed.CoordinateMatrix@5381deec

mat.numRows()
// > res5: Long = 4
mat.numCols()
// > res6: Long = 3

// Peek at the first entries. `take(10)` fetches only what is needed, whereas
// `collect().take(10)` would first ship every entry to the driver.
mat.entries.take(10)
// > res7: Array[org.apache.spark.mllib.linalg.distributed.MatrixEntry] = Array(MatrixEntry(0,0,1.1), MatrixEntry(0,1,1.2), MatrixEntry(0,2,1.3), MatrixEntry(1,0,2.1), MatrixEntry(1,1,2.2), MatrixEntry(1,2,2.3), MatrixEntry(2,0,3.1), MatrixEntry(2,1,3.2), MatrixEntry(2,2,3.3), MatrixEntry(3,0,4.1))

// Convert the coordinate matrix to a DataFrame that carries a row index;
// the index is the row coordinate. Vector elements are already Double.
val t = mat.toIndexedRowMatrix().rows.map { indexedRow =>
  val v = indexedRow.vector
  (indexedRow.index, v(0), v(1), v(2))
}
// > t: org.apache.spark.rdd.RDD[(Long, Double, Double, Double)] = MapPartitionsRDD[33] at map

t.toDF().show()
// > +---+---+---+---+
// > | _1| _2| _3| _4|
// > +---+---+---+---+
// > |  0|1.1|1.2|1.3|
// > |  1|2.1|2.2|2.3|
// > |  2|3.1|3.2|3.3|
// > |  3|4.1|4.2|4.3|
// > +---+---+---+---+

// Convert the coordinate matrix to a DataFrame without the row index.
// The recorded output below shows the rows coming back in a different order than their
// row coordinates, so use the indexed variant above when row identity matters.
val t1 = mat.toRowMatrix().rows.map { v => (v(0), v(1), v(2)) }
// > t1: org.apache.spark.rdd.RDD[(Double, Double, Double)] = MapPartitionsRDD[26] at map

t1.toDF().show()
// > +---+---+---+
// > | _1| _2| _3|
// > +---+---+---+
// > |1.1|1.2|1.3|
// > |3.1|3.2|3.3|
// > |2.1|2.2|2.3|
// > |4.1|4.2|4.3|
// > +---+---+---+