PySpark中的DenseVector和SparseVector
引用链接:https://www.jianshu.com/p/80c98ae72db2
在pyspark中的vector有两种类型,一种是DenseVector,其与一般的列表或者array数组形式非常相似;另一种则是SparseVector,这种vector在保存数据的时候保存三个信息:向量长度,向量非零值的索引以及索引处的值。这两种向量也是可以相互转换的。
from pyspark.ml.linalg import DenseVector, Vector
import numpy as np
# Raw feature values for the sample rows; several entries are NaN on purpose
# so the later conversions can be checked against missing values.
feature_rows = [
    [57.0, 1.0, 0.0, 0.0],
    [63.0, float("NaN"), 0.0, 0.0],
    [74.0, 1.0, 3.0, float("NaN")],
    [67.0, float("NaN"), 0.0, 0.0],
    [float("NaN"), 1.0, float("NaN"), float("NaN")],
]
# Pair every feature list with a 1-based id and wrap it in a DenseVector.
records = [
    (row_id, DenseVector(values))
    for row_id, values in enumerate(feature_rows, start=1)
]
# `sc` is not defined in this snippet; presumably an existing SparkContext.
df = sc.parallelize(records).toDF(["id", "features"])
df.show()
+---+------------------+
| id| features|
+---+------------------+
| 1|[57.0,1.0,0.0,0.0]|
| 2|[63.0,NaN,0.0,0.0]|
| 3|[74.0,1.0,3.0,NaN]|
| 4|[67.0,NaN,0.0,0.0]|
| 5| [NaN,1.0,NaN,NaN]|
+---+------------------+
########################################
## DenseVector to SparseVector
import scipy.sparse
from pyspark.ml.linalg import Vectors, _convert_to_vector, VectorUDT, DenseVector
from pyspark.sql.functions import udf, col
def dense_to_sparse(vector):
    """Convert a vector into a sparse vector that stores only non-zero entries.

    Uses the public ``Vectors.sparse`` factory rather than the private
    ``_convert_to_vector`` helper, so no intermediate SciPy matrix is needed.

    Args:
        vector: An object whose ``toArray()`` returns a 1-D NumPy array
            (for example a ``DenseVector``), or ``None`` for a null cell.

    Returns:
        A sparse vector of the same length holding the non-zero entries of
        ``vector``, or ``None`` when ``vector`` is ``None``.
    """
    if vector is None:
        # Null cells reach a Python UDF as None; pass them through unchanged.
        return None
    values = vector.toArray()
    # nonzero() returns positions in ascending order. NaN compares unequal to
    # zero, so NaN entries are kept as explicit values.
    nonzero_idx = values.nonzero()[0]
    return Vectors.sparse(values.size, nonzero_idx, values[nonzero_idx])
# Register the converter as a UDF whose result column is vector-typed.
to_sparse = udf(f=dense_to_sparse, returnType=VectorUDT())
# Derive the sparse representation from the existing "features" column.
sparse_column = to_sparse(col("features"))
df = df.withColumn(colName="sparse", col=sparse_column)
# Print every cell in full instead of cutting long values short.
df.show(truncate=False)
+---+------------------+--------------------------------+
|id |features |sparse |
+---+------------------+--------------------------------+
|1 |[57.0,1.0,0.0,0.0]|(4,[0,1],[57.0,1.0]) |
|2 |[63.0,NaN,0.0,0.0]|(4,[0,1],[63.0,NaN]) |
|3 |[74.0,1.0,3.0,NaN]|(4,[0,1,2,3],[74.0,1.0,3.0,NaN])|
|4 |[67.0,NaN,0.0,0.0]|(4,[0,1],[67.0,NaN]) |
|5 |[NaN,1.0,NaN,NaN] |(4,[0,1,2,3],[NaN,1.0,NaN,NaN]) |
+---+------------------+--------------------------------+
###################################
## SparseVector to DenseVector
def sparse2dense(v):
    """Convert a vector into a ``DenseVector`` with every entry materialised.

    Args:
        v: An object whose ``toArray()`` returns the full array of values
            (for example a ``SparseVector``), or ``None`` for a null cell.

    Returns:
        A ``DenseVector`` built from ``v.toArray()``, or ``None`` when ``v``
        is ``None``.
    """
    if v is None:
        # Null cells reach a Python UDF as None; pass them through unchanged
        # instead of failing on the missing toArray attribute.
        return None
    return DenseVector(v.toArray())
# Register the converter as a UDF whose result column is vector-typed.
s2d = udf(sparse2dense, VectorUDT())
df = df.withColumn('dense', s2d(col('sparse')))
# show() shortens cells longer than 20 characters by default; turn that off so
# the wide "sparse" column is printed in full.
df.show(truncate=False)
+---+------------------+--------------------------------+------------------+
|id |features |sparse |dense |
+---+------------------+--------------------------------+------------------+
|1 |[57.0,1.0,0.0,0.0]|(4,[0,1],[57.0,1.0]) |[57.0,1.0,0.0,0.0]|
|2 |[63.0,NaN,0.0,0.0]|(4,[0,1],[63.0,NaN]) |[63.0,NaN,0.0,0.0]|
|3 |[74.0,1.0,3.0,NaN]|(4,[0,1,2,3],[74.0,1.0,3.0,NaN])|[74.0,1.0,3.0,NaN]|
|4 |[67.0,NaN,0.0,0.0]|(4,[0,1],[67.0,NaN]) |[67.0,NaN,0.0,0.0]|
|5 |[NaN,1.0,NaN,NaN] |(4,[0,1,2,3],[NaN,1.0,NaN,NaN]) |[NaN,1.0,NaN,NaN] |
+---+------------------+--------------------------------+------------------+