pyspark中的DenseVector和SparseVector

引用链接https://www.jianshu.com/p/80c98ae72db2

在pyspark中的vector有两种类型,一种是DenseVector,其与一般的列表或者array数组形式非常相似;另一种则是SparseVector,这种vector在保存数据的时候保存三个信息:向量长度,向量非零值的索引以及索引处的值。这两种向量也是可以相互转换的。

from pyspark.ml.linalg import DenseVector, Vector
import numpy as np

# Sample data: five (id, features) rows, each with a 4-element dense vector.
# Some entries are NaN.
NAN = float("NaN")

rows = [
    (1, DenseVector([57.0, 1.0, 0.0, 0.0])),
    (2, DenseVector([63.0, NAN, 0.0, 0.0])),
    (3, DenseVector([74.0, 1.0, 3.0, NAN])),
    (4, DenseVector([67.0, NAN, 0.0, 0.0])),
    (5, DenseVector([NAN, 1.0, NAN, NAN])),
]

# Distribute the rows through the SparkContext and name the two columns.
df = sc.parallelize(rows).toDF(["id", "features"])

df.show()
+---+------------------+
| id|          features|
+---+------------------+
|  1|[57.0,1.0,0.0,0.0]|
|  2|[63.0,NaN,0.0,0.0]|
|  3|[74.0,1.0,3.0,NaN]|
|  4|[67.0,NaN,0.0,0.0]|
|  5| [NaN,1.0,NaN,NaN]|
+---+------------------+

########################################
## DenseVector to SparseVector

import scipy.sparse
from pyspark.ml.linalg import Vectors, _convert_to_vector, VectorUDT, DenseVector
from pyspark.sql.functions import udf, col

def dense_to_sparse(vector):
    """Convert a vector into a SparseVector that stores only non-zero entries.

    Uses the public ``Vectors.sparse`` factory instead of the private
    ``_convert_to_vector`` helper, so no SciPy round-trip is required.

    Args:
        vector: An object whose ``toArray()`` returns a 1-D NumPy array
            (for example a ``DenseVector``), or ``None``.

    Returns:
        A sparse vector with the same length as the input, holding the
        indices and values of its non-zero entries; ``None`` when the input
        is ``None``.
    """
    if vector is None:
        # A null column value reaches a Python UDF as None; pass it through
        # instead of failing on ``None.toArray()``.
        return None

    values = vector.toArray()
    # NaN compares unequal to zero, so NaN entries are kept as stored values.
    nonzero = np.flatnonzero(values)
    return Vectors.sparse(values.size, nonzero, values[nonzero])

# Register the converter as a UDF whose result column has the vector type.
to_sparse = udf(dense_to_sparse, returnType=VectorUDT())

# Derive the sparse representation from the dense "features" column.
sparse_column = to_sparse(col("features"))
df = df.withColumn("sparse", sparse_column)

# Print full cell contents rather than the default truncated form.
df.show(truncate=False)
+---+------------------+--------------------------------+
|id |features          |sparse                          |
+---+------------------+--------------------------------+
|1  |[57.0,1.0,0.0,0.0]|(4,[0,1],[57.0,1.0])            |
|2  |[63.0,NaN,0.0,0.0]|(4,[0,1],[63.0,NaN])            |
|3  |[74.0,1.0,3.0,NaN]|(4,[0,1,2,3],[74.0,1.0,3.0,NaN])|
|4  |[67.0,NaN,0.0,0.0]|(4,[0,1],[67.0,NaN])            |
|5  |[NaN,1.0,NaN,NaN] |(4,[0,1,2,3],[NaN,1.0,NaN,NaN]) |
+---+------------------+--------------------------------+

###################################
## SparseVector to DenseVector

def sparse2dense(v):
    """Convert a vector into a DenseVector holding every element explicitly.

    Args:
        v: An object exposing ``toArray()`` (for example a ``SparseVector``),
            or ``None``.

    Returns:
        A ``DenseVector`` built from ``v.toArray()``; ``None`` when the input
        is ``None``.
    """
    if v is None:
        # A null column value reaches a Python UDF as None; pass it through
        # instead of failing on ``None.toArray()``.
        return None
    return DenseVector(v.toArray())

# Register the converter as a UDF whose result column has the vector type.
s2d = udf(sparse2dense, VectorUDT())

# Expand the "sparse" column back into a dense vector column.
df = df.withColumn('dense', s2d(col('sparse')))
# show() truncates cells to 20 characters by default; disable truncation so
# the full sparse vectors are printed, as in the table below.
df.show(truncate=False)
+---+------------------+--------------------------------+------------------+
|id |features          |sparse                          |dense             |
+---+------------------+--------------------------------+------------------+
|1  |[57.0,1.0,0.0,0.0]|(4,[0,1],[57.0,1.0])            |[57.0,1.0,0.0,0.0]|
|2  |[63.0,NaN,0.0,0.0]|(4,[0,1],[63.0,NaN])            |[63.0,NaN,0.0,0.0]|
|3  |[74.0,1.0,3.0,NaN]|(4,[0,1,2,3],[74.0,1.0,3.0,NaN])|[74.0,1.0,3.0,NaN]|
|4  |[67.0,NaN,0.0,0.0]|(4,[0,1],[67.0,NaN])            |[67.0,NaN,0.0,0.0]|
|5  |[NaN,1.0,NaN,NaN] |(4,[0,1,2,3],[NaN,1.0,NaN,NaN]) |[NaN,1.0,NaN,NaN] |
+---+------------------+--------------------------------+------------------+
posted @ 2022-06-30 11:06  半个程序猿Cohen_Lee  阅读(272)  评论(0)  编辑  收藏  举报