Computing KMeans with PySpark
1. Reading the data from CSV

# The header option indicates whether the first row of the file contains the column names
dataset = spark.read.format("csv").option("header", True).load("video_info.csv")

Each CSV row has the structure video_id,"feature1,feature2,...,featureN": a video id followed by a quoted, comma-separated list of that video's features.
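The snippet above assumes a live SparkSession bound to the name spark, as in the pyspark shell. A minimal sketch of creating one in a standalone script (the app name is a placeholder, not from the original):

from pyspark.sql import SparkSession

# "video-kmeans" is a placeholder app name.
spark = SparkSession.builder.appName("video-kmeans").getOrCreate()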
2. Collecting all features and building a feature-to-index dictionary, used later to construct the feature vectors
def myfunc(a, b):
    # One plausible definition: merge two feature lists, dropping duplicates.
    return list(set(a) | set(b))

# Key every row with the same constant so reduceByKey folds all feature lists into one.
rdd = dataset.rdd.map(lambda x: ("1", x["result"].split(","))) \
    .reduceByKey(myfunc) \
    .map(lambda x: x[1]) \
    .collect()
features = rdd[0]
feature2index = {item: index for index, item in enumerate(features)}
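For intuition, a toy illustration (not from the real data): if two rows carry the feature strings "a,b" and "b,c", the pipeline above produces

features = ["a", "b", "c"]                # order may vary, since sets are unordered
feature2index = {"a": 0, "b": 1, "c": 2}  # feature name -> position in the vector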
3. Converting the data into KMeans input format
from pyspark.ml.linalg import Vectors

len_features = len(features)

def densevector(x):
    """
    One-hot encode a (video_id, feature_list) pair into a (label, DenseVector) pair.
    :param x: tuple of (video_id, list of feature strings)
    :return: tuple of (video_id, DenseVector)
    """
    tmp_feature = x[1]
    vec = [0.0 for _ in range(len_features)]
    for item in tmp_feature:
        vec[feature2index[item]] = 1.0
    return x[0], Vectors.dense(vec)

# Build (video_id, feature_list) pairs from the raw dataset for densevector to consume.
data = dataset.rdd.map(lambda x: (x["video_id"], x["result"].split(",")))
data = data.map(densevector)
df = spark.createDataFrame(data, ["label", "features"])
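If the feature vocabulary is large and each video carries only a few features, a sparse encoding avoids materializing mostly-zero vectors. A sketch under that assumption (sparsevector is a hypothetical drop-in alternative to densevector):

def sparsevector(x):
    # Keep only the positions that would be 1.0; KMeans accepts sparse vectors too.
    idx = sorted({feature2index[item] for item in x[1]})
    return x[0], Vectors.sparse(len_features, idx, [1.0] * len(idx))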
4. Choosing the best cluster count k via the silhouette score
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.clustering import KMeans

silhouette_score = []
evaluator = ClusteringEvaluator(predictionCol='prediction', featuresCol='features',
                                metricName='silhouette', distanceMeasure='squaredEuclidean')
# Fit one model per candidate k and record its silhouette score.
for i in range(5, 12):
    algo = KMeans(featuresCol="features", k=i)
    model = algo.fit(df)
    output = model.transform(df)
    score = evaluator.evaluate(output)
    silhouette_score.append(score)
    print(f"Silhouette score for k={i}: {score}")
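To turn the scores into an actual choice of k, take the candidate with the highest silhouette (the k - 5 offset mirrors range(5, 12) above):

best_k = max(range(5, 12), key=lambda k: silhouette_score[k - 5])
print("Best k by silhouette:", best_k)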
5. Running KMeans
kmeans = KMeans().setK(20).setSeed(1)
model = kmeans.fit(df.select("features"))
transformed = model.transform(df)
# show() prints the rows itself and returns None, so don't wrap it in print().
transformed.show()
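After fitting, the learned centroids can be inspected directly; clusterCenters() returns one array per cluster:

for center in model.clusterCenters():
    print(center)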
References
1. Why k-means must use Euclidean distance: https://stats.stackexchange.com/questions/81481/why-does-k-means-clustering-algorithm-use-only-euclidean-distance-metric