# coding=utf-8
"""K-means clustering of documents over their TF-IDF vectors.

Documents are compared with cosine similarity; cluster centres are the
mean TF-IDF vector of their members.
"""
import numpy as np
import os
import random

from tfidf_model import TfIdf


class Kmeans:
    """Cluster documents into ``k`` groups via K-means on TF-IDF vectors."""

    def __init__(self, doc, k, max_iter):
        """
        :param doc: list of raw document strings to cluster
        :param k: number of clusters
        :param max_iter: upper bound on K-means iterations
        """
        self.doc = doc
        self.k = k
        self.max_iter = max_iter
        # Build the TF-IDF matrix once; self.tf_idf.tfidf is indexed below
        # as a 2-D array (one row per document).
        self.tf_idf = TfIdf(doc)
        self.tf_idf.cal_tfidf()

    def train(self):
        """Run K-means, printing the iteration count and cluster sets."""
        # 1. Initialise the k cluster centres with the first k document vectors.
        #    NOTE(review): deterministic seeding — a random sample of k rows
        #    would be the more usual initialisation; confirm this is intended.
        cluster_center = {i: self.tf_idf.tfidf[i] for i in range(self.k)}
        prev_assignment = None
        kmean_iter = 1
        while True:
            # 2. Similarity of every document to every cluster centre.
            doc_dist = np.array(
                [[self.cal_dist(cluster_center[i], sent) for i in range(self.k)]
                 for sent in self.tf_idf.tfidf])
            # argmax, not argmin: cal_dist is a cosine *similarity*, so the
            # largest value identifies the nearest centre.
            doc_dist_argsort = np.argmax(doc_dist, axis=1)
            # 3. Partition document indices by their nearest centre.
            cluster_set = {i: np.argwhere(doc_dist_argsort == i).reshape(-1)
                           for i in range(self.k)}
            # 4. Recompute each centre as the mean of its members.  If a
            #    cluster emptied out, keep its previous centre — np.mean over
            #    an empty slice would yield NaN and poison later iterations.
            cluster_center = {
                i: (np.mean(self.tf_idf.tfidf[cluster_set[i]], axis=0)
                    if cluster_set[i].size else cluster_center[i])
                for i in range(self.k)}
            # 5. Stop early once the assignment is stable (converged), or
            #    after max_iter iterations at the latest.
            if prev_assignment is not None and np.array_equal(
                    prev_assignment, doc_dist_argsort):
                break
            prev_assignment = doc_dist_argsort
            if kmean_iter > self.max_iter:
                break
            kmean_iter += 1
            print(kmean_iter)
            print(cluster_set)

    @staticmethod
    def cal_dist(vec1, vec2):
        """Cosine similarity of two vectors, rounded to 4 decimal places.

        Despite the name ("dist"), this is a *similarity*: larger values
        mean the vectors are closer.  Callers must use argmax, not argmin.
        """
        return round(
            np.dot(vec1, vec2) / np.sqrt(np.dot(vec1, vec1) * np.dot(vec2, vec2)),
            4)


if __name__ == "__main__":
    # Read every file under test_text/ as one document.
    doc_dir = 'test_text'
    doc = []
    for file_name in os.listdir(doc_dir):
        file_path = os.path.join(doc_dir, file_name)
        with open(file_path, encoding="utf-8") as f:
            doc.append(f.read())
    kmeans = Kmeans(doc, 3, 100)
    kmeans.train()