# ruijiege
#
# 博客园 :: 首页 :: 博问 :: 闪存 :: 新随笔 :: 联系 :: 订阅 :: 管理
#coding=utf-8
import numpy as np
import os
import random
from tfidf_model import TfIdf

# a = np.array([0, 1, 2, 3, 1, 2, 2])
# print(a[[2,4,2,2,2,2]])
# # 设置随机种子
# random.seed(4)

class Kmeans:
    """K-means clustering of documents represented as TF-IDF vectors.

    A document's affinity to a cluster centre is measured with cosine
    similarity (see ``cal_dist``), so each document is assigned to the
    centre with the *highest* score.
    """

    def __init__(self, doc, k, max_iter):
        self.doc = doc              # raw documents (list of strings)
        self.k = k                  # number of clusters
        self.max_iter = max_iter    # upper bound on training iterations
        # Project-local vectoriser: builds the document-term matrix
        # exposed as ``self.tf_idf.tfidf`` after cal_tfidf().
        self.tf_idf = TfIdf(doc)
        self.tf_idf.cal_tfidf()

    def train(self):
        """Run k-means and return ``{cluster_id: array of document indices}``.

        Stops early once the assignment no longer changes, otherwise
        after ``max_iter`` iterations.
        """
        tfidf = self.tf_idf.tfidf
        # 1. Initialise the k centres with the first k documents.
        #    (Deterministic on purpose: the original comment claimed
        #    "random" init but the code always took documents 0..k-1;
        #    kept for reproducibility.)
        cluster_center = {i: tfidf[i] for i in range(self.k)}
        cluster_set = {i: np.array([], dtype=int) for i in range(self.k)}
        prev_assign = None
        for _ in range(self.max_iter):
            # 2. Cosine similarity of every document to every centre.
            doc_sim = np.array([[self.cal_dist(cluster_center[i], sent)
                                 for i in range(self.k)]
                                for sent in tfidf])
            # 3. Assign each document to its most *similar* centre
            #    (argmax, because higher cosine = closer).
            assign = np.argmax(doc_sim, axis=1)
            cluster_set = {i: np.argwhere(assign == i).reshape(-1)
                           for i in range(self.k)}
            # 4. Converged: the assignment did not change this round.
            if prev_assign is not None and np.array_equal(assign, prev_assign):
                break
            prev_assign = assign
            # 5. Recompute centres.  An empty cluster keeps its old
            #    centre — np.mean over an empty slice would yield NaN
            #    and silently corrupt every later iteration.
            cluster_center = {
                i: (np.mean(tfidf[cluster_set[i]], axis=0)
                    if cluster_set[i].size else cluster_center[i])
                for i in range(self.k)
            }
        return cluster_set

    @staticmethod
    def cal_dist(vec1, vec2):
        """Cosine similarity of two vectors, rounded to 4 decimal places."""
        return round(np.dot(vec1, vec2) / np.sqrt(np.dot(vec1, vec1) * np.dot(vec2, vec2)), 4)
    
if __name__ == "__main__":
    doc_dir = 'test_text'

    def _read_text(path):
        # Read one UTF-8 text file fully into memory.
        with open(path, encoding="utf-8") as handle:
            return handle.read()

    # Collect the raw text of every file in the corpus directory
    # (directory-listing order, same as the original loop).
    doc = [_read_text(os.path.join(doc_dir, name))
           for name in os.listdir(doc_dir)]
    kmeans = Kmeans(doc, 3, 100)
    kmeans.train()
    
    
# posted on 2021-11-30 17:08  哦哟这个怎么搞  阅读(27)  评论(0)  编辑  收藏  举报