K-means Clustering of Chinese Text
Clustering Chinese text documents with k-means.
Principle:
K is the number of groups the original data is divided into, and "means" refers to the mean point. The core of K-Means is to gather a set of data points into K clusters; each cluster has a centre point called the mean point, and every point in a cluster is closer to its own cluster's mean point than to the mean point of any other cluster.
Implementation steps:
1. Choose k initial cluster centres.
2. Repeat:
   reassign every data object to the nearest of the k cluster centres, forming k clusters;
   recompute the centre of each cluster.
3. Stop when the cluster centres no longer change; the clustering is then complete (a minimal sketch of this loop follows these steps).
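For reference, a minimal NumPy sketch of the loop described above, run on made-up 2-D points (toy data only, unrelated to the text-clustering listings below; empty clusters are not handled):

import numpy as np

np.random.seed(0)
points = np.random.rand(100, 2)   # toy data: 100 random points in 2-D
k = 3
# step 1: pick k existing points as the initial centres
centers = points[np.random.choice(len(points), k, replace=False)]

while True:
    # step 2a: assign every point to its nearest centre
    dists = np.linalg.norm(points[:, None, :] - centers[None, :, :], axis=2)
    labels = dists.argmin(axis=1)
    # step 2b: recompute every centre as the mean of its cluster
    new_centers = np.array([points[labels == i].mean(axis=0) for i in range(k)])
    # step 3: stop once the centres no longer move
    if np.allclose(new_centers, centers):
        break
    centers = new_centers

print(centers)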
Two approaches:
① Using scikit-learn (CountVectorizer + TfidfTransformer + KMeans) directly:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.cluster import KMeans
from sklearn import metrics
import numpy as np
import jieba
from DBUtils import update_keyword


def easy_get_parameter_k_means():
    # Read one keyword document per line of keyword.txt
    data = []
    file = open("keyword.txt", encoding='utf-8')
    for post in file:
        data.append(post.replace('\n', ''))
    datas = data

    # Segment with jieba, then build the term-count and TF-IDF matrices
    vec = CountVectorizer()
    X = vec.fit_transform([" ".join(jieba.cut(a)) for a in data])
    tf = TfidfTransformer()
    X = tf.fit_transform(X.toarray())
    data = X.toarray()

    test_score = []
    n_clusters_end = 20    # number of clusters (upper bound of the sweep)
    n_clusters_start = 20  # number of clusters (lower bound of the sweep)
    while n_clusters_start <= n_clusters_end:
        km = KMeans(n_clusters=n_clusters_start)
        km.fit(data)
        clusters = km.labels_.tolist()
        score = metrics.silhouette_score(X=X, labels=clusters)
        # size and label of the largest cluster
        num = sorted([(np.sum([1 for a in clusters if a == i]), i) for i in set(clusters)])[-1]
        test_score.append([n_clusters_start, score, num[0], num[1]])
        # print([n_clusters_start, score, num[0], num[1]])  # print the score
        n_clusters_start += 1

    # Write the cluster id of every keyword back to the database
    for i in range(0, 20):
        result = []
        for index in range(len(clusters)):
            if clusters[index] == i:
                res = datas[index]
                update_keyword(res, str(i))
                print("Updated keyword", res, "to cluster", i)
                result.append(res)
        # print("Cluster", i, "contains", len(result), "items")
    return clusters


clusters = easy_get_parameter_k_means()  # call once; the call also updates the database
print("arrs", clusters)
print("arrs[length]", len(clusters))
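Note that n_clusters_start and n_clusters_end are both set to 20 in the listing above, so the while loop only ever evaluates a single k. To actually search for a good number of clusters with the same silhouette criterion, a sketch could look like this (assuming data is the TF-IDF array built as in the listing):

# Assumes `data` is the TF-IDF array built as in the listing above.
from sklearn.cluster import KMeans
from sklearn import metrics

scores = []
for k in range(2, 21):                      # silhouette needs at least 2 clusters
    labels = KMeans(n_clusters=k).fit(data).labels_
    scores.append((metrics.silhouette_score(data, labels), k))

best_score, best_k = max(scores)            # highest silhouette score wins
print("best k by silhouette:", best_k, "score:", best_score)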
② This method builds the vocabulary space by reading multiple files, but it runs very slowly when there are many data files.
It extracts the term vectors, computes TF-IDF weights, and then uses the k-means algorithm to find the cluster centres, assign every document to a cluster, and record each point's distance to its cluster centre. (In the listing below, the active code reads keyword.txt line by line; the multi-file variant is kept in the commented-out block at the end.)
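The listing computes TF-IDF by hand. On a toy term-count matrix (made-up numbers), the same weighting scheme it uses, idf = log(N / document frequency) applied to per-document normalised counts, looks like this:

import numpy as np

# Toy term-count matrix: 3 documents x 4 vocabulary terms (made-up counts).
counts = np.array([[2., 0., 1., 0.],
                   [1., 1., 0., 0.],
                   [0., 2., 0., 3.]])

df = (counts > 0).sum(axis=0)                     # document frequency of every term
idf = np.log(counts.shape[0] / df)                # idf = log(N / df), as in the listing below
tf = counts / counts.sum(axis=1, keepdims=True)   # counts normalised per document
tfidf = tf * idf                                  # one weight per document/term pair
print(tfidf)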
import os
import math
import jieba
import numpy as np
from numpy import *


def read_from_file(file_name):
    # Read the raw text of a file
    with open(file_name, "r", encoding='UTF8') as fp:
        words = fp.read()
    return words


def stop_words(stop_word_file):
    # Build the stop-word set from the stop-word file
    words = read_from_file(stop_word_file)
    result = jieba.cut(words)
    new_words = []
    for r in result:
        new_words.append(r)
    return set(new_words)


def del_stop_words(words, stop_words_set):
    # words is a document that has not had stop words removed yet;
    # returns the segmented document with stop words removed
    result = jieba.cut(words)
    new_words = []
    for r in result:
        if r not in stop_words_set:
            new_words.append(r)
    return new_words


def get_all_vector(stop_words_set):
    # Treat every line of keyword.txt as one document and collect the vocabulary
    docs = []
    word_set = set()
    file = open("keyword.txt", encoding='utf-8')
    for post in file:
        doc = del_stop_words(post, stop_words_set)
        docs.append(doc)
        word_set |= set(doc)

    word_set = list(word_set)
    docs_vsm = []
    for doc in docs:
        temp_vector = []
        for word in word_set:
            temp_vector.append(doc.count(word) * 1.0)
        docs_vsm.append(temp_vector)

    docs_matrix = np.array(docs_vsm)
    print("docs_matrix:", docs_matrix)

    # Document frequency of every term -> inverse document frequency
    column_sum = [float(len(np.nonzero(docs_matrix[:, i])[0])) for i in range(docs_matrix.shape[1])]
    column_sum = np.array(column_sum)
    column_sum = docs_matrix.shape[0] / column_sum
    idf = np.log(column_sum)
    idf = np.diag(idf)

    # Note: everything below is matrix arithmetic, not scalar arithmetic.
    # Normalise every row to term frequencies (in place, so docs_matrix is actually updated)
    for doc_v in docs_matrix:
        if doc_v.sum() != 0:
            doc_v /= doc_v.sum()
    tfidf = np.dot(docs_matrix, idf)
    print("idf:", tfidf)

    # Write the TF-IDF matrix to tezheng.txt ("w" overwrites the file on every run)
    f = "tezheng.txt"
    with open(f, "w", encoding='utf8') as file:
        for i in tfidf:
            for j in i:
                datafl = str(format(float(j), '.2f'))
                file.write(datafl + "\t")
            file.write("\n")


def loadDataSet(fileName):
    dataSet = []  # start with an empty list
    fr = open(fileName)
    for line in fr.readlines():
        # split each line and map every element to float
        curLine = line.strip().split('\t')
        fltLine = list(map(float, curLine))
        dataSet.append(fltLine)
    return mat(dataSet)


def randCent(dataSet, k):
    # Create k random centroids within the bounds of each dimension
    n = shape(dataSet)[1]
    centroids = mat(zeros((k, n)))
    for j in range(n):
        minJ = min(dataSet[:, j])
        rangeJ = float(max(dataSet[:, j]) - minJ)
        centroids[:, j] = mat(minJ + rangeJ * random.rand(k, 1))
    return centroids


# Alternative initialisation kept from the original: sample k existing data points as centroids.
# def randCent(dataSet, k):
#     m, n = dataSet.shape
#     centroids = np.zeros((k, n))
#     for i in range(k):
#         index = int(np.random.uniform(0, m))
#         centroids[i, :] = dataSet[index, :]
#     return centroids


def distEclud(vecA, vecB):
    # Euclidean distance between two row vectors
    return math.sqrt(sum(power(vecA - vecB, 2)))


# dataSet: sample points; k: number of clusters
# distMeas: distance measure, Euclidean by default
# createCent: how the initial centroids are chosen
def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    m = shape(dataSet)[0]                 # number of samples
    clusterAssment = mat(zeros((m, 2)))   # m x 2 matrix
    centroids = createCent(dataSet, k)    # initialise k centroids
    clusterChanged = True
    while clusterChanged:                 # stop once no assignment changes
        clusterChanged = False
        for i in range(m):
            minDist = inf
            minIndex = -1
            for j in range(k):            # find the nearest centroid
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            # column 0: assigned centroid, column 1: squared distance
            clusterAssment[i, :] = minIndex, minDist ** 2
        print(centroids)
        # move every centroid to the mean of its cluster
        # (an empty cluster would produce a nan centroid; not handled here)
        for cent in range(k):
            ptsInClust = dataSet[nonzero(clusterAssment[:, 0].A == cent)[0]]
            centroids[cent, :] = mean(ptsInClust, axis=0)
    return centroids, clusterAssment


if __name__ == '__main__':
    wenzhang = read_from_file('keyword.txt')
    wenzhang1 = stop_words('stopword.txt')
    wenzhang2 = del_stop_words(wenzhang, wenzhang1)
    wenzhang3 = get_all_vector(wenzhang1)
    dataSet = loadDataSet('tezheng.txt')
    centroids, clusterAssment = kMeans(dataSet, 10, distMeas=distEclud, createCent=randCent)
    print("centroids:", centroids)
    print("clusterAssment :", clusterAssment)
    print("clusterAssmentlengh :", len(clusterAssment))


# ------------------------------------------------------------------
# Multi-file variant, kept from the original commented-out block.
# It builds the vocabulary from one file per document; the helper
# functions not shown here are identical to the ones above.
'''
def file_name(file_dir):
    filesname = []
    for root, dirs, files in os.walk(file_dir):
        for file in files:
            filename = 'keywordfile/' + file
            filesname.append(filename)
    print("filesname length:", len(filesname))
    return filesname


def get_all_vector(file_path, stop_words_set):
    names = file_name('keyfile')
    posts = [open(name, encoding='utf-8').read() for name in names]
    docs = []
    word_set = set()
    for post in posts:
        print('post', post)
        doc = del_stop_words(post, stop_words_set)
        docs.append(doc)
        word_set |= set(doc)
    # ... the rest (TF-IDF computation and writing tezheng.txt) is the same as above


if __name__ == '__main__':
    wenzhang = read_from_file('input.txt')
    wenzhang1 = stop_words('stopword.txt')
    wenzhang2 = del_stop_words(wenzhang, wenzhang1)
    wenzhang3 = get_all_vector('D:/Pycharm/项目存储/input/', wenzhang1)
    dataSet = loadDataSet('tezheng.txt')
    centroids, clusterAssment = kMeans(dataSet, 3, distMeas=distEclud, createCent=randCent)
    print("centroids:", centroids)
    print("clusterAssment :", clusterAssment)
    print("clusterAssmentlengh :", len(clusterAssment))
'''