K-means 不知 k 值时的自动无监督分类(基于余弦距离阈值的层次聚类实现)
代码:
# -*- coding:UTF-8 -*-
"""Cluster short texts without knowing the number of clusters in advance.

Documents are turned into TF-IDF vectors and merged bottom-up
(agglomerative hierarchical clustering, average linkage, cosine distance)
until the two closest clusters are at least ``dist`` apart.
"""
import math
import time

# Star import kept for backward compatibility with code relying on it;
# everything below goes through the explicit ``np`` alias instead.
from numpy import *
import numpy as np

try:
    # Tokenizer; nothing in this section calls it, so a missing install
    # must not prevent the clustering code from being used.
    import jieba as jb
except ImportError:
    jb = None


def wordsCount(dataSet):
    """Return the total number of tokens over all documents in *dataSet*."""
    wordsCnt = 0
    for document in dataSet:
        wordsCnt += len(document)
    return wordsCnt


def createVocabList(dataSet):
    """Return the distinct tokens of *dataSet* as a list.

    Tokens are listed in order of first appearance so that the columns of
    the resulting vectors are reproducible from run to run.
    """
    seen = {}
    for document in dataSet:
        for word in document:
            seen.setdefault(word, None)
    return list(seen)


def bagOfWords2Vec(vocabList, inputSet):
    """Return the bag-of-words count vector of *inputSet* over *vocabList*.

    Tokens missing from *vocabList* are reported on stdout and skipped.
    """
    # Map each word to its first position once instead of calling
    # ``list.index`` for every token.
    position = {}
    for idx, word in enumerate(vocabList):
        position.setdefault(word, idx)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in position:
            returnVec[position[word]] += 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec


def wordInFileCount(word, cutWordList):
    """Return how many documents of *cutWordList* contain *word*.

    Each document counts at most once, however often the word occurs in it.
    """
    fileCnt = 0
    for document in cutWordList:
        for token in document:
            if token == word:
                fileCnt += 1
                break  # one hit is enough for this document
    return fileCnt


def _documentFrequencies(dataSet):
    """Return a dict mapping each token to the number of documents using it."""
    docFreq = {}
    for document in dataSet:
        for word in set(document):
            docFreq[word] = docFreq.get(word, 0) + 1
    return docFreq


def calTFIDF(dataSet):
    """Return one TF-IDF vector (a list) per document of *dataSet*.

    Columns follow ``createVocabList(dataSet)``. The weight of a word is
    ``tf * log(N / (df + 1))`` where ``tf`` is its relative frequency in
    the document, ``N`` the number of documents and ``df`` the number of
    documents containing the word. Each vector is also printed, first as
    numbers and then as strings.
    """
    fileCnt = len(dataSet)  # number of documents
    vocabList = createVocabList(dataSet)
    docFreq = _documentFrequencies(dataSet)  # computed once for the corpus
    tfidfSet = []

    for line in dataSet:
        wordsBag = bagOfWords2Vec(vocabList, line)
        lineWordsCnt = len(line)  # every token of the line is in the vocab
        tfidfList = [0] * len(vocabList)
        for idx, wordCnt in enumerate(wordsBag):
            if wordCnt == 0:
                continue
            tf = float(wordCnt) / lineWordsCnt
            idf = math.log(float(fileCnt) / (docFreq[vocabList[idx]] + 1))
            tfidfList[idx] = tf * idf
        print(tfidfList)
        print(list(map(str, tfidfList)))
        tfidfSet.append(tfidfList)

    return tfidfSet


def gen_sim(A, B):
    """Return the cosine distance between vectors *A* and *B*, in [0, 1].

    The cosine (in [-1, 1]) is rescaled to a similarity in [0, 1] and then
    flipped, so that a smaller value means the vectors are closer. A zero
    vector has cosine 0 with everything and therefore distance 0.5.
    """
    a = np.asarray(A, dtype=float).ravel()
    b = np.asarray(B, dtype=float).ravel()
    num = float(np.dot(a, b))
    denum = float(np.linalg.norm(a) * np.linalg.norm(b))
    if denum == 0:
        denum = 1  # avoid dividing by zero; num is 0 in that case anyway
    cosn = num / denum
    sim = 0.5 + 0.5 * cosn
    return 1 - sim


def distAvg(dataSet1, dataSet2):
    """Return the average pairwise ``gen_sim`` distance between two clusters.

    Each argument holds one sample per row; a single 1-D vector is treated
    as a cluster of one sample.

    Raises:
        ValueError: if either cluster is empty.
    """
    rows1 = np.atleast_2d(np.asarray(dataSet1, dtype=float))
    rows2 = np.atleast_2d(np.asarray(dataSet2, dtype=float))
    if rows1.size == 0 or rows2.size == 0:
        raise ValueError("distAvg() needs two non-empty clusters")

    sumD = 0.0
    for row1 in rows1:
        for row2 in rows2:
            sumD += gen_sim(row1, row2)
    return sumD / (rows1.shape[0] * rows2.shape[0])


def findMin(M):
    """Return ``(i, j, distance)`` of the closest pair of distinct clusters.

    *M* is a square distance matrix. The first minimum in row-major order
    wins. When no off-diagonal entry is finite-smaller than infinity (for
    instance when fewer than two clusters exist) the result is
    ``(None, None, inf)``.
    """
    minDist = np.inf
    minI = None
    minJ = None
    m = np.shape(M)[0]
    for i in range(m):
        for j in range(m):
            if i != j and M[i, j] < minDist:
                minDist = M[i, j]
                minI = i
                minJ = j
    return minI, minJ, minDist


def hCluster(dataSet, k, dist, distMeas=distAvg):
    """Agglomerative clustering that stops at a distance threshold.

    Every sample starts as its own cluster; the two closest clusters are
    merged repeatedly while their distance is below *dist*.

    Args:
        dataSet: 2-D matrix/array with one sample per row.
        k: currently unused; kept so existing calls keep working.
        dist: merging stops once the closest pair is at least this far
            apart.
        distMeas: callable returning the distance between two clusters,
            each given as the rows of *dataSet* belonging to it.

    Returns:
        ``(clusterAssment, performMeasure)``: an ``(m, 1)`` matrix with the
        cluster index of every sample, and a matrix with one row
        ``[clusters left, merge distance, DBI, DI]`` per merge (DBI and DI
        are placeholders, always 0).
    """
    data = dataSet if isinstance(dataSet, np.ndarray) else np.asarray(
        dataSet, dtype=float)
    m = np.shape(data)[0]
    labels = np.arange(m)  # every sample starts in its own cluster
    performMeasure = []
    M = np.zeros((m, m))  # symmetric cluster-to-cluster distance matrix

    for i in range(m):
        for j in range(i + 1, m):
            M[i, j] = distMeas(data[[i], :], data[[j], :])
            M[j, i] = M[i, j]
        if i % 10 == 0:
            print(i)  # progress indicator

    q = m  # current number of clusters
    while q > 1:
        i, j, minDist = findMin(M)
        # Test the threshold BEFORE merging, otherwise one pair that is
        # too far apart would still be merged.
        if i is None or not minDist < dist:
            break
        if i > j:
            i, j = j, i  # keep the lower index so it survives renumbering

        labels[labels == j] = i  # merge cluster j into cluster i
        labels[labels > j] -= 1  # close the gap left by cluster j
        M = np.delete(np.delete(M, j, axis=0), j, axis=1)
        q -= 1

        # Only distances involving the merged cluster have changed.
        membersI = data[np.flatnonzero(labels == i), :]
        for l in range(q):
            if l == i:
                continue
            membersL = data[np.flatnonzero(labels == l), :]
            M[i, l] = distMeas(membersI, membersL)
            M[l, i] = M[i, l]

        DBI = 0  # placeholder: Davies-Bouldin index is not computed
        DI = 0  # placeholder: Dunn index is not computed
        performMeasure.append([q, minDist, DBI, DI])

        print(u'当前簇的个数是:', q)
        print(u'距离最小的两个簇是第%d个和第%d个,距离是%f,DBI值是%f,DI值是%f' % (
            i, j, minDist, DBI, DI))

    clusterAssment = np.asmatrix(labels.reshape(-1, 1).astype(float))
    return clusterAssment, np.asmatrix(performMeasure)


def saveResult(clusterAssment):
    """Print the cluster assignment, one sample per line, as strings."""
    listResult = clusterAssment.tolist()  # matrix -> nested list
    for row in listResult:
        print(list(map(str, row)))


if __name__ == '__main__':
    # NOTE: each entry is a plain string, so it is treated as a sequence of
    # characters; pass lists of tokens to cluster on whole words.
    a = ["实施", "效益", "节本", "10"]
    m = np.asmatrix(calTFIDF(a))
    clustAssing, performMeasure = hCluster(m, 0, 0.3)
    print(clustAssing)
    saveResult(clustAssing)
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 从 HTTP 原因短语缺失研究 HTTP/2 和 HTTP/3 的设计差异
· AI与.NET技术实操系列:向量存储与相似性搜索在 .NET 中的实现
· 基于Microsoft.Extensions.AI核心库实现RAG应用
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· 开发者必知的日志记录最佳实践
· TypeScript + Deepseek 打造卜卦网站:技术与玄学的结合
· Manus的开源复刻OpenManus初探
· AI 智能体引爆开源社区「GitHub 热点速览」
· C#/.NET/.NET Core技术前沿周刊 | 第 29 期(2025年3.1-3.9)
· 从HTTP原因短语缺失研究HTTP/2和HTTP/3的设计差异