K-means 基本流程 Demo

也是单纯搬个砖, 记个笔记. K-Means 最近是有在用的, 当然之前也有用过, 也都是调包来弄的, 已经很少会去自己写了. 这里的目的, 也是为了自己, 后面再遇到可以直接复制粘贴.
对, 情况就是这样, 搬砖达人就是我, 非常稳, 尽量不生成不合规的代码, 直接站在前人的基础来修改即可, 哈哈, 工作常态也是, 我感觉.

import sys
import re

import gensim
import numpy as np
import my_jieba
from gensim.models import word2vec
from sklearn.cluster import KMeans

TaggededDocument = gensim.models.doc2vec.TaggedDocument


def get_datasest():
    """Read ``source.txt`` and return one tagged document per line.

    Each line is tokenised with ``my_jieba.cut`` (presumably a jieba-style
    word segmenter -- confirm against that module) and wrapped in a
    ``TaggededDocument`` whose single tag is the line index. That is the
    input shape ``train`` hands to Doc2Vec and the ``(words, tags)`` pair
    that ``cluster`` unpacks.

    Returns:
        list: ``TaggededDocument`` objects in file order.
    """
    with open("source.txt", 'r') as f:
        content = f.readlines()  # one list entry per line of the file
        print(len(content))

    x_train = []
    for i, text in enumerate(content):
        # Strip the line terminator so it does not end up as a token, and
        # materialise the tokens: the result may be a one-shot generator,
        # while the documents are iterated several times downstream.
        words = list(my_jieba.cut(text.strip()))
        x_train.append(TaggededDocument(words, tags=[i]))

    return x_train


def train(x_train, size=200, epoch_num=1):
    """Train a Doc2Vec model on ``x_train`` and save it to ``w2v_02.model``.

    Args:
        x_train: Tagged documents in the form gensim's ``Doc2Vec`` accepts.
        size: Dimensionality of the learned document vectors.
        epoch_num: Currently unused -- the explicit training pass always
            runs 100 epochs. Kept so existing callers keep working.

    Returns:
        The trained ``Doc2Vec`` model.
    """
    # Doc2Vec is not imported by name in this file, so reach it through the
    # already-imported ``gensim`` package instead of a bare (undefined) name.
    doc2vec_cls = gensim.models.doc2vec.Doc2Vec

    # ``vector_size`` replaces the ``size`` keyword, which gensim 4 removed.
    model_dm = doc2vec_cls(
        x_train,
        min_count=1,
        window=3,
        vector_size=size,
        sample=1e-3,
        negative=5,
        workers=4,
    )
    model_dm.train(x_train, total_examples=model_dm.corpus_count, epochs=100)
    model_dm.save('w2v_02.model')

    return model_dm


def cluster(x_train, model_path="w2v_02.model", n_clusters=15, sample_size=100):
    """Cluster the documents with K-Means on their inferred Doc2Vec vectors.

    Loads a saved Doc2Vec model, infers one vector per document, fits a
    ``KMeans`` model on all vectors, and writes the predicted cluster label
    of the first ``sample_size`` documents to ``out/own_claasify.txt`` as
    ``<concatenated words>\\t<label>`` lines.

    Args:
        x_train: Sequence of ``(words, tags)`` documents.
        model_path: Saved Doc2Vec model to load. Defaults to the file that
            ``train`` writes.
        n_clusters: Number of K-Means clusters.
        sample_size: Maximum number of leading documents whose labels are
            written to the output file.

    Returns:
        The fitted K-Means cluster centres (``cluster_centers_``).
    """
    import os

    print("load doc2vec model...")
    # Doc2Vec is not imported by name in this file; go through ``gensim``.
    model_dm = gensim.models.doc2vec.Doc2Vec.load(model_path)

    print("load train vectors...")
    infered_vectors_list = [
        model_dm.infer_vector(text) for text, _label in x_train
    ]

    print("train kmean model...")
    kmean_model = KMeans(n_clusters=n_clusters)
    kmean_model.fit(infered_vectors_list)
    # Clamp to the corpus size so a small corpus does not raise IndexError.
    sample_count = min(sample_size, len(infered_vectors_list))
    labels = kmean_model.predict(infered_vectors_list[:sample_count])
    cluster_centers = kmean_model.cluster_centers_

    # open() does not create missing parent directories.
    os.makedirs("out", exist_ok=True)
    with open("out/own_claasify.txt", 'w') as wf:
        # zip stops at the shorter ``labels``, i.e. after ``sample_count`` docs.
        for document, label in zip(x_train, labels):
            wf.write("".join(document[0]) + '\t' + str(label) + '\n')

    return cluster_centers


if __name__ == "__main__":
    # End-to-end demo: build the corpus, train and save the document-vector
    # model, then cluster the same corpus.
    x_train = get_datasest()
    model_dm = train(x_train)
    cluster_centers = cluster(x_train)
posted @ 2020-06-17 23:47  致于数据科学家的小陈  阅读(527)  评论(0)  编辑  收藏  举报