词向量聚类实验

实验描述：

　　本实验的目的是对词向量进行聚类，并将其有效地表示出来。待表示的词是从一个大规模语料中人工抽取出来的，部分示例如下：

　　　　家居： 卫生间 灯饰 风格 颇具匠心 设计师 沙发 避风港 枕头 流连忘返 奢华
　　　　房产： 朝阳区 物业 房地产 区域 市场 别墅 廉租房 经适房 拆迁 华润置地

步骤1：

　　首先进行分词，然后利用gensim工具训练词向量。

##### 分词

import jieba

# Paths used by the segmentation step.
src = "cnews.train.txt"  # raw corpus that gets segmented
tgt = "train.txt"  # receives the space-joined tokens, one sentence per line

# Register the custom dictionary with jieba before any text is cut.
jieba.load_userdict("dic.txt")

def _cws(seq):
    """Segment ``seq`` with jieba and return the tokens joined by single spaces."""
    return ' '.join(jieba.cut(seq))

# Segment the second tab-separated column of every corpus line and write one
# segmented sentence per output line.  Both files are opened in a single
# ``with`` so they are closed even if segmentation fails part-way through.
with open(src, 'r', encoding='utf-8') as fl, \
        open(tgt, 'w', encoding='utf-8') as ft:
    for line in fl:
        fields = line.strip('\n').split('\t')
        if len(fields) < 2:
            # No text column on this line (for example a blank trailing
            # line); there is nothing to segment.
            continue
        ft.write(_cws(fields[1]) + '\n')

##### 训练词向量

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import logging
import os.path
import sys
import multiprocessing

from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec, doc2vec
from gensim.models.word2vec import LineSentence

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", ' '.join(sys.argv))

    inp = "baike_final.txt"    # corpus handed to LineSentence
    outp2 = "word_vector.src"  # vectors are written here in text format

    # Skip-gram (sg=1) with 20 negative samples, 100-dimensional vectors.
    # NOTE(review): `size` and `iter` are the gensim 3.x keyword names; newer
    # gensim releases call them `vector_size` and `epochs` -- confirm against
    # the installed version before running.
    model = Word2Vec(LineSentence(inp), size=100, window=5, min_count=1,
                     workers=multiprocessing.cpu_count(), sg=1, iter=5,
                     negative=20)

    model.wv.save_word2vec_format(outp2, binary=False)

步骤2：对单词进行可视化

import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import gensim
import matplotlib as mpl
import random

# Seed NumPy's and Python's global random generators with the same value.
np.random.seed(0)
random.seed(0)

# Font configuration for the plots: FangSong as the sans-serif face
# (presumably so the Chinese word labels render -- confirm the font is
# installed) and the unicode minus sign switched off.
mpl.rcParams.update({
    'font.sans-serif': ['FangSong'],
    'axes.unicode_minus': False,
})

def plot_with_labels(low_dim_embs, labels, filename, flag=None):
    """Scatter-plot 2-D embeddings, annotate every point and save the figure.

    Args:
        low_dim_embs: 2-D array; row ``i`` holds the (x, y) position of
            ``labels[i]``.
        labels: the text drawn next to each point.
        filename: path handed to ``plt.savefig``.
        flag: optional sequence of colours indexed like ``labels``.  When it
            is omitted matplotlib chooses the colours itself.

    Raises:
        AssertionError: if there are more labels than embedding rows.
    """
    assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
    print('绘制词向量中......')
    plt.figure(figsize=(10, 10))  # in inches

    # Optional: scale every embedding to unit length before plotting.
    # length = np.linalg.norm(low_dim_embs, axis=1)
    # low_dim_embs = low_dim_embs / length[:, np.newaxis]

    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        # Draw and annotate inside the loop so that every word is plotted,
        # not only the last one.
        color = flag[i] if flag is not None else None
        plt.scatter(x, y, color=color)
        plt.annotate(label, xy=(x, y), xytext=(5, 2),
                     textcoords='offset points', ha='right', va='bottom')
    plt.savefig(filename)
    plt.show()

def load_txt_vec(file, threshold=0, dtype='float'):
    """Read word vectors in word2vec text format from an open text file.

    The first line holds ``"<count> <dim>"``; each following line holds a
    word, one space, and then its space-separated vector components.

    Args:
        file: open text-mode file object positioned at the header line.
        threshold: when positive, read at most this many vectors.
        dtype: NumPy dtype of the returned matrix.

    Returns:
        ``(words, matrix)``: the list of words and an array of shape
        ``(len(words), dim)`` whose row ``i`` belongs to ``words[i]``.

    Raises:
        ValueError: if the header is malformed or the file ends before the
            requested number of vectors has been read.
    """
    print('读取词向量文件中......')
    header = file.readline().split()
    if len(header) < 2:
        raise ValueError('word-vector header must be "<count> <dim>"')
    total = int(header[0])
    count = total if threshold <= 0 else min(threshold, total)
    dim = int(header[1])
    words = []
    matrix = np.empty((count, dim), dtype=dtype)
    for i in range(count):
        line = file.readline()
        if not line.strip():
            # readline() returns '' at end of file: the header promised more
            # vectors than the file contains.
            raise ValueError(
                'expected %d vectors but the file ended after %d' % (count, i))
        word, vec = line.split(' ', 1)
        words.append(word)
        matrix[i] = np.fromstring(vec, sep=' ', dtype=dtype)
    return (words, matrix)

if __name__ == '__main__':
    try:
        # The file is only needed while parsing, so close it right after.
        with open('vec.txt', 'r', encoding='utf-8',
                  errors='surrogateescape') as w2v_txt_file:
            words, vectors = load_txt_vec(w2v_txt_file)

        # NOTE(review): recent scikit-learn releases rename `n_iter` to
        # `max_iter` -- confirm against the installed version.
        tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000,
                    method='exact')
        # Plot the first 100 words, or all of them if the file has fewer.
        plot_only = min(100, len(words))

        low_dim_embs = tsne.fit_transform(vectors[:plot_only])
        labels = words[:plot_only]
        colors = ["red", "blue", "yellow", "green", "black", "purple", "pink", "grey", "brown", "orange"]
        # sorted(colors * 10) yields ten consecutive entries per colour, so
        # each run of ten words in the file shares one colour.
        plot_with_labels(low_dim_embs, labels, '1w2v.png', flag=sorted(colors * 10))

    except ImportError as ex:
        print('Please install gensim, sklearn, numpy, matplotlib, and scipy to show embeddings.')
        print(ex)

其显示结果如图所示：

归一化之后的结果为：

在进行词表示的时候，取模是指将每个词向量除以其模长，使向量的长度归一化为1；归一化之后再进行显示，考虑的就仅仅是词与词之间的语义关系，而不受向量长度的影响。

步骤3：对词进行聚类

import json
from sklearn.cluster import KMeans
from sklearn import metrics
import numpy as np
import random

# NOTE(review): this seeds only Python's `random` module; it probably does not
# make the scikit-learn clustering reproducible -- confirm.
random.seed(1)

src = 'vec.json'
tgt = 'classification.json'

labels = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
flag = sorted(labels * 10)

# Every line of `src` is parsed as one JSON object; each value of that object
# is collected as a feature row.
features = []
with open(src, 'r', encoding='utf-8') as fl:
    for line in fl:
        record = json.loads(line.strip('\n'))
        features.extend(record.values())

def fun(features):
    """Convert rows of numeric strings into rows of floats.

    Args:
        features: iterable of rows, each an iterable of strings such as
            ``"-0.1384889"``.

    Returns:
        A list of lists with the same layout, holding the parsed floats.

    Raises:
        ValueError: if an entry is not a valid float literal.
    """
    # float() rather than eval(): the strings are read from a data file and
    # must never be executed as code.
    return [[float(s) for s in row] for row in features]

score = []
features = fun(features)
# random_state pins KMeans' own random initialisation so reruns give the same
# clusters; seeding Python's `random` module does not do that.
clf = KMeans(n_clusters=10, max_iter=10000, n_init=10, random_state=1)

pred = clf.fit_predict(features)
with open(src, 'r', encoding='utf-8') as fl, \
        open(tgt, 'w', encoding='utf-8') as ft:
    # Parse each input line once; records[e] belongs to prediction pred[e].
    records = [json.loads(line.strip('\n')) for line in fl]
    # Write the records grouped by cluster id, cluster 0 first.  Each output
    # line is the JSON list [<cluster id as string>, <original record>].
    for i in range(10):
        for e, cluster in enumerate(pred):
            if cluster == i:
                json_data = json.dumps([str(cluster), records[e]],
                                       ensure_ascii=False)
                ft.write(json_data + '\n')

可视化结果为：

对聚类结果的评测指标：

首先计算RI指数：

import json

src = 'classification.json'
tgt = 'tgt.json'

cmp = list(range(10))

with open(src, 'r', encoding='utf-8') as fl:
    # Each line is a JSON list [cluster_id, {word: vector}]; parse it once
    # instead of once per cluster.
    records = [json.loads(line.strip('\n')) for line in fl]

# dic maps every cluster id in `cmp` to the list of words assigned to it;
# ids without any word map to an empty list.
dic = {}
for i in cmp:
    temp = []
    for cluster_id, vectors in records:
        # int() rather than eval(): the id is read from a file.
        if int(cluster_id) == i:
            temp.extend(vectors)  # iterating the dict yields its words
    dic[i] = temp

with open(tgt, 'w', encoding='utf-8') as ft:
    json_data = json.dumps(dic, ensure_ascii=False)
    ft.write(json_data + '\n')
print(dic)

将数据处理成这种可以查询的形式：
　　{0: ['朝阳区', '物业', '区域', '别墅', '廉租房', '经适房', '拆迁', '开盘'],
 　　1: ['风衣', '搭配', '超短裙', '西装', '打底裤', '传感器', '高跟鞋'],
　　 2: ['留学生', '高校', '记者', '媒体', '国防部', '新华网', '卫生部', '探测器', '宇航员', '空间站'], 
　　3: ['电视剧', '导演', '舞台剧', '戛纳', '好莱坞', '微积分', '博士', '哈佛', '耶鲁', '新东方', '论文']}

src1 = 'ss.txt'
lb_dic = {}

# Each line of `src1` is split on single spaces: the first field is the
# category label and every remaining field is a word of that category.
# lb_dic maps word -> category label.
with open(src1, 'r', encoding='utf-8') as fl:
    for line in fl:
        fields = line.strip('\n').split(' ')
        for word in fields[1:]:
            lb_dic[word] = fields[0]

首先计算TP的值：

def _tp_judge(ss):
　　TP = 0
　　for i in range(len(ss)):
　　　　for j in range(i,len(ss)):

　　　　　　if lb_dic[ss[i]] == lb_dic[ss[j]]:
　　　　　　TP+=1
　　　return TP

# Total TP: the same-label pair counts of all clusters added together.
tp_num = sum(_tp_judge(cluster_words) for cluster_words in dic.values())

计算FP的值：

def _fp_judge(ss):
　　FP = 0
　　for i in range(len(ss)):
　　　　for j in range(i,len(ss)):

　　　　　　if lb_dic[ss[i]] != lb_dic[ss[j]]:
　　　　　　　　FP+=1
　　print(FP)
　　return FP

# Total FP: the different-label pair counts of all clusters added together.
fp_num = sum(_fp_judge(cluster_words) for cluster_words in dic.values())

计算FN的值：

cmp = list(range(10))

# NOTE(review): `src` is not reassigned in this section; it is presumably
# still the clustering output set earlier -- confirm.
with open(src, 'r', encoding='utf-8') as fl:
    # Each line is a JSON list [cluster_id, {word: vector}]; parse it once
    # instead of once per cluster.
    records = [json.loads(line.strip('\n')) for line in fl]

# fn_dic maps word -> id of the cluster it was assigned to.
fn_dic = {}
for i in cmp:
    for cluster_id, vectors in records:
        # int() rather than eval(): the id is read from a file.
        if int(cluster_id) == i:
            for word in vectors:
                fn_dic[word] = i
print(fn_dic)

将数据处理成如下形式：

{'朝阳区': 0, '物业': 0, '区域': 0, '别墅': 0, '廉租房': 0,
 '经适房': 0, '拆迁': 0, '开盘': 0, '风衣': 1, '搭配': 1, '超短裙': 1, 
'西装': 1, '打底裤': 1, '传感器': 1, '高跟鞋': 1, '留学生': 2, '高校': 2, 
'记者': 2, '媒体': 2, '国防部': 2, '新华网': 2, '卫生部': 2, '探测器': 2,
 '宇航员': 2, '空间站': 2, '电视剧': 3, '导演': 3, '舞台剧': 3, '戛纳': 3}

src1 = 'ss.txt'
fn_lb_dic = {}

# Each line of `src1` is split on single spaces: the first field is the
# category label, the remaining fields are its words.
# fn_lb_dic maps category label -> list of words.
with open(src1, 'r', encoding='utf-8') as fl:
    for line in fl:
        fields = line.strip('\n').split(' ')
        fn_lb_dic[fields[0]] = fields[1:]
print(fn_lb_dic)

将数据处理成如下形式：

{'体育：': ['高尔夫', '奥运会', '振奋人心', '遥遥领先', '绝杀', '篮板', '运动员', '火箭', '湖人', '艾弗森'],
 '娱乐：': ['爱心大使', '电视剧', '导演', '惊心动魄', '舞台剧', '百花奖', '戛纳', '众志成城', '票房', '好莱坞'],
 '家居：': ['卫生间', '灯饰', '风格', '颇具匠心', '设计师', '沙发', '避风港', '枕头', '流连忘返', '奢华']}

def _fn_judge(ss):
　　FN = 0
　　for i in range(len(ss)):
　　　　for j in range(i,len(ss)):
　　　　　　if fn_dic[ss[i]] != fn_dic[ss[j]]:
　　　　　　　　FN+=1
　　print(FN)
　　return FN

# Total FN: the split-pair counts of all gold categories added together.
fn_num = sum(_fn_judge(category_words) for category_words in fn_lb_dic.values())

计算RI的值（RI = (TP + TN) / (TP + FP + FN + TN)，其中分母为所有词对的总数）：

# Number of unordered pairs among 100 items: 100 * 99 / 2.
num = 100 * 99 / 2
print(num)
# Every pair that is not TP, FP or FN is counted as a true negative.
tn_num = num - fn_num - fp_num - tp_num

# Rand index: share of pairs counted as TP or TN.
RI = (tp_num + tn_num) / num
print(RI)

计算Purity

def _judge_label(param):
    pass

## Running total of the words that carry their cluster's majority label.
count = 0
## `dic` is the dictionary of clusters produced after clustering
## (cluster id -> list of words).
for ss in dic.items():
    cmp = ss[1]
    if not cmp:
        # An empty cluster has no majority label and adds nothing to the
        # total; indexing its (empty) ranking would raise IndexError.
        continue
    _dic = {}
    for ww in cmp:
        lb = lb_dic[ww]
        _dic[lb] = _dic.get(lb, 0) + 1
    # Gold labels of this cluster, most frequent first.
    ranked = sorted(_dic.items(), key=lambda item: item[1], reverse=True)
    print(ranked)
    count += ranked[0][1]
# The denominator is hard-coded to 100.
# NOTE(review): presumably the total number of clustered words -- confirm.
print(count/100)

注：ss.txt文件中的数据格式示例：

　　体育： 高尔夫 奥运会 振奋人心 遥遥领先 绝杀 篮板 运动员 火箭 湖人 艾弗森
　　娱乐： 爱心大使 电视剧 导演 惊心动魄 舞台剧 百花奖 戛纳 众志成城 票房 好莱坞

classification.json文件中的数据格式示例：

["0", {"朝阳区": ["-0.1384889", "-0.20243359", "0.102069065", "0.23184149", "-0.046311297", "0.14353138", "0.5543517", "0.29026467", "0.45963597", "-0.17217028", "-0.46954596", "0.31719774", "0.17761867", "0.29701278", "0.6938801", "-0.14563806", "-0.42253557", "0.19530635", "-0.04560669", "-0.26439634", "0.4578489", "-0.16404046", "-0.80666643", "-0.38364044", "-0.45906314", "-0.3663307", "0.45213446", "0.14570023", "0.65608627", "-0.07094579", "-0.08269144", "0.04528637", "-0.23720852", "-0.31091192", "0.14366734", "-0.14827731", "0.16437379", "-0.7352608", "0.05033749", "-0.9513891", "0.9870362", "-0.12723659", "-0.8449314", "-0.6920033", "-0.7977017", "0.7917335", "-0.16021581", "-0.36755514", "0.34619418", "-0.25134012", "0.2761363", "1.1556392", "0.29476222", "-0.41960797", "0.56152135", "-0.6300105", "0.1123949", "0.0011835459", "0.722344", "0.17237052", "-0.67563206", "-0.4982514", "0.95673597", "-0.25812265", "-0.46021804", "-0.2898547", "0.28434277", "0.39801842", "0.7448203", "0.12801662", "-0.34011438", "-0.24242696", "-0.3204678", "1.298795", "-0.4462184", "-0.060344703", "0.60382426", "-0.18342693", "-0.096174486", "-0.6355714", "-0.17171651", "0.10724717", "0.5793009", "-0.29091367", "0.13196644", "0.6097893", "-0.26113376", "-0.7714975", "0.4343755", "-0.41631258", "-0.08117525", "-0.41429466", "0.24792041", "0.46800923", "0.24307102", "-0.2998296", "-0.05571826", "0.033080667", "0.12935354", "-0.6335654"]}]
["0", {"物业": ["0.21649252", "0.37936735", "-0.27688688", "-0.16372944", "0.33023095", "0.1948151", "1.0928497", "0.17396215", "0.6366656", "-0.53761625", "0.15351042", "-0.6486163", "-0.28322187", "-0.124383435", "0.028312065", "-0.10628245", "-0.3067216", "-0.571357", "-0.3424585", "-0.17725593", "0.057400767", "-0.30003172", "-0.65997154", "0.09322061", "0.2592893", "-0.2973807", "0.09047743", "0.15993215", "0.31192315", "-0.22998796", "0.39004174", "0.7176821", "-0.19427699", "-0.6698311", "-0.88420814", "0.27445787", "0.33690578", "-0.7266952", "-0.34795922", "0.21389268", "0.6480503", "0.60902315", "-0.13365382", "-0.2836483", "-0.33640364", "0.040157873", "-0.69254637", "-0.07615123", "0.21742567", "-0.8085353", "0.3357688", "0.42005894", "0.019965837", "-0.6346853", "0.4982129", "-0.73431104", "-0.50731987", "0.38148776", "-0.4835083", "-0.17213771", "-0.25769734", "0.32378343", "0.6043039", "-0.3443045", "-0.43060115", "-0.32943606", "0.22752339", "0.14163403", "0.75646776", "-0.0654474", "0.24144651", "0.26645514", "-0.071194924", "0.38203463", "-0.08941716", "0.0788296", "0.19273318", "0.03026876", "-0.36909038", "-0.5176391", "-0.27326223", "-0.099675916", "-0.016073491", "-0.101594515", "0.2737673", "0.594221", "-0.35780343", "-0.5114833", "-0.2103526", "0.12272796", "-0.3582389", "-0.31339905", "-0.048890725", "0.37520337", "0.20995434", "0.27567548", "-0.3183832", "-0.13981616", "0.55548304", "-0.8013473"]}]

posted @ 2020-07-05 20:43 胡~萝~卜阅读(2273) 评论(1) 编辑收藏举报

刷新页面返回顶部

Dylan~Han

词向量聚类实验

实验描述：

步骤1：

步骤2：对单词进行可视化

步骤3：对词进行聚类

公告