Python Word2Vec训练和测试词向量
train_word2vec_model.py:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
import os
import sys
import multiprocessing
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
_USAGE = "usage: python %s <corpus.txt> <model_out> <vectors_out>"


def main(argv=None):
    """Train a skip-gram Word2Vec model and save it in two formats.

    Args:
        argv: Command-line style argument list; ``argv[0]`` is the program
            name, followed by the corpus path (one sentence per line), the
            output path for the native gensim model, and the output path
            for the plain-text word2vec-format vectors. Defaults to
            ``sys.argv``.

    Raises:
        SystemExit: With exit code 1 when fewer than three paths are given.
    """
    argv = sys.argv if argv is None else argv
    program = os.path.basename(argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", ' '.join(argv))

    # Check and process input arguments. This script has no module
    # docstring, so the usage text is spelled out here instead of being
    # formatted from __doc__ (which would be None and raise TypeError).
    if len(argv) < 4:
        print(_USAGE % program)
        sys.exit(1)
    inp, outp1, outp2 = argv[1:4]

    # window: skip-gram is usually run with a window near 10, CBOW near 5.
    # hs: 1 enables hierarchical softmax; 0 (the default) uses negative
    #     sampling instead.
    # NOTE: gensim >= 4.0 renamed the `size` keyword to `vector_size`.
    model = Word2Vec(
        LineSentence(inp),
        size=800,
        window=10,
        min_count=5,
        sg=1,
        hs=1,
        workers=multiprocessing.cpu_count(),
    )

    # Trimming unneeded model memory would use (much) less RAM, but is
    # left disabled here:
    # model.init_sims(replace=True)
    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False)


if __name__ == '__main__':
    main()
执行 "python train_word2vec_model.py v6_EN.txt v6_EN.model v6_EN.vector"即可训练词向量
train_word2vec_model.py为训练词向量的程序代码,v6_EN.txt是我训练的语料库的名称,v6_EN.model为我训练出来的词向量模型名称,v6_EN.vector为格式化保存词向量模型的文件(一般用不到,但执行语句必须包含这一项)
训练出的文件有五个:
中间3个.npy文件在load词向量模型时都必须和v6_EN.model放在同一文件夹下
In [1]: import gensim
In [2]: model = gensim.models.Word2Vec.load("v6_EN.model")
In [3]: result = model.most_similar("足球")
In [4]: for e in result:
   ...:     print e[0], e[1]
   ...:
联赛 0.65538161993
甲级 0.653042972088
篮球 0.596754670143
俱乐部 0.587228953838
乙级 0.58406317234
足球队 0.556015253067
亚足联 0.530800580978
allsvenskan 0.52497625351
代表队 0.521494746208
甲组 0.51778960228
test.py:
import gensim
import numpy as np
import xlwt
# Evaluation script: for each (English, French) pair in the test set, look up
# the English word vector, multiply it by the Thta matrix, and ask the French
# model for the 5 words nearest to the result. A pair counts as correct when
# the dictionary translation is among those 5. Results go to an .xls sheet.

# Embedding models and the arrays loaded from disk.
model_EN = gensim.models.Word2Vec.load("../v6_EN_SG/v6_EN_SG_800.model")
model_FR = gensim.models.Word2Vec.load("../v6_FR_SG/v6_FR_SG.model")
Thta = np.load("GT/ThtaEN-FR/Thta0.07/ThtaEN-FR0.07_7000.npy")
test = np.load("GT/test1000EN-FR.npy")

# Output workbook with a single result sheet.
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('Result')

# Font/style shared by the header row and the final accuracy label.
header_font = xlwt.Font()
header_font.name = '宋体'
header_font.height = 0x00E8
header_style = xlwt.XFStyle()
header_style.font = header_font

# (title, column width) for columns 0..3 of the header row.
header_columns = (
    ('英文测试单词', 3333),
    ('预测的法语译文', 4000),
    ('词典给出的法语译文', 4400),
    ('对错', 4400),
)
for col_idx, (title, col_width) in enumerate(header_columns):
    worksheet.write(0, col_idx, label=title, style=header_style)
    worksheet.col(col_idx).width = col_width

total = 1000   # number of test pairs evaluated
hits = 0.0     # kept as float so the accuracy division is floating-point

for num in range(total):
    row = num + 1  # row 0 holds the header
    word_EN = test[num][0]
    word_FR = test[num][1]

    # English vector as a (1, 800) row, multiplied by Thta and flattened to
    # the 200 values handed to the French model.
    source_vec = model_EN.wv[word_EN].reshape(1, 800)
    projected = np.dot(source_vec, Thta).reshape(200)

    candidates = model_FR.wv.similar_by_vector(projected, topn=5, restrict_vocab=None)
    print(candidates[0][0])

    worksheet.write(row, 0, label=word_EN)
    top_words = [candidates[k][0] for k in range(5)]
    # A list label is written as one cell; each word carries a trailing space.
    worksheet.write(row, 1, label=[w + ' ' for w in top_words])
    worksheet.write(row, 2, label=word_FR)

    if word_FR in top_words:
        worksheet.write(row, 3, label='✔️')
        hits += 1
    else:
        worksheet.write(row, 3, label='×')

    print('测试完成%d个单词' % row)

# Accuracy summary goes one row below the last result row.
summary_row = total + 1
accuracy = str(hits / total * 100) + '%'
worksheet.write(summary_row, 0, label='正确率', style=header_style)
worksheet.write(summary_row, 1, label=accuracy)
print(accuracy)
workbook.save('GT/test/testEN-FR/Thta0.07/EN-FR0.07@5_7000.xls')