Python Word2Vec训练和测试词向量

train_word2vec_model.py:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
 
import logging
import os
import sys
import multiprocessing
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
 
if __name__ == '__main__':
    # Train a skip-gram Word2Vec model on a one-sentence-per-line corpus and
    # save it both as a native gensim model and as a plain-text vector file.
    #
    # Command line: <corpus file> <model output path> <vector output path>
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", ' '.join(sys.argv))

    # Check and process input arguments.  This script has no module docstring,
    # so formatting __doc__ (None) would raise TypeError; print an explicit
    # usage line instead.
    if len(sys.argv) < 4:
        print("usage: python %s <corpus.txt> <model_out> <vectors_out>" % program)
        sys.exit(1)
    inp, outp1, outp2 = sys.argv[1:4]

    # window: usually around 10 for skip-gram and around 5 for CBOW.
    # hs: if 1, hierarchical softmax is used; if 0 (the default), negative
    #     sampling is used.
    # NOTE(review): gensim >= 4.0 renamed `size` to `vector_size` - confirm
    # the installed gensim version before changing this keyword.
    model = Word2Vec(LineSentence(inp), size=800, window=10, min_count=5,
                     sg=1, hs=1, workers=multiprocessing.cpu_count())

    # Optional: model.init_sims(replace=True) trims unneeded model memory
    # (uses much less RAM); left disabled here.
    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False)

执行 "python train_word2vec_model.py v6_EN.txt v6_EN.model v6_EN.vector"即可训练词向量

其中,train_word2vec_model.py 是训练词向量的程序;v6_EN.txt 是训练所用的语料库文件;v6_EN.model 是训练得到的词向量模型文件;v6_EN.vector 是以文本(word2vec)格式保存的词向量文件(一般用不到,但脚本要求必须提供三个参数,所以执行语句中必须包含这一项)。

训练出的文件有五个:

中间3个.npy文件在load词向量模型时都必须和v6_EN.model放在同一文件夹下

In [1]: import gensim
 
In [2]: model = gensim.models.Word2Vec.load("v6_EN.model")
 
In [3]: result = model.most_similar("足球")
 
In [4]: for e in result:
    print e[0], e[1]
   ....:     
联赛 0.65538161993
甲级 0.653042972088
篮球 0.596754670143
俱乐部 0.587228953838
乙级 0.58406317234
足球队 0.556015253067
亚足联 0.530800580978
allsvenskan 0.52497625351
代表队 0.521494746208
甲组 0.51778960228

 test.py:

import gensim
import numpy as np
import xlwt
# Evaluate a linear mapping (Thta) from the English embedding space to the
# French one: for each (English word, reference French word) test pair, map the
# English vector through Thta, look up the nearest French words, and count the
# pair as correct when the reference is among the top-N candidates.  Per-word
# results and the overall accuracy are written to an .xls workbook.

MAX_SAMPLES = 1000  # evaluate at most this many test pairs
TOP_N = 5           # precision@N: number of nearest French candidates considered

model_EN = gensim.models.Word2Vec.load("../v6_EN_SG/v6_EN_SG_800.model")
model_FR = gensim.models.Word2Vec.load("../v6_FR_SG/v6_FR_SG.model")
# assumes Thta has shape (English dim, French dim) - TODO confirm
Thta = np.load("GT/ThtaEN-FR/Thta0.07/ThtaEN-FR0.07_7000.npy")
# each row is read as (English word, reference French translation)
test = np.load("GT/test1000EN-FR.npy")

workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('Result')

# Font/style used for the header row and the accuracy label.
font1 = xlwt.Font()
font1.height = 0x00E8
font1.name = '宋体'
style1 = xlwt.XFStyle()
style1.font = font1

# Header row: (column title, column width).
headers = (
    ('英文测试单词', 3333),
    ('预测的法语译文', 4000),
    ('词典给出的法语译文', 4400),
    ('对错', 4400),
)
for col, (title, width) in enumerate(headers):
    worksheet.write(0, col, label=title, style=style1)
    worksheet.col(col).width = width

# Never index past the end of the test set, even if it has fewer rows than
# MAX_SAMPLES.
num = min(MAX_SAMPLES, len(test))
true_Word = 0.0
for idx in range(num):
    word_EN = test[idx][0]
    word_FR = test[idx][1]
    row = idx + 1  # row 0 holds the headers

    # Project the English vector into the French space.  reshape(1, -1) /
    # reshape(-1) avoid hard-coding the embedding sizes and do not mutate the
    # array's shape in place.
    vec_Test = model_EN.wv[word_EN].reshape(1, -1)
    b = np.dot(vec_Test, Thta).reshape(-1)

    e = model_FR.wv.similar_by_vector(b, topn=TOP_N, restrict_vocab=None)
    candidates = [word for word, _ in e]
    print(candidates[0])

    worksheet.write(row, 0, label=word_EN)
    worksheet.write(row, 1, label=[word + '  ' for word in candidates])
    worksheet.write(row, 2, label=word_FR)
    if word_FR in candidates:
        worksheet.write(row, 3, label='✔️')
        true_Word += 1
    else:
        worksheet.write(row, 3, label='×')
    print('测试完成%d个单词' % row)

# Summary row directly below the last result; guard against an empty test set.
accuracy = true_Word / num * 100 if num else 0.0
worksheet.write(num + 1, 0, label='正确率', style=style1)
worksheet.write(num + 1, 1, label=str(accuracy) + '%')
print(str(accuracy) + '%')
workbook.save('GT/test/testEN-FR/Thta0.07/EN-FR0.07@5_7000.xls')

 

posted @ 2018-03-21 10:02  谢育欣  阅读(8774)  评论(0)  编辑  收藏  举报