gensim ——训练word2vec词向量的使用方法。
# -*- coding: utf-8 -*-
"""Train word2vec embeddings with gensim on a pre-segmented corpus.

Saves both the full model and the vectors in word2vec text format.
"""
import os
import time
import sys

# Python 2 only: force the default str<->unicode codec to UTF-8.
# `reload` and `sys.setdefaultencoding` do not exist on Python 3, so the
# hack is guarded to keep the script importable on both major versions.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding('utf-8')

from gensim.models import word2vec


def main():
    """Train a skip-gram word2vec model and write it to disk.

    Reads the segmented corpus ``sogou_seg.txt`` from the current
    directory, then saves the full model and the word2vec-format text
    vectors under ``./sogou_word2vec/min_count-1/``.
    """
    # Path to the raw corpus; it must already be word-segmented.
    input_file = u"sogou_seg.txt"
    model_path = "./sogou_word2vec/min_count-1/sogou_word.model"
    vectors_path = "./sogou_word2vec/min_count-1/sogou.wor2vec.txt"

    # Create the output directory BEFORE training: otherwise a missing
    # directory is only discovered when save() fails after the whole
    # (potentially very long) training run has finished.
    output_dir = os.path.dirname(model_path)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    sentences = word2vec.Text8Corpus(input_file)

    # Training: sg=1 selects skip-gram; hs=1 enables hierarchical softmax
    # and negative=3 additionally enables negative sampling.
    # NOTE(review): gensim >= 4.0 renamed `size` to `vector_size`; this call
    # targets the older API -- confirm against the installed gensim version.
    model = word2vec.Word2Vec(
        sentences,
        sg=1,
        size=100,
        window=5,
        min_count=1,
        negative=3,
        sample=0.001,
        hs=1,
        workers=40,
    )

    # Save the full model and the plain-text word vectors.
    model.save(model_path)
    model.wv.save_word2vec_format(vectors_path)


if __name__ == "__main__":
    main()
    print("Done!")
load 的时候只需要
# Load the full Word2Vec model from the path that model.save() wrote to.
model = word2vec.Word2Vec.load("./sogou_word2vec/min_count-1/sogou_word.model")
或者
# Load the vectors exported in word2vec text format by
# save_word2vec_format(). The bare name `gensim` is never imported in this
# file, so KeyedVectors is imported explicitly here.
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format("./sogou_word2vec/min_count-1/sogou.wor2vec.txt")