# Gensim word2vec 模型使用示例
# coding=utf-8
"""Minimal gensim word2vec example: train, save, reload and query a model.

File: word2vec_gensim.py
Software: PyCharm

All vector queries go through ``model.wv`` (the KeyedVectors object). The
model-level shortcuts (``model.similarity``, ``model.most_similar``,
``model.__contains__``) and ``wv.index2word`` were removed/renamed in
gensim 4.0, so the ``wv`` API is used consistently here.
"""
import logging

from gensim.models import word2vec

logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s', level=logging.INFO)

raw_sentences = ['the quick brown fox jumps over ther lazy dogs', 'yoyoyo you go home now to sleep']
sentences = [s.split() for s in raw_sentences]
print(sentences)
# out: [['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'ther', 'lazy', 'dogs'],
#       ['yoyoyo', 'you', 'go', 'home', 'now', 'to', 'sleep']]

# The training input is a list of tokenized documents, one token list per document.
# min_count=1 keeps every word, which is needed for such a tiny corpus.
model = word2vec.Word2Vec(sentences, min_count=1)

# Two ways to persist: only the word vectors (KeyedVectors) ...
model.wv.save('m2.mdl')
# ... or the full model.
model.save('m1.mdl')

# Load the full model back and use it.
md = word2vec.Word2Vec.load('m1.mdl')

# similarity compares two single words.
print(md.wv.similarity('dogs', 'you'))
# example out: -0.06432766 (the exact value depends on training)

# n_similarity compares two sets of words (e.g. two documents).
print(md.wv.n_similarity(['fox', 'dogs'], ['dogs', 'fox']))
# out: 1.0

# most_similar finds the words with the highest similarity to the given word.
word = 'dogs'
# Only query when the word is in the model vocabulary; `in model.wv` is a
# direct vocabulary lookup instead of scanning the word list.
if word in model.wv:
    print(model.wv.most_similar('dogs'))
else:
    print(word + ' is not in model')

# Print every word in the vocabulary (index_to_key replaced index2word in gensim 4.0).
print(model.wv.index_to_key)

if word in model.wv:
    print(word + " is in model")
# ref: https://blog.csdn.net/luoluonuoyasuolong/article/details/107810578