# NLP --- word2vec的python实现 (a word2vec demo in Python, using gensim)

import logging
from gensim.models import word2vec
import multiprocessing

# 配置日志
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)


# Corpus: a 2-D list where each inner list is one review, already tokenized.
sentences = [['电池', '续航', '不错'], ['手机', '电池', '容量', '大']]

# Path the trained model is saved to / loaded from.
model_path = '../model/study_w2v'

# Train the word vectors (skip-gram, sg=1) and persist the model.
# NOTE: min_count must be 1 here. With the original min_count=3, every token
# in this tiny 2-sentence corpus falls below the threshold ('电池' appears
# only twice), build_vocab produces an empty vocabulary, and model.train()
# then raises a RuntimeError — the demo never runs.
# NOTE(review): this is the gensim 3.x API (`size=`); gensim 4.x renamed it
# to `vector_size` and removed `model.wv.vocab` — confirm the installed version.
model = word2vec.Word2Vec(size=128, min_count=1, window=5,
                          workers=multiprocessing.cpu_count(), sg=1)
model.build_vocab(sentences)
model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
model.save(model_path)

# Reload the persisted model from disk.
model = word2vec.Word2Vec.load(model_path)

# Walk the vocabulary: model.wv.vocab is a dict mapping each word to its
# record (frequency, index, ...).
for word, record in model.wv.vocab.items():
    print(word, record)

# Membership test: is this word in the vocabulary?
print('手机' in model.wv.vocab)

# Vocabulary size (number of distinct words kept).
print(len(model.wv.vocab))

# Number of sentences seen in the training corpus.
print(model.corpus_count)

# Dimensionality of each word vector.
print(model.wv.vector_size)

# Fetch a word's vector — guard with a membership check so a missing word
# does not raise a KeyError.
noun1 = '手机'
if noun1 in model.wv.vocab:
    print(model.wv[noun1])


# Cosine similarity between word pairs — the larger the value, the more
# similar the two words are.
noun2 = '电池'
noun3 = '电量'
noun4 = '续航'

for other in (noun1, noun3, noun4):
    print(model.wv.similarity(other, noun2))

# Distance between word pairs: 1 - similarity, so larger means LESS similar.
for other in (noun1, noun3, noun4):
    print(model.wv.distance(other, noun2))

# The top-N words closest to the given word.
print(model.wv.most_similar(noun1))

# The word that fits least well with the rest of the list.
print(model.wv.doesnt_match([noun1, noun2, noun3, noun4]))

# Incremental training: add new corpus data to the vocabulary (update=True)
# and continue training on top of the already-trained model.
new_sentences = [['手机', '拍照', '效果', '不错'], ['相机', '美颜', '效果', '好']]
model.build_vocab(sentences=new_sentences, update=True)
# Use model.epochs, consistent with the initial training call above;
# model.iter is the deprecated gensim 3.x alias for the same value.
model.train(sentences=new_sentences, total_examples=model.corpus_count,
            epochs=model.epochs)
model.save(model_path)
# posted @ 2020-11-10 17:42  凯旋.Lau  阅读(341)  评论(0)  (blog footer, kept as a comment)