Feature models 2
1. The word2vec model
word2vec is a tool that represents vocabulary words as real-valued vectors; the resulting word vectors can be used for word similarity analysis, finding synonyms, and similar tasks.
word2vec source code (gensim)
Test code
import gensim

def simple_example():
    sentences = [['first', 'sentence'], ['second', 'sentence']]
    # min_count=1: with min_count=10 every word in this tiny corpus would be
    # filtered out and the lookup below would raise a KeyError.
    model = gensim.models.Word2Vec(size=100, window=8, min_count=1, iter=10)
    model.build_vocab(sentences)
    model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
    print(model.wv['first'])  # the learned 100-dimensional vector for 'first'
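As a quick check of the "finding synonyms" claim above, the trained vectors can be queried through the standard KeyedVectors API; a minimal sketch (results on this toy corpus depend on the random initialization):

    sims = model.wv.most_similar('first', topn=2)  # nearest neighbours by cosine similarity
    print(sims)  # e.g. [('second', ...), ('sentence', ...)]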
Initialization
def __init__(self, sentences=None, corpus_file=None, size=100, alpha=0.025, window=5, min_count=5,
             max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
             sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
             trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(),
             max_final_vocab=None):
    self.max_final_vocab = max_final_vocab
    self.callbacks = callbacks
    self.load = call_on_class_only
    self.wv = Word2VecKeyedVectors(size)  # keyed vectors: holds the learned embeddings
    self.vocabulary = Word2VecVocab(      # vocabulary scanning/filtering logic
        max_vocab_size=max_vocab_size, min_count=min_count, sample=sample, sorted_vocab=bool(sorted_vocab),
        null_word=null_word, max_final_vocab=max_final_vocab, ns_exponent=ns_exponent)
    self.trainables = Word2VecTrainables(seed=seed, vector_size=size, hashfxn=hashfxn)  # trainable weights
    super(Word2Vec, self).__init__(
        sentences=sentences, corpus_file=corpus_file, workers=workers, vector_size=size, epochs=iter,
        callbacks=callbacks, batch_words=batch_words, trim_rule=trim_rule, sg=sg, alpha=alpha, window=window,
        seed=seed, hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha, compute_loss=compute_loss,
        fast_version=FAST_VERSION)
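For orientation, a minimal sketch of how the main switches in this signature combine (gensim 3.x API; the parameter values here are illustrative, not from the source):

    import gensim

    sentences = [['first', 'sentence'], ['second', 'sentence']]
    # Skip-gram (sg=1) with negative sampling: 5 noise words per positive pair, no hierarchical softmax (hs=0).
    sg_model = gensim.models.Word2Vec(sentences, size=100, sg=1, hs=0, negative=5, min_count=1)
    # CBOW (sg=0, the default), averaging the context vectors (cbow_mean=1).
    cbow_model = gensim.models.Word2Vec(sentences, size=100, sg=0, cbow_mean=1, min_count=1)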
Key attributes:
wv (Word2VecKeyedVectors): holds the learned word vectors plus the vocab and index2word mappings; lookups such as model.wv['first'] go through it.
vocabulary (Word2VecVocab): scans and prepares the vocabulary (scan_vocab / prepare_vocab below).
trainables (Word2VecTrainables): holds and initializes the trainable weight matrices (see prepare_weights below).
sample: threshold for randomly downsampling high-frequency words; see the sketch below.
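A minimal sketch of the keep-probability behind that downsampling, following the rule from the original word2vec C code, which gensim mirrors in prepare_vocab (the helper name here is hypothetical, and it assumes sample < 1 so the value is treated as a frequency fraction):

    from math import sqrt

    def keep_probability(word_count, total_retained_words, sample=1e-3):
        # Probability of keeping one occurrence of a word during training.
        # Frequent words (count >> threshold) get probabilities well below 1,
        # so they are randomly skipped; rare words are always kept.
        threshold_count = sample * total_retained_words
        p = (sqrt(word_count / threshold_count) + 1) * (threshold_count / word_count)
        return min(1.0, p)

    # e.g. a word making up 10% of a 1M-token corpus is kept ~11% of the time:
    print(keep_probability(100000, 1000000))  # ~0.11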
Building the vocabulary
def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_per=10000,
                keep_raw_vocab=False, trim_rule=None, **kwargs):
    # sentences: a list of token lists; corpus_file can be passed instead of sentences.
    # update: if True, unseen words are added to the existing vocabulary instead of rebuilding it.
    # progress_per: log progress every N sentences.
    total_words, corpus_count = self.vocabulary.scan_vocab(
        sentences=sentences, corpus_file=corpus_file, progress_per=progress_per, trim_rule=trim_rule)
    # scan_vocab fills self.vocabulary.raw_vocab with raw counts, e.g.
    # defaultdict(<class 'int'>, {'first': 1, 'sentence': 2, 'second': 1}); it returns
    # total_words = total token count (4 for the test corpus) and corpus_count = number of
    # sentences/samples (2).
    self.corpus_count = corpus_count        # number of sentences (2)
    self.corpus_total_words = total_words   # total token count (4)
    report_values = self.vocabulary.prepare_vocab(
        self.hs, self.negative, self.wv, update=update, keep_raw_vocab=keep_raw_vocab,
        trim_rule=trim_rule, **kwargs)
    # prepare_vocab: 1. drops words whose raw count falls below min_count; 2. builds wv.vocab
    # from raw_vocab ({'first': <Vocab>, 'sentence': <Vocab>, 'second': <Vocab>}, each Vocab
    # holding a count and an index, e.g. Vocab(count:2, index:0) for 'sentence' after sorting)
    # and wv.index2word (['first', 'sentence', 'second']); 3. sorts wv.index2word by descending
    # count and rewrites each Vocab.index in wv.vocab to match the sorted order.
    report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words'])
    # estimate_memory reports the expected footprint, e.g.
    # {'vocab': 1500, 'vectors': 1200, 'syn1neg': 1200, 'total': 3900}.
    self.trainables.prepare_weights(self.hs, self.negative, self.wv, update=update, vocabulary=self.vocabulary)
    # prepare_weights: 1. allocates wv.vectors with shape (vocab size, size), i.e. 3 x 100 here,
    # the hidden-layer parameters; 2. fills them with random initial values.
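To see these pieces concretely, a small sketch that runs build_vocab on the test corpus and inspects the state described above (gensim 3.x; the printed values assume a min_count=1 run like the one the notes trace):

    import gensim

    sentences = [['first', 'sentence'], ['second', 'sentence']]
    model = gensim.models.Word2Vec(size=100, min_count=1, iter=10)
    model.build_vocab(sentences)

    print(model.corpus_count)                # 2  (number of sentences)
    print(model.corpus_total_words)          # 4  (total token count)
    print(model.wv.index2word)               # ['sentence', 'first', 'second'] (sorted by count; ties keep scan order)
    print(model.wv.vocab['sentence'].count)  # 2
    print(model.wv.vectors.shape)            # (3, 100): vocab size x vector size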