特征模型2

1、word2vec模型

  为一款将词表征为实数值向量的工具,输出的词向量可以被用来计算词语相似度、找同义词等

  word2vec源码(gensim)

  测试代码

def simple_example():
    """Minimal Word2Vec workflow: build the vocab, train, look up a vector."""
    sentences = [['first', 'sentence'], ['second', 'sentence']]
    # min_count=1: with min_count=10 every word in this tiny corpus (max count 2)
    # would be discarded, leaving an empty vocabulary and a KeyError on lookup.
    model = gensim.models.Word2Vec(size=100, window=8, min_count=1, iter=10)
    model.build_vocab(sentences)
    model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
    # model['first'] item-access is deprecated; vectors live on model.wv.
    print(model.wv['first'])

  初始化

  def __init__(self, sentences=None, corpus_file=None, size=100, alpha=0.025, window=5, min_count=5,
               max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
               sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
               trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(),
               max_final_vocab=None):
      """Set up the keyed vectors, vocabulary and trainable weights, then delegate
      corpus scanning and training to the base-class initializer."""
      self.max_final_vocab = max_final_vocab
      self.callbacks = callbacks
      # Loading is only valid on the class itself, not on an instance.
      self.load = call_on_class_only

      # wv: stores the learned word vectors and the word -> index mapping.
      self.wv = Word2VecKeyedVectors(size)
      # vocabulary: scans the corpus and builds/filters the vocab
      # (min_count filtering, high-frequency downsampling via `sample`, ...).
      self.vocabulary = Word2VecVocab(
          max_vocab_size=max_vocab_size, min_count=min_count, sample=sample, sorted_vocab=bool(sorted_vocab),
          null_word=null_word, max_final_vocab=max_final_vocab, ns_exponent=ns_exponent)
      # trainables: owns the weight matrices that are updated during training.
      self.trainables = Word2VecTrainables(seed=seed, vector_size=size, hashfxn=hashfxn)

      # The base class drives vocab building and training when `sentences`
      # or `corpus_file` is given here; note `iter` is passed on as `epochs`.
      super(Word2Vec, self).__init__(
          sentences=sentences, corpus_file=corpus_file, workers=workers, vector_size=size, epochs=iter,
          callbacks=callbacks, batch_words=batch_words, trim_rule=trim_rule, sg=sg, alpha=alpha, window=window,
          seed=seed, hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha, compute_loss=compute_loss,
          fast_version=FAST_VERSION)
  关键属性:
    wv (Word2VecKeyedVectors): 保存训练得到的词向量和词到索引的映射(vocab / index2word),训练后查询向量都通过它
    vocabulary (Word2VecVocab): 主要用于扫描语料(scan_vocab)和准备/过滤词汇表(prepare_vocab)
    trainables (Word2VecTrainables): 保存并初始化训练所需的权重矩阵(prepare_weights 中初始化 wv.vectors 等隐层参数)
    sample: 对高频词进行随机降采样的阈值(默认 1e-3),频率高于该阈值的词会被按概率丢弃

  构建词汇表
def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_per=10000,
                keep_raw_vocab=False, trim_rule=None, **kwargs):
    """Build the model vocabulary from the corpus.

    sentences: iterable of token lists (a list of lists of str).
    corpus_file: path to a corpus file, used instead of `sentences`.
    update: if True, words not yet in the vocab are added to the existing one.
    progress_per: report scanning progress every this many lines.
    keep_raw_vocab: if True, do not discard the raw count dict after preparing.
    trim_rule: optional custom rule deciding whether a word is kept.
    """
    # Scan the corpus: fills self.vocabulary.raw_vocab with per-word counts
    # (e.g. defaultdict(int, {'first': 1, 'sentence': 2, 'second': 1})) and
    # returns the total word count and the number of sentences/lines.
    total_words, corpus_count = self.vocabulary.scan_vocab(
        sentences=sentences, corpus_file=corpus_file, progress_per=progress_per, trim_rule=trim_rule)
    self.corpus_count = corpus_count          # number of sentences/lines seen
    self.corpus_total_words = total_words     # total number of words in the corpus
    # prepare_vocab: 1) drops words with count < min_count from raw_vocab,
    # 2) populates wv.vocab (word -> Vocab(count, index)) and wv.index2word,
    # 3) sorts index2word by frequency and rewrites each Vocab.index to match.
    report_values = self.vocabulary.prepare_vocab(
        self.hs, self.negative, self.wv, update=update, keep_raw_vocab=keep_raw_vocab,
        trim_rule=trim_rule, **kwargs)
    # Estimate memory usage for the retained vocab, e.g.
    # {'vocab': ..., 'vectors': ..., 'syn1neg': ..., 'total': ...}.
    report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words'])
    # Allocate and randomly initialize the projection weights:
    # wv.vectors gets shape (unique word count, vector_size).
    self.trainables.prepare_weights(self.hs, self.negative, self.wv, update=update, vocabulary=self.vocabulary)
posted @ 2020-12-22 20:05  哈哈哈喽喽喽  阅读(171)  评论(0编辑  收藏  举报