Embedding and Word2Vec Usage
Embedding
The input to an Embedding layer is a 2D integer tensor of shape (samples, sequence_length), i.e. (number of samples, sequence length).
Shorter sequences should be padded with 0 and longer ones truncated, so that every input sequence has the same length.
The Embedding layer outputs a 3D floating-point tensor of shape (samples, sequence_length, embedding_dimensionality).
- First, tokenize the text, then convert the tokens into integer sequences
- Next, pad or truncate the sequences to a uniform length, and feed the fixed-length sequences into the Embedding layer (see the sketch below)
(Figure: the tokenize → serialize → pad/truncate → Embedding pipeline, shown both end to end and from the perspective of a single sample.)
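A minimal sketch of these shapes in PyTorch (the vocabulary size, embedding dimension, and index values below are illustrative, not from the original notes):

```python
import torch
from torch import nn

# Hypothetical sizes: vocabulary of 10000 words, 250-dimensional embeddings,
# index 0 reserved for padding
embedding = nn.Embedding(num_embeddings=10000, embedding_dim=250, padding_idx=0)

# Batch of 2 samples, each padded/truncated to sequence_length = 6
batch = torch.LongTensor([[5, 12, 7, 0, 0, 0],
                          [3, 9, 21, 14, 2, 8]])

out = embedding(batch)
print(out.shape)  # torch.Size([2, 6, 250]) -> (samples, sequence_length, embedding_dim)
```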
The gensim library provides a Word2Vec implementation; with just a few API calls we can conveniently train a Word2Vec model.
Implementing Word2Vec with gensim
Example code is shown below:
```python
import torch
from gensim.models import Word2Vec

# Path to the saved word2vec model
w2v_path = "model/w2v_all.model"

class Preprocess():
    def __init__(self, sentences, sen_len, w2v_path=w2v_path):
        self.w2v_path = w2v_path
        self.sentences = sentences
        self.sen_len = sen_len
        self.idx2word = []
        self.word2idx = {}
        self.embedding_matrix = []

    def get_w2v_model(self):
        # Load the word2vec model saved earlier
        self.embedding = Word2Vec.load(self.w2v_path)
        self.embedding_dim = self.embedding.vector_size

    def add_embedding(self, word):
        # Add `word` to the vocabulary and assign it a random vector
        vector = torch.empty(1, self.embedding_dim)  # uninitialized tensor of shape (1, embedding_dim)
        # Fill `vector` with values drawn from the uniform distribution U(a, b); by default a=0, b=1
        torch.nn.init.uniform_(vector)
        self.word2idx[word] = len(self.word2idx)  # each new word extends word2idx by 1
        self.idx2word.append(word)
        print("word:", word)
        # torch.cat concatenates tensors along dimension 0 (row-wise)
        self.embedding_matrix = torch.cat([self.embedding_matrix, vector], 0)
        print("embedding_matrix.shape", self.embedding_matrix.shape)

    def make_embedding(self, load=True):
        print("get embedding..")
        # Load the embedding model
        if load:
            print("loading the word2vec model")
            self.get_w2v_model()
        else:
            raise NotImplementedError
        # Build a word2idx dict, an idx2word list, and a word-to-vector matrix
        for i, word in enumerate(self.embedding.wv.key_to_index):
            print('get words #{}'.format(i + 1), end='\r')
            # e.g. self.word2idx['李'] = 1
            #      self.idx2word[1] = '李'
            #      self.embedding_matrix[1] = vector for '李'
            self.word2idx[word] = len(self.word2idx)
            self.idx2word.append(word)
            self.embedding_matrix.append(self.embedding.wv[word])
        # Convert embedding_matrix to a tensor
        self.embedding_matrix = torch.tensor(self.embedding_matrix)
        # Add <PAD> and <UNK> to the embedding
        self.add_embedding("<PAD>")
        self.add_embedding("<UNK>")
        print("total words: {}".format(len(self.embedding_matrix)))
        return self.embedding_matrix

    def pad_sequence(self, sentence):
        # Pad or truncate every sentence to the same length
        if len(sentence) > self.sen_len:
            sentence = sentence[:self.sen_len]  # truncate
        else:
            pad_len = self.sen_len - len(sentence)
            for _ in range(pad_len):
                sentence.append(self.word2idx["<PAD>"])
        assert len(sentence) == self.sen_len
        return sentence

    def sentence_word2idx(self):
        # Convert the words in each sentence to their indices
        sentence_list = []
        for i, sen in enumerate(self.sentences):
            print('sentence count #{}'.format(i + 1), end='\r')
            sentence_idx = []
            for word in sen:
                if word in self.word2idx.keys():
                    sentence_idx.append(self.word2idx[word])
                else:
                    sentence_idx.append(self.word2idx["<UNK>"])
            # Make every sentence the same length
            sentence_idx = self.pad_sequence(sentence_idx)
            sentence_list.append(sentence_idx)
        return torch.LongTensor(sentence_list)

    def labels_to_tensor(self, y):
        # Convert the labels to a tensor as well
        y = [int(label) for label in y]
        return torch.LongTensor(y)
```
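Note that `<PAD>` and `<UNK>` are appended after the pretrained vocabulary, so their indices are `len(vocab)` and `len(vocab) + 1`, and both start from random uniform vectors rather than trained embeddings.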
```python
from gensim.models import word2vec

def train_word2vec(x):
    # Train the word embeddings
    """
    The Embedding layer's input is a 2D integer tensor of shape
    (samples, sequence_length), i.e. (number of samples, sequence length).
    Its output is a 3D float tensor of shape
    (samples, sequence_length, embedding_dimensionality).
    """
    model = word2vec.Word2Vec(x, vector_size=250, window=5, min_count=5,
                              workers=12, epochs=10, sg=1)  # in gensim 4.x, `epochs` replaces the old `iter` argument
    return model
```
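A hypothetical end-to-end sketch of how the two pieces fit together (`train_x` and `sen_len=20` are assumptions, not from the original notes; the corpus must be large enough for words to survive the `min_count=5` threshold):

```python
from torch import nn

# `train_x` is assumed to be a tokenized corpus: a list of sentences,
# each a list of word strings.
model = train_word2vec(train_x)
model.save("model/w2v_all.model")        # the path Preprocess loads from

preprocess = Preprocess(train_x, sen_len=20)
embedding_matrix = preprocess.make_embedding(load=True)
data_x = preprocess.sentence_word2idx()  # LongTensor, shape (samples, sen_len)

# The matrix can then initialize a trainable PyTorch Embedding layer
embedding_layer = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
features = embedding_layer(data_x)       # (samples, sen_len, embedding_dim)
```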