Homework: Augmenting an LSTM Part-of-Speech Tagger with Character-Level Features in PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_sequence, pack_sequence

inputs = [("LIHUA went to The TsinghUA University".split(), "N V V N N N".split()),
          ("Liping went to technical school".split(), "N V V N N".split()),
          ("I work in the mall".split(), "N V V N N".split()),
          ("we both have bright future".split(), "N N V ADJ N".split())]
character_to_idx = {}
word_to_idx = {}
words_characters = {}
for sen in inputs:
    for word in sen[0]:
        if word not in word_to_idx:
            words_characters[len(word_to_idx)] = list(word)  # word index -> the word's character list
            word_to_idx[word] = len(word_to_idx)
        for c in word:
            if c not in character_to_idx:  # assign each character an index only once
                character_to_idx[c] = len(character_to_idx)
vocab = len(word_to_idx)                # take the sizes only after the dictionaries are filled
character_size = len(character_to_idx)
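A quick sanity check (not in the original post) that the lookup tables came out right; the expected value matches the run log further down:

print(words_characters[0])  # ['L', 'I', 'H', 'U', 'A'] -- the character list for word index 0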
START_TAG, STOP_TAG = "START", "STOP"
tag_to_ix = {"N": 0, "V": 1, "ADJ": 2, START_TAG: 3, STOP_TAG: 4}  # map each tag to an index (START/STOP never appear in the targets here)
tag_size = len(tag_to_ix)
def prepare_lookupTable(sequence, to_idx):
    """Wrap a sequence of words or tags as a tensor of indices."""
    idxs = [to_idx[word] for word in sequence]
    return torch.tensor(idxs)
print(word_to_idx)
for input in inputs:
    sentence, tag = input[0], input[1]
    print("the tensor for {} is {}".format(sentence, prepare_lookupTable(sentence, word_to_idx)))
    print("the tensor for {} is {}".format(tag, prepare_lookupTable(tag, tag_to_ix)))
Output:
{'LIHUA': 0, 'went': 1, 'to': 2, 'The': 3, 'TsinghUA': 4, 'University': 5, 'Liping': 6, 'technical': 7, 'school': 8, 'I': 9, 'work': 10, 'in': 11, 'the': 12, 'mall': 13, 'we': 14, 'both': 15, 'have': 16, 'bright': 17, 'future': 18}
the tensor for ['LIHUA', 'went', 'to', 'The', 'TsinghUA', 'University'] is tensor([0, 1, 2, 3, 4, 5])
the tensor for ['N', 'V', 'V', 'N', 'N', 'N'] is tensor([0, 1, 1, 0, 0, 0])
the tensor for ['Liping', 'went', 'to', 'technical', 'school'] is tensor([6, 1, 2, 7, 8])
the tensor for ['N', 'V', 'V', 'N', 'N'] is tensor([0, 1, 1, 0, 0])
the tensor for ['I', 'work', 'in', 'the', 'mall'] is tensor([ 9, 10, 11, 12, 13])
the tensor for ['N', 'V', 'V', 'N', 'N'] is tensor([0, 1, 1, 0, 0])
the tensor for ['we', 'both', 'have', 'bright', 'future'] is tensor([14, 15, 16, 17, 18])
the tensor for ['N', 'N', 'V', 'ADJ', 'N'] is tensor([0, 0, 1, 2, 0])
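One limitation worth noting before moving on: prepare_lookupTable raises a KeyError for any word outside word_to_idx. A minimal sketch of a fallback, assuming we reserve one extra, hypothetical <UNK> index (the word embedding table would then need vocab + 1 rows):

UNK_IDX = len(word_to_idx)  # hypothetical extra index, not in the original code

def prepare_lookup_safe(sequence, to_idx):
    # Fall back to UNK_IDX for out-of-vocabulary tokens instead of raising KeyError.
    return torch.tensor([to_idx.get(w, UNK_IDX) for w in sequence])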
WORD_EMBEDDING_DIM = 6
CHARACTER_EMBEDDING_DIM = 4
HIDDEN_DIM = 6
CHARACTER_HIDDEN_DIM = 6
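The two sizes are tied together: the tag LSTM consumes the word embedding concatenated with the character LSTM's final hidden state, so its input size must be WORD_EMBEDDING_DIM + CHARACTER_HIDDEN_DIM = 12. A quick check of that arithmetic before defining the model:

# Sanity check: a 6-dim word embedding concatenated with a 6-dim character
# feature gives the 12-dim input that tag_lstm below is constructed to accept.
word_vec = torch.zeros(WORD_EMBEDDING_DIM)
char_vec = torch.zeros(CHARACTER_HIDDEN_DIM)
assert torch.cat((word_vec, char_vec)).shape[0] == WORD_EMBEDDING_DIM + CHARACTER_HIDDEN_DIM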
class LSTMTagger(nn.Module):
    def __init__(self, word_embedding_dim, c_embedding_dim, hidden_dim, c_hidden_dim, vocab_size, c_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.character_dim = c_embedding_dim
        self.word_embeddings = nn.Embedding(vocab_size, word_embedding_dim)
        self.character_embeddings = nn.Embedding(c_size, c_embedding_dim)
        # lstm_c takes one character embedding per step; its final hidden state,
        # of size c_hidden_dim, serves as the word's character-level feature.
        self.lstm_c = nn.LSTM(c_embedding_dim, c_hidden_dim)
        # tag_lstm takes the word embedding concatenated with that character-level
        # feature and outputs hidden states of size hidden_dim.
        self.tag_lstm = nn.LSTM(word_embedding_dim + c_hidden_dim, hidden_dim)
        # A linear layer maps the hidden state space to the tag space.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden_tag = self.init_hidden(self.hidden_dim)
        self.hidden_character = self.init_hidden(c_hidden_dim)

    def init_hidden(self, hidden_dim):
        # There is no hidden state to begin with, so initialize one;
        # the dimensions are (num_layers, batch_size, hidden_dim).
        return (torch.zeros(1, 1, hidden_dim),
                torch.zeros(1, 1, hidden_dim))
    def forward(self, sentence, word_characters):
        cat_embeds = []
        for word in sentence:
            print("sentence is {}".format(sentence))
            print("word is {}".format(word))
            word_embed = self.word_embeddings(word)
            print("word_embed is {}".format(word_embed))
            # item() turns a one-element tensor into a plain Python scalar,
            # which is what the words_characters dictionary is keyed by.
            word_character = word_characters[word.item()]
            print("word_c is {}".format(word_character))
            word_character_in = prepare_lookupTable(word_character, character_to_idx)
            print("word_c_in is {}".format(word_character_in))
            c_embeds = self.character_embeddings(word_character_in)
            print("c_embeds is {}".format(c_embeds))
            # Run the word's characters through the character LSTM; note that the
            # hidden state is carried over from the previous word, not re-initialized.
            lstm_out_c, self.hidden_character = self.lstm_c(c_embeds.view(len(word_character_in), 1, -1), self.hidden_character)
            # Concatenate the word embedding with the character-level feature (the
            # character LSTM's final hidden state h, flattened to c_hidden_dim values).
            cat_emb = torch.cat((word_embed, self.hidden_character[0].view(-1)))
            cat_embeds.append(cat_emb)
        embeds = torch.cat(cat_embeds).view(len(sentence), 1, -1)
        lstm_out, self.hidden_tag = self.tag_lstm(embeds, self.hidden_tag)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        print("tag_space is {}".format(tag_space))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores
model = LSTMTagger(WORD_EMBEDDING_DIM, CHARACTER_EMBEDDING_DIM, HIDDEN_DIM, CHARACTER_HIDDEN_DIM, vocab, character_size, tag_size)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)
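As an aside, pack_sequence is imported at the top but never used. A hedged sketch of what it could do here (assuming a PyTorch version that supports enforce_sorted=False): run all of a sentence's variable-length character sequences through lstm_c in one batched call instead of the per-word Python loop in forward.

with torch.no_grad():
    sentence0 = prepare_lookupTable(inputs[0][0], word_to_idx)  # example sentence
    char_seqs = [model.character_embeddings(prepare_lookupTable(words_characters[w.item()], character_to_idx))
                 for w in sentence0]                             # one (word_len, 4) tensor per word
    packed = pack_sequence(char_seqs, enforce_sorted=False)      # handles the varying word lengths
    _, (h_n, _) = model.lstm_c(packed)                           # h_n: (1, num_words, CHARACTER_HIDDEN_DIM)
    char_features = h_n.squeeze(0)                               # one character-level feature per word
    print(char_features.shape)                                   # torch.Size([6, 6])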
with torch.no_grad():
    inputs1 = prepare_lookupTable(inputs[0][0], word_to_idx)
    tag_scores = model(inputs1, words_characters)
    print("scores before training:\n")
    print(tag_scores)

for epoch in range(2):  # an arbitrary small number of epochs for this example
    for input in inputs:
        sentence, tag = input[0], input[1]
        # Step 1: PyTorch accumulates gradients, so clear them before each instance.
        model.zero_grad()
        # Also clear the LSTM hidden states, detaching them from the previous instance's history.
        model.hidden_tag = model.init_hidden(HIDDEN_DIM)
        model.hidden_character = model.init_hidden(CHARACTER_HIDDEN_DIM)
        # Step 2: prepare the network inputs as tensors of word and tag indices.
        sentence_in = prepare_lookupTable(sentence, word_to_idx)
        targets = prepare_lookupTable(tag, tag_to_ix)
        # Step 3: the forward pass.
        tag_scores = model(sentence_in, words_characters)
        # Step 4: compute the loss and gradients, then update the parameters with optimizer.step().
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

with torch.no_grad():
    inputs1 = prepare_lookupTable(inputs[0][0], word_to_idx)
    tag_scores = model(inputs1, words_characters)
    print("scores after training:\n")
    print(tag_scores)
Run output. The denormal-scale embedding values and the NaN rows in tag_space are symptoms of the two bugs fixed above: vocab and character_size were originally read off the dictionaries before they were filled, so both nn.Embedding tables were created with zero rows and every lookup read uninitialized out-of-bounds memory; and characters already in character_to_idx were re-assigned, producing colliding indices in word_c_in (note 'U' and 'A' both mapping to 15).

sentence is tensor([0, 1, 2, 3, 4, 5])
word is 0
word_embed is tensor([1.7303e+25, 3.4732e-12, 1.7743e+28, 2.0535e-19, 1.7260e+25, 3.4589e-12])
word_c is ['L', 'I', 'H', 'U', 'A']
word_c_in is tensor([18, 22, 2, 15, 15])
c_embeds is tensor([[ 1.2612e-44, 4.4766e+00, 0.0000e+00, 0.0000e+00],
        [ 0.0000e+00, -2.0000e+00, 0.0000e+00, -2.0000e+00],
        [ 9.8091e-45, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [ 0.0000e+00, 0.0000e+00, 1.4013e-45, 0.0000e+00],
        [ 0.0000e+00, -2.0000e+00, -1.9407e-19, 4.6577e-10]])

sentence is tensor([0, 1, 2, 3, 4, 5])
word is 1
word_embed is tensor([ 0.0000e+00, -2.0000e+00, 0.0000e+00, -2.0000e+00, 8.4078e-45, 0.0000e+00])
word_c is ['w', 'e', 'n', 't']
word_c_in is tensor([24, 27, 23, 27])
c_embeds is tensor([[4.2039e-43, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [3.4718e-35, 4.5775e-41, 3.4718e-35, 4.5775e-41],
        [3.4718e-35, 4.5775e-41, 1.4013e-45, 0.0000e+00]])

sentence is tensor([0, 1, 2, 3, 4, 5])
word is 2
word_embed is tensor([ 0.0000e+00, -2.0000e+00, 0.0000e+00, -2.0000e+00, 9.8091e-45, 0.0000e+00])
word_c is ['t', 'o']
word_c_in is tensor([27, 25])
c_embeds is tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 5.2698e-29, 1.4013e-45]])

sentence is tensor([0, 1, 2, 3, 4, 5])
word is 3
word_embed is tensor([ 0.0000e+00, -2.0000e+00, -1.5485e-19, 2.5250e-29, 1.1210e-44, 0.0000e+00])
word_c is ['T', 'h', 'e']
word_c_in is tensor([12, 25, 27])
c_embeds is tensor([[ 0.0000e+00, 0.0000e+00, 5.2698e-29, 1.4013e-45],
        [ nan, 0.0000e+00, 1.1210e-44, 0.0000e+00],
        [-1.9397e-19, -2.5250e-29, 5.4124e-29, 1.4013e-45]])

sentence is tensor([0, 1, 2, 3, 4, 5])
word is 4
word_embed is tensor([ 0.2962, 0.2165, -0.2325, -0.0868, 0.2969, 0.3876])
word_c is ['T', 's', 'i', 'n', 'g', 'h', 'U', 'A']
word_c_in is tensor([12, 22, 25, 23, 25, 25, 15, 15])
c_embeds is tensor([[5.4121e-29, 1.4013e-45, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00]])

sentence is tensor([0, 1, 2, 3, 4, 5])
word is 5
word_embed is tensor([5.7439e-21, 1.4013e-45, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00])
word_c is ['U', 'n', 'i', 'v', 'e', 'r', 's', 'i', 't', 'y']
word_c_in is tensor([15, 23, 25, 25, 27, 27, 22, 25, 27, 17])
c_embeds is tensor([[1.7937e-43, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [1.6343e-35, 4.5775e-41, 1.6343e-35, 4.5775e-41],
        [1.6343e-35, 4.5775e-41, 0.0000e+00, 0.0000e+00],
        [1.7236e-43, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [1.6343e-35, 4.5775e-41, 1.6343e-35, 4.5775e-41],
        [1.6343e-35, 4.5775e-41, 1.4013e-45, 0.0000e+00],
        [1.6343e-35, 4.5775e-41, 0.0000e+00, 0.0000e+00]])

tag_space is tensor([[-0.4361, 0.1883, -0.4384, 0.2209, -0.1222],
        [-0.3599, 0.3346, -0.2629, 0.0029, -0.1098],
        [-0.3524, 0.3924, -0.2335, -0.0591, -0.1544],
        [ nan, nan, nan, nan, nan],
        [ nan, nan, nan, nan, nan],
        [ nan, nan, nan, nan, nan]])
scores before training:

tensor([[-1.9689, -1.3445, -1.9712, -1.3119, -1.6549],
        [-1.9208, -1.2262, -1.8237, -1.5579, -1.6707],
        [-1.9158, -1.1709, -1.7968, -1.6225, -1.7178],
        [ nan, nan, nan, nan, nan],
        [ nan, nan, nan, nan, nan],
        [ nan, nan, nan, nan, nan]])
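To read the scores as actual tags (a small addition, not in the original post): each row of tag_scores holds log-probabilities over the five tags, so the prediction for each word is the argmax of its row, mapped back through an inverted tag_to_ix:

ix_to_tag = {ix: tag for tag, ix in tag_to_ix.items()}
with torch.no_grad():
    scores = model(prepare_lookupTable(inputs[0][0], word_to_idx), words_characters)
    predicted = [ix_to_tag[ix.item()] for ix in scores.argmax(dim=1)]
    print(predicted)  # one tag string per word, e.g. ['N', 'V', ...]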