[Class Notes][PyTorch Study][3] Language Models
Notes for Lecture 3:
# Note: torchtext has been updated since these notes were written, so some of the code below no longer runs as-is; please follow the official torchtext README for the current API.
Creating the vocabulary
```python
import torchtext
from torchtext.vocab import Vectors
import torch
import numpy as np
import random

USE_CUDA = torch.cuda.is_available()

# To make experiments reproducible, we usually fix every random seed to a constant value
random.seed(53113)
np.random.seed(53113)
torch.manual_seed(53113)
if USE_CUDA:
    torch.cuda.manual_seed(53113)

BATCH_SIZE = 32
EMBEDDING_SIZE = 650
MAX_VOCAB_SIZE = 50000
```
- torchtext provides the LanguageModelingDataset class to help us process language-model datasets
- BPTTIterator iterates over the text continuously, so consecutive batches form coherent sequences
```python
TEXT = torchtext.data.Field(lower=True)
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
    path=".",
    train="text8.train.txt",
    validation="text8.dev.txt",
    test="text8.test.txt",
    text_field=TEXT)
TEXT.build_vocab(train, max_size=MAX_VOCAB_SIZE)
print("vocabulary size: {}".format(len(TEXT.vocab)))
VOCAB_SIZE = len(TEXT.vocab)
train_iter, val_iter, test_iter = torchtext.data.BPTTIterator.splits(
    (train, val, test),
    batch_size=BATCH_SIZE,
    device=-1,  # in this (older) torchtext version, -1 means CPU
    bptt_len=32,
    repeat=False,
    shuffle=True)
```
The model's input is a sequence of words and its output is also a sequence of words, shifted by one position, because the goal of a language model is to predict the next word from the words before it.
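As a quick check (a sketch, not part of the original notes), we can pull one batch from `train_iter` and decode one column of both fields; the target should be the input shifted by one token:

```python
# Sketch: decode one column of a batch to see that batch.target
# is batch.text shifted by one position.
it = iter(train_iter)
batch = next(it)
print(" ".join(TEXT.vocab.itos[i.item()] for i in batch.text[:, 0]))
print(" ".join(TEXT.vocab.itos[i.item()] for i in batch.target[:, 0]))
```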
Defining the model
```python
import torch
import torch.nn as nn

class RNNModel(nn.Module):
    """A simple recurrent neural network."""

    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5):
        '''The model contains the following layers:
        - a word embedding layer
        - a recurrent layer (RNN, LSTM, or GRU)
        - a linear layer mapping the hidden state to the output vocabulary
        - a dropout layer for regularization
        '''
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        if rnn_type in ['LSTM', 'GRU']:
            self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout)
        else:
            try:
                nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
            except KeyError:
                raise ValueError("""An invalid option for `--model` was supplied,
                                 options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""")
            self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)
        self.init_weights()

        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        '''Forward pass:
        - word embedding
        - feed through the recurrent network
        - a linear layer maps the hidden state to the output vocabulary
        '''
        emb = self.drop(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

    def init_hidden(self, bsz, requires_grad=True):
        weight = next(self.parameters())
        if self.rnn_type == 'LSTM':
            return (weight.new_zeros((self.nlayers, bsz, self.nhid), requires_grad=requires_grad),
                    weight.new_zeros((self.nlayers, bsz, self.nhid), requires_grad=requires_grad))
        else:
            return weight.new_zeros((self.nlayers, bsz, self.nhid), requires_grad=requires_grad)
```
Initializing the model
```python
model = RNNModel("LSTM", VOCAB_SIZE, EMBEDDING_SIZE, EMBEDDING_SIZE, 2, dropout=0.5)
if USE_CUDA:
    model = model.cuda()
```
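To confirm the wiring, here is a small sanity-check sketch (not from the original notes) that runs a single forward pass and prints the output shape:

```python
# Sketch: one forward pass to verify tensor shapes.
hidden = model.init_hidden(BATCH_SIZE)
batch = next(iter(train_iter))
data = batch.text.cuda() if USE_CUDA else batch.text
output, hidden = model(data, hidden)
print(output.shape)  # expected: (bptt_len, BATCH_SIZE, VOCAB_SIZE), i.e. [32, 32, ~50000]
```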
- We first define the code that evaluates the model.
- Evaluation follows essentially the same logic as training; the only difference is that we only need the forward pass, not the backward pass.
```python
def evaluate(model, data):
    model.eval()
    total_loss = 0.
    total_count = 0.
    it = iter(data)
    with torch.no_grad():
        hidden = model.init_hidden(BATCH_SIZE, requires_grad=False)
        for i, batch in enumerate(it):
            data, target = batch.text, batch.target
            if USE_CUDA:
                data, target = data.cuda(), target.cuda()
            # detach the hidden state so it carries no graph history across batches
            hidden = repackage_hidden(hidden)
            output, hidden = model(data, hidden)
            # loss_fn is the training criterion (typically nn.CrossEntropyLoss() for language models)
            loss = loss_fn(output.view(-1, VOCAB_SIZE), target.view(-1))
            # accumulate the loss weighted by the number of tokens in the batch
            total_count += np.multiply(*data.size())
            total_loss += loss.item() * np.multiply(*data.size())
    loss = total_loss / total_count
    model.train()
    return loss
```
We also need the following helper function to detach a hidden state from the history of the computation graph that produced it.
```python
def repackage_hidden(h):
    """Wrap hidden states in new Tensors, detaching them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        # for an LSTM the hidden state is a tuple (h, c), so detach recursively
        return tuple(repackage_hidden(v) for v in h)
```
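The `evaluate` function above references `loss_fn`, which these notes define in the training code (not included in this section). A minimal sketch of how evaluation would be invoked, assuming the criterion is cross-entropy, the standard choice for language models:

```python
# Assumption: the training section defines the criterion as cross-entropy.
loss_fn = nn.CrossEntropyLoss()

val_loss = evaluate(model, val_iter)
# perplexity is the exponential of the average cross-entropy loss
print("val loss: {:.4f}, perplexity: {:.4f}".format(val_loss, np.exp(val_loss)))
```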