5502 Concise Implementation of Recurrent Neural Networks

import torch
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l



batch_size, num_steps = 32, 35
train_iter, vocab = d2l.load_data_time_machine(batch_size, num_steps)

# Define the model
num_hiddens = 256
# how large the input/output is, and how large the hidden state is
"""
input_size – The number of expected features in the input x
hidden_size – The number of features in the hidden state h
num_layers – Number of recurrent layers. 
             E.g., setting num_layers=2 would mean stacking two RNNs 
             together to form a stacked RNN, with the second RNN taking 
             in outputs of the first RNN and computing the final results. 
             Default: 1
nonlinearity – The non-linearity to use. 
               Can be either 'tanh' or 'relu'. 
               Default: 'tanh'
bias – If False, then the layer does not use bias weights b_ih and b_hh. 
                 Default: True
batch_first – If True, then the input and output tensors are provided as 
              (batch, seq, feature) instead of (seq, batch, feature). 
              Note that this does not apply to hidden or cell states. 
              See the Inputs/Outputs sections below for details. 
              Default: False (seq, batch, feature)
dropout – If non-zero, introduces a Dropout layer on the outputs of each RNN layer 
          except the last layer, with dropout probability equal to dropout. 
          Default: 0
bidirectional – If True, becomes a bidirectional RNN. 
                Default: False
"""
# Inputs: input, h_0
# Outputs: output, h_n
# no output layer: rnn_layer only computes the hidden states
rnn_layer = nn.RNN(len(vocab), num_hiddens)
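
# A minimal sketch (configurations here are illustrative, not from the original
# post) of how the num_layers and bidirectional arguments documented above
# change the hidden-state shape: (num_layers * num_directions, batch_size, hidden_size).
stacked_rnn = nn.RNN(len(vocab), num_hiddens, num_layers=2)
bidir_rnn = nn.RNN(len(vocab), num_hiddens, bidirectional=True)
probe = torch.rand(num_steps, batch_size, len(vocab))
_, h_stacked = stacked_rnn(probe)  # torch.Size([2, 32, 256]): two layers
_, h_bidir = bidir_rnn(probe)      # torch.Size([2, 32, 256]): two directions
print(h_stacked.shape, h_bidir.shape)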

# Initialize the hidden state
# shape: (num_layers * num_directions, batch_size, num_hiddens)
state = torch.zeros((1, batch_size, num_hiddens))

# Given a hidden state and an input, we can compute the output with the updated hidden state.
X = torch.rand(size=(num_steps, batch_size, len(vocab)))
Y, state_new = rnn_layer(X, state)
print(Y.shape, state_new.shape)
"""
(seq, batch, feature)
torch.Size([35, 32, 256]) torch.Size([1, 32, 256])
"""
#@save
class RNNModel(nn.Module):
    """循环神经网络模型"""
    def __init__(self, rnn_layer, vocab_size, **kwargs):
        super(RNNModel, self).__init__(**kwargs)
        # rnn_layer contains only the hidden layer, not the output layer
        self.rnn = rnn_layer
        self.vocab_size = vocab_size
        self.num_hiddens = self.rnn.hidden_size
        # If the RNN is bidirectional (introduced later), num_directions is 2;
        # otherwise it is 1. Either way we must build our own output layer.
        if not self.rnn.bidirectional:
            self.num_directions = 1
            self.linear = nn.Linear(self.num_hiddens, self.vocab_size)
        else:
            self.num_directions = 2
            self.linear = nn.Linear(self.num_hiddens * 2, self.vocab_size)

    def forward(self, inputs, state):
        # long corresponds to torch.int64;
        # convert with tensor.long() or tensor.type(torch.int64)
        # (see the one-hot sketch after this class definition)
        X = F.one_hot(inputs.T.long(), self.vocab_size)
        X = X.to(torch.float32)
        Y, state = self.rnn(X, state)
        # The fully connected layer first reshapes Y to
        # (num_steps * batch_size, num_hiddens); its output has shape
        # (num_steps * batch_size, vocab_size).
        output = self.linear(Y.reshape((-1, Y.shape[-1])))
        return output, state

    def begin_state(self, device, batch_size=1):
        if not isinstance(self.rnn, nn.LSTM):
            # nn.GRU (like nn.RNN) uses a tensor as the hidden state
            return torch.zeros((self.num_directions * self.rnn.num_layers,
                                 batch_size, self.num_hiddens),
                                device=device)
        else:
            # nn.LSTM uses a tuple as the hidden state
            return (torch.zeros((
                        self.num_directions * self.rnn.num_layers,
                        batch_size, self.num_hiddens), device=device),
                    torch.zeros((
                        self.num_directions * self.rnn.num_layers,
                        batch_size, self.num_hiddens), device=device))

# Make a prediction with a model that still has random weights
device = d2l.try_gpu()
net = RNNModel(rnn_layer, vocab_size=len(vocab))
net = net.to(device)
re = d2l.predict_ch8('time traveller', 10, net, vocab, device)
print(re)

num_epochs, lr = 500, 1
# Speed-up over the from-scratch version:
# the from-scratch implementation performs many small matrix multiplications,
# while the framework fuses them into one large matrix multiplication;
# many small matmuls cost more overhead than one large matmul.
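# A rough microbenchmark sketch (added for illustration; absolute timings vary
# by hardware and lack warmup, so treat the numbers loosely): a step-by-step
# loop of small matmuls versus one fused matmul over all time steps.
import time
W = torch.rand(num_hiddens, num_hiddens)
steps = [torch.rand(num_hiddens, batch_size) for _ in range(num_steps)]
start = time.time()
for x in steps:                    # many small matrix multiplications
    _ = W @ x
t_small = time.time() - start
fused = torch.cat(steps, dim=1)    # one (num_hiddens, num_steps*batch_size) matrix
start = time.time()
_ = W @ fused                      # a single large matrix multiplication
t_fused = time.time() - start
print(f'small: {t_small:.6f}s, fused: {t_fused:.6f}s')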
# Run training and save the parameters once before loading them below:
# d2l.train_ch8(net, train_iter, vocab, lr, num_epochs, device)
# torch.save(net.state_dict(), 'rnn.params')
clone = RNNModel(rnn_layer, vocab_size=len(vocab))
clone = clone.to(device)
clone.load_state_dict(torch.load('rnn.params'))
re = d2l.predict_ch8('time traveller', 10, clone, vocab, device)
print(re)

