56 Gated Recurrent Unit (GRU)

import torch
from torch import nn
from d2l import torch as d2l

"""
额外的控制单元
"""

batch_size, num_steps = 32, 35
train_iter, vocab = d2l.load_data_time_machine(batch_size, num_steps)
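
# Illustrative sanity check (an addition, not part of the original post):
# each minibatch from train_iter should be a pair (X, Y) of token indices,
# both of shape (batch_size, num_steps) = (32, 35), with Y being X shifted
# by one time step.
for X, Y in train_iter:
    print(X.shape, Y.shape)
    break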

# Initialize model parameters
def get_params(vocab_size, num_hiddens, device):
    num_inputs = num_outputs = vocab_size

    def normal(shape):
        # Gaussian initialization scaled by 0.01
        return torch.randn(size=shape, device=device)*0.01

    def three():
        # One (W_x*, W_h*, b_*) triple: input-to-hidden and hidden-to-hidden
        # weights plus a bias, used for each gate and the candidate state
        return (normal((num_inputs, num_hiddens)),
                normal((num_hiddens, num_hiddens)),
                torch.zeros(num_hiddens, device=device))

    W_xz, W_hz, b_z = three()  # Update gate parameters
    W_xr, W_hr, b_r = three()  # Reset gate parameters
    W_xh, W_hh, b_h = three()  # Candidate hidden state parameters
    # Output layer parameters
    W_hq = normal((num_hiddens, num_outputs))
    b_q = torch.zeros(num_outputs, device=device)
    # Attach gradients
    params = [W_xz, W_hz, b_z, W_xr, W_hr, b_r, W_xh, W_hh, b_h, W_hq, b_q]
    for param in params:
        param.requires_grad_(True)
    return params

# Define the function to initialize the hidden state (returned as a 1-tuple)
def init_gru_state(batch_size, num_hiddens, device):
    return (torch.zeros((batch_size, num_hiddens), device=device), )

# Define the GRU model
def gru(inputs, state, params):
    W_xz, W_hz, b_z, W_xr, W_hr, b_r, W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    outputs = []
    # inputs has shape (num_steps, batch_size, vocab_size)
    for X in inputs:
        # Z_t = sigmoid(X_t W_xz + H_{t-1} W_hz + b_z)    update gate
        Z = torch.sigmoid((X @ W_xz) + (H @ W_hz) + b_z)
        # R_t = sigmoid(X_t W_xr + H_{t-1} W_hr + b_r)    reset gate
        R = torch.sigmoid((X @ W_xr) + (H @ W_hr) + b_r)
        # H_tilde_t = tanh(X_t W_xh + (R_t ⊙ H_{t-1}) W_hh + b_h)
        # where ⊙ denotes element-wise multiplication
        H_tilda = torch.tanh((X @ W_xh) + ((R * H) @ W_hh) + b_h)
        # H_t = Z_t ⊙ H_{t-1} + (1 - Z_t) ⊙ H_tilde_t
        H = Z * H + (1 - Z) * H_tilda
        # Output layer
        Y = H @ W_hq + b_q
        outputs.append(Y)
    return torch.cat(outputs, dim=0), (H,)
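
# Illustrative sketch (an addition, not from the original post): run the
# scratch GRU on a tiny random one-hot batch to check the expected shapes.
# With num_steps=5, batch_size=2, and 256 hidden units, the concatenated
# outputs have shape (num_steps * batch_size, vocab_size) and the new state
# has shape (batch_size, 256).
_device = d2l.try_gpu()
_params = get_params(len(vocab), 256, _device)
_state = init_gru_state(2, 256, _device)
_X = torch.nn.functional.one_hot(
    torch.randint(0, len(vocab), (5, 2)), len(vocab)).float().to(_device)
_Y, _new_state = gru(_X, _state, _params)
print(_Y.shape, _new_state[0].shape)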

# Training
vocab_size, num_hiddens, device = len(vocab), 256, d2l.try_gpu()
num_epochs, lr = 500, 1
model = d2l.RNNModelScratch(len(vocab), num_hiddens, device, get_params,
                            init_gru_state, gru)
# d2l.train_ch8(model, train_iter, vocab, lr, num_epochs, device)


# Concise implementation
# The input dimension equals the vocabulary size (one-hot inputs);
# nn.GRU carries out the gate matrix multiplications internally
num_inputs = vocab_size
"""
input_size – The number of expected features in the input x
hidden_size – The number of features in the hidden state h
num_layers – Number of recurrent layers. 
             E.g., setting num_layers=2 would mean stacking two GRUs 
             together to form a stacked GRU, with the second GRU 
             taking in outputs of the first GRU and computing the final results. 
             Default: 1
bias – If False, then the layer does not use bias weights b_ih and b_hh. 
       Default: True
batch_first – If True, then the input and output tensors are provided 
              as (batch, seq, feature) instead of (seq, batch, feature). 
              Note that this does not apply to hidden or cell states. 
              See the Inputs/Outputs sections below for details. 
              Default: False
dropout – If non-zero, introduces a Dropout layer on the outputs of 
          each GRU layer except the last layer, with dropout probability 
          equal to dropout. 
          Default: 0
bidirectional – If True, becomes a bidirectional GRU. 
                Default: False
"""
gru_layer = nn.GRU(num_inputs, num_hiddens)
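
# Illustrative shape check (an addition, not from the original post): with the
# default batch_first=False, nn.GRU expects input of shape
# (seq_len, batch, input_size) and returns (output, h_n), where output has
# shape (seq_len, batch, hidden_size) and h_n has shape
# (num_layers, batch, hidden_size) for a unidirectional GRU.
_X = torch.rand(num_steps, batch_size, num_inputs)
_output, _h_n = gru_layer(_X)
print(_output.shape, _h_n.shape)  # (35, 32, 256), (1, 32, 256)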
# model = d2l.RNNModel(gru_layer, len(vocab))
# model = model.to(device)
# d2l.train_ch8(model, train_iter, vocab, lr, num_epochs, device)
# torch.save(model.state_dict(), 'gru.params')

# Load the trained weights into a fresh model ('gru.params' must already have
# been saved by the commented-out training/saving code above)
clone = d2l.RNNModel(gru_layer, vocab_size=len(vocab))
clone = clone.to(device)
clone.load_state_dict(torch.load('gru.params'))
re = d2l.predict_ch8('time traveller', 15, clone, vocab, device)
print(re)