import torch
from torch import nn
from d2l import torch as d2l
# Data-loading settings; both values are handed to the d2l loader below.
batch_size = 32
num_steps = 35
# The loader returns the training iterator together with the vocabulary.
train_iter, vocab = d2l.load_data_time_machine(batch_size, num_steps)
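# Next, initialize the model parameters. Weights are drawn from a Gaussian
# distribution with standard deviation 0.01, and the biases are set to 0.
# The hyperparameter num_hidden defines the number of hidden units. We
# instantiate all weights and biases associated with the update gate, the
# reset gate, the candidate hidden state, and the output layer.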
def get_params(vocab_size, num_hidden, device):
    """Create the trainable parameters of a from-scratch GRU.

    Weight matrices are sampled from a standard normal distribution and
    scaled by 0.01; bias vectors start at zero. Every tensor is created on
    ``device`` and has gradient tracking switched on before it is returned.

    Args:
        vocab_size: Used as both the input width and the output width.
        num_hidden: Number of hidden units.
        device: Device on which all tensors are allocated.

    Returns:
        A list of 11 tensors in the order
        ``[w_xz, w_hz, b_z, w_xr, w_hr, b_r, w_xh, w_hh, b_h, w_hq, b_q]``.
    """
    num_inputs = num_outputs = vocab_size

    def scaled_randn(rows, cols):
        # Small random initial weights: N(0, 1) samples scaled by 0.01.
        return torch.randn(size=(rows, cols), device=device) * 0.01

    params = []
    # Three identically shaped groups, in order: update gate, reset gate,
    # candidate hidden state. Each group is (input->hidden weight,
    # hidden->hidden weight, bias).
    for _ in range(3):
        params.append(scaled_randn(num_inputs, num_hidden))
        params.append(scaled_randn(num_hidden, num_hidden))
        params.append(torch.zeros(num_hidden, device=device))

    # Output layer parameters.
    params.append(scaled_randn(num_hidden, num_outputs))
    params.append(torch.zeros(num_outputs, device=device))

    # Attach gradients.
    for tensor in params:
        tensor.requires_grad_(True)
    return params
def init_gru_state(batch_size, num_hidden, device):
    """Return the initial GRU state as a one-element tuple.

    The single element is an all-zero tensor of shape
    ``(batch_size, num_hidden)`` allocated on ``device``.
    """
    initial_hidden = torch.zeros(batch_size, num_hidden, device=device)
    return (initial_hidden,)
def gru(inputs, state, params):
    """Run the GRU recurrence over ``inputs`` and project each step's state.

    Args:
        inputs: Iterable yielding one input matrix per step; each is
            right-multiplied by the input->hidden weights.
            (Presumably shaped (num_steps, batch, vocab) - confirm with caller.)
        state: One-element tuple holding the current hidden state.
        params: The 11 tensors
            ``w_xz, w_hz, b_z, w_xr, w_hr, b_r, w_xh, w_hh, b_h, w_hq, b_q``.

    Returns:
        A pair ``(outputs, (h,))``: the per-step outputs concatenated along
        dimension 0, and the final hidden state wrapped in a tuple.
    """
    (w_xz, w_hz, b_z,
     w_xr, w_hr, b_r,
     w_xh, w_hh, b_h,
     w_hq, b_q) = params
    (hidden,) = state
    step_outputs = []
    for x in inputs:
        # `@` is matrix multiplication.
        update_gate = torch.sigmoid((x @ w_xz) + (hidden @ w_hz) + b_z)
        reset_gate = torch.sigmoid((x @ w_xr) + (hidden @ w_hr) + b_r)
        # The reset gate scales the previous state before it feeds the candidate.
        candidate = torch.tanh(
            (x @ w_xh) + ((reset_gate * hidden) @ w_hh) + b_h
        )
        # Blend the previous state and the candidate using the update gate.
        hidden = update_gate * hidden + (1 - update_gate) * candidate
        step_outputs.append(hidden @ w_hq + b_q)
    return torch.cat(step_outputs, dim=0), (hidden,)
# Model size and target device.
vocab_size = len(vocab)
num_hidden = 256
device = d2l.try_gpu()
# Training settings passed to d2l.train_ch8.
num_epochs = 500
lr = 1
# Assemble the from-scratch model out of the parameter factory, the initial
# state factory and the GRU forward function defined in this file.
model = d2l.RNNModelScratch(
    vocab_size, num_hidden, device, get_params, init_gru_state, gru
)
d2l.train_ch8(model, train_iter, vocab, lr, num_epochs, device)
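# NOTE: the lines below are leftover web-page footer text (promotions and
# article links) captured with the listing; they are not part of the program.
# [Recommended] China's first AI IDE with a deep understanding of Chinese-language development scenarios - download and try Trae now
# [Recommended] A new programming experience with an AI that understands you better - try the Doubao MarsCode programming assistant now
# [Recommended] Doubao, the AI assistant from Douyin: your intelligent encyclopedia, completely free with unlimited use
# [Recommended] IShell, a lightweight and high-performance SSH tool: AI-assisted, one step ahead
# - Alibaba's newly open-sourced QwQ-32B rivals the full-strength deepseek-r1 and lowers deployment cost yet again!
# - Why is single-threaded Redis so fast?
# - A first look at the AI-related capabilities of SQL Server 2025
# - The ultimate AI coding tool showdown: ByteDance Trae vs. Cursor - who is the developers' new favorite?
# - Let's talk in detail about using ORM frameworks in C#!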