transformers / torch training demo
通过 PyTorch 训练模型的逻辑:
import torch.nn as nn
import torch
import numpy
# from torch.utils.tensorboard import SummaryWriter
import time

vocabList = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]


class TwoLayerNet(nn.Module):
    """Two linear layers with a ReLU non-linearity in between.

    Args:
        dim_in: size of each input feature vector.
        dim_hide_1: output size of the first linear layer.
        dim_hide_2: output size of the second linear layer; this is also the
            size of the vectors returned by ``forward``.
        dim_out: accepted to keep the constructor signature stable, but not
            used -- no layer of this size is created.
    """

    def __init__(self, dim_in, dim_hide_1, dim_hide_2, dim_out):
        super().__init__()
        self.linear1 = nn.Linear(dim_in, dim_hide_1, bias=True)
        self.linear2 = nn.Linear(dim_hide_1, dim_hide_2, bias=True)

    def forward(self, x):
        """Return ``linear2(relu(linear1(x)))`` for a batch ``x``."""
        hidden = self.linear1(x).clamp(min=0)  # clamp(min=0) acts as ReLU
        y_predict = self.linear2(hidden)
        return y_predict


def train_step(model, loss_fn, optimizer, x1, x2, y):
    """Run one optimisation step on a pair of input batches.

    Both batches are encoded with the same ``model``; the row-wise cosine
    similarity of the two encodings is compared against ``y`` with
    ``loss_fn``.

    Args:
        model: module mapping a ``(batch, features)`` tensor to embeddings.
        loss_fn: callable taking ``(prediction, target)`` and returning a
            scalar tensor.
        optimizer: optimizer built over ``model``'s parameters.
        x1: first input batch.
        x2: second input batch, same shape as ``x1``.
        y: target similarities with the same shape as the cosine-similarity
            output, i.e. ``(batch,)``.

    Returns:
        The loss of this step as a Python float.
    """
    # Clear stale gradients *before* backward rather than after step(), so
    # that param.grad still holds this step's gradients when we return.
    optimizer.zero_grad()
    y_query = model(x1)  # forward pass, first batch
    y_title = model(x2)  # forward pass, second batch
    logits = torch.cosine_similarity(y_query, y_title)  # reduces dim=1
    loss = loss_fn(logits, y)
    loss.backward()
    optimizer.step()
    return loss.item()


if __name__ == "__main__":
    # writer = SummaryWriter('log')
    N = 5  # batch size
    D_in = 10  # input feature size
    H1 = 10  # output size of the first linear layer
    H2 = 15  # output size of the second linear layer (embedding size)
    D_out = 1  # passed to TwoLayerNet, which currently ignores it

    # Randomly initialised training data.
    x1_data = torch.randn(N, D_in)
    x2_data = torch.randn(N, D_in)
    # cosine_similarity yields shape (N,), so the target must be (N,) too.
    # A (N, 1) target would be broadcast against it to (N, N) inside
    # MSELoss and produce a wrong loss.
    y = torch.randn(N)

    model = TwoLayerNet(D_in, H1, H2, D_out)
    loss_fn = nn.MSELoss(reduction='sum')
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    # writer.add_graph(model, input_to_model = torch.rand(5,10))
    # writer.close()

    for t in range(5):
        print("Start train : ---------- ", t)
        loss_value = train_step(model, loss_fn, optimizer, x1_data, x2_data, y)
        print(t, loss_value)

    # Dump every parameter together with the gradients of the last step.
    for name, param in model.named_parameters():
        print(name)
        print(param)
        if param.grad is not None:
            print(param.grad)
            print(param.grad.shape)
            paramGradValue = [row.numpy() for row in param.grad]
            print(paramGradValue)