To combine the strengths of CNNs and RNNs, [Vaswani et al., 2017] designed the Transformer, a model built entirely on attention mechanisms. Attention lets the Transformer capture dependencies across a sequence while processing all of its tokens in parallel, which gives it strong performance while greatly reducing training time.
import os
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import sys
sys.path.append('/home/kesci/input/d2len9900')
import d2l
The following code copies the masked softmax implementation from the previous section, so we will not go over it again here.
def SequenceMask(X, X_len, value=-1e6):
    # Fill positions of X (shape: (n, maxlen)) that lie beyond each row's
    # valid length X_len with `value`.
    maxlen = X.size(1)
    X_len = X_len.to(X.device)
    mask = torch.arange((maxlen), dtype=torch.float, device=X.device)
    mask = mask[None, :] < X_len[:, None]
    X[~mask] = value
    return X
def masked_softmax(X, valid_length):
    # X: 3-D tensor, valid_length: 1-D or 2-D tensor
    softmax = nn.Softmax(dim=-1)
    if valid_length is None:
        return softmax(X)
    else:
        shape = X.shape
        if valid_length.dim() == 1:
            # One valid length per sequence: repeat it for every query,
            # e.g. [2, 3] -> [2, 2, 3, 3] when there are 2 queries.
            valid_length = torch.FloatTensor(
                valid_length.cpu().numpy().repeat(shape[1], axis=0))
        else:
            valid_length = valid_length.reshape((-1,))
        # Fill masked elements with a large negative value, whose exp is ~0.
        X = SequenceMask(X.reshape((-1, shape[-1])), valid_length)
        return softmax(X).reshape(shape)


# Save to the d2l package.
class DotProductAttention(nn.Module):
    def __init__(self, dropout, **kwargs):
        super(DotProductAttention, self).__init__(**kwargs)
        self.dropout = nn.Dropout(dropout)

    # query: (batch_size, #queries, d)
    # key: (batch_size, #kv_pairs, d)
    # value: (batch_size, #kv_pairs, dim_v)
    # valid_length: either (batch_size, ) or (batch_size, #queries)
    def forward(self, query, key, value, valid_length=None):
        d = query.shape[-1]
        # transpose(1, 2) swaps the last two dimensions of key
        scores = torch.bmm(query, key.transpose(1, 2)) / math.sqrt(d)
        attention_weights = self.dropout(masked_softmax(scores, valid_length))
        return torch.bmm(attention_weights, value)
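To see the masking in action, here is a small sanity check; it is not part of the original text, and the tensor shapes and valid lengths below are purely illustrative.

# Toy check of DotProductAttention: batch of 2, one query each, 10 key-value
# pairs with value dimension 4. Only the first 2 (resp. 6) pairs are attended to.
atten = DotProductAttention(dropout=0)
keys = torch.ones((2, 10, 2), dtype=torch.float)
values = torch.arange(40, dtype=torch.float).view(1, 10, 4).repeat(2, 1, 1)
query = torch.ones((2, 1, 2), dtype=torch.float)
out = atten(query, keys, values, torch.FloatTensor([2, 6]))
print(out.shape)  # torch.Size([2, 1, 4])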
class MultiHeadAttention(nn.Module):
    def __init__(self, input_size, hidden_size, num_heads, dropout, **kwargs):
        super(MultiHeadAttention, self).__init__(**kwargs)
        self.num_heads = num_heads
        self.attention = DotProductAttention(dropout)
        self.W_q = nn.Linear(input_size, hidden_size, bias=False)
        self.W_k = nn.Linear(input_size, hidden_size, bias=False)
        self.W_v = nn.Linear(input_size, hidden_size, bias=False)
        self.W_o = nn.Linear(hidden_size, hidden_size, bias=False)

    def forward(self, query, key, value, valid_length):
        # query, key, and value shape: (batch_size, seq_len, dim),
        # where seq_len is the length of the input sequence.
        # valid_length shape is either (batch_size, ) or (batch_size, seq_len).

        # Project query, key, and value, then split them into heads:
        # (batch_size, seq_len, hidden_size) ->
        # (batch_size * num_heads, seq_len, hidden_size / num_heads).
        query = transpose_qkv(self.W_q(query), self.num_heads)
        key = transpose_qkv(self.W_k(key), self.num_heads)
        value = transpose_qkv(self.W_v(value), self.num_heads)

        if valid_length is not None:
            # Repeat each sequence's valid_length num_heads times so that it
            # lines up with the (batch_size * num_heads) ordering produced by
            # transpose_qkv.
            device = valid_length.device
            valid_length = valid_length.cpu().numpy()
            if valid_length.ndim == 1:
                valid_length = torch.FloatTensor(
                    np.repeat(valid_length, self.num_heads))
            else:
                valid_length = torch.FloatTensor(
                    np.repeat(valid_length, self.num_heads, axis=0))
            valid_length = valid_length.to(device)

        output = self.attention(query, key, value, valid_length)
        # Concatenate the heads back to (batch_size, seq_len, hidden_size).
        output_concat = transpose_output(output, self.num_heads)
        return self.W_o(output_concat)
def transpose_qkv(X, num_heads):
    # Original X shape: (batch_size, seq_len, hidden_size).
    # After the first reshape (-1 infers the per-head size), X shape:
    # (batch_size, seq_len, num_heads, hidden_size / num_heads)
    X = X.view(X.shape[0], X.shape[1], num_heads, -1)
    # After transpose: (batch_size, num_heads, seq_len, hidden_size / num_heads)
    X = X.transpose(2, 1).contiguous()
    # Merge the first two dimensions.
    # Output shape: (batch_size * num_heads, seq_len, hidden_size / num_heads)
    output = X.view(-1, X.shape[2], X.shape[3])
    return output


# Saved in the d2l package for later use
def transpose_output(X, num_heads):
    # A reversed version of transpose_qkv
    X = X.view(-1, num_heads, X.shape[1], X.shape[2])
    X = X.transpose(2, 1).contiguous()
    return X.view(X.shape[0], X.shape[1], -1)
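The plotting cell and the encoder below also use a PositionalEncoding class from the previous subsection, whose definition is not reproduced in this excerpt. A minimal sketch of the standard sinusoidal encoding, assuming the (embedding_size, dropout) constructor interface used below, could look like this:

class PositionalEncoding(nn.Module):
    # Sketch of the sinusoidal positional encoding; the original definition
    # is in the previous subsection and is not shown in this excerpt.
    def __init__(self, embedding_size, dropout, max_len=1000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(dropout)
        # P[0, pos, 2i]   = sin(pos / 10000^(2i / embedding_size))
        # P[0, pos, 2i+1] = cos(pos / 10000^(2i / embedding_size))
        self.P = torch.zeros((1, max_len, embedding_size))
        position = torch.arange(0, max_len, dtype=torch.float32).reshape(-1, 1)
        div = torch.pow(10000, torch.arange(0, embedding_size, 2,
                                            dtype=torch.float32) / embedding_size)
        self.P[:, :, 0::2] = torch.sin(position / div)
        self.P[:, :, 1::2] = torch.cos(position / div)

    def forward(self, X):
        # Add the fixed positional encodings to the input embeddings.
        X = X + self.P[:, :X.shape[1], :].to(X.device)
        return self.dropout(X)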
# Plot the positional encoding values of dimensions 4-7 over the first 100 positions.
pe = PositionalEncoding(20, 0)
Y = pe(torch.zeros((1, 100, 20))).numpy()
d2l.plot(np.arange(100), Y[0, :, 4:8].T, figsize=(6, 2.5),
         legend=["dim %d" % p for p in [4, 5, 6, 7]])
3. Encoder
We now have all the building blocks of the Transformer, so we can start assembling it. An encoder block contains a multi-head attention layer, a position-wise FFN, and two "Add and Norm" layers. For both the attention sub-layer and the FFN sub-layer, the output dimension matches the embedding dimension; this is a natural consequence of the residual connections, because each sub-layer's output is added to its original input before layer normalization.
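The TransformerEncoder below relies on an EncoderBlock (and, through it, a position-wise FFN and an "Add and Norm" layer) defined in earlier subsections that are not reproduced in this excerpt. A minimal sketch with the interface assumed by the code below might look like this; the class and parameter names are inferred from how they are used later.

# Sketch of the building blocks assumed by TransformerEncoder; the original
# definitions live in earlier subsections not shown here.
class PositionWiseFFN(nn.Module):
    def __init__(self, input_size, ffn_hidden_size, output_size, **kwargs):
        super(PositionWiseFFN, self).__init__(**kwargs)
        self.ffn_1 = nn.Linear(input_size, ffn_hidden_size)
        self.ffn_2 = nn.Linear(ffn_hidden_size, output_size)

    def forward(self, X):
        # Applied to every position independently (acts on the last dimension).
        return self.ffn_2(F.relu(self.ffn_1(X)))


class AddNorm(nn.Module):
    def __init__(self, hidden_size, dropout, **kwargs):
        super(AddNorm, self).__init__(**kwargs)
        self.dropout = nn.Dropout(dropout)
        self.norm = nn.LayerNorm(hidden_size)

    def forward(self, X, Y):
        # Residual connection followed by layer normalization.
        return self.norm(self.dropout(Y) + X)


class EncoderBlock(nn.Module):
    def __init__(self, embedding_size, ffn_hidden_size, num_heads,
                 dropout, **kwargs):
        super(EncoderBlock, self).__init__(**kwargs)
        self.attention = MultiHeadAttention(embedding_size, embedding_size,
                                            num_heads, dropout)
        self.addnorm_1 = AddNorm(embedding_size, dropout)
        self.ffn = PositionWiseFFN(embedding_size, ffn_hidden_size,
                                   embedding_size)
        self.addnorm_2 = AddNorm(embedding_size, dropout)

    def forward(self, X, valid_length):
        # Self-attention sub-layer, then position-wise FFN, each wrapped in
        # an Add and Norm.
        Y = self.addnorm_1(X, self.attention(X, X, X, valid_length))
        return self.addnorm_2(Y, self.ffn(Y))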
class TransformerEncoder(d2l.Encoder):
    def __init__(self, vocab_size, embedding_size, ffn_hidden_size,
                 num_heads, num_layers, dropout, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.embedding_size = embedding_size
        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.pos_encoding = PositionalEncoding(embedding_size, dropout)
        self.blks = nn.ModuleList()
        for i in range(num_layers):
            self.blks.append(
                EncoderBlock(embedding_size, ffn_hidden_size, num_heads, dropout))

    def forward(self, X, valid_length, *args):
        # Scale the embeddings by sqrt(embedding_size) before adding the
        # positional encoding, then run the stack of encoder blocks.
        X = self.pos_encoding(self.embed(X) * math.sqrt(self.embedding_size))
        for blk in self.blks:
            X = blk(X, valid_length)
        return X
# Test the encoder on a toy batch; valid_length holds each sequence's
# actual length (the values here are illustrative).
valid_length = torch.FloatTensor([2, 3])
encoder = TransformerEncoder(200, 24, 48, 8, 2, 0.5)
encoder(torch.ones((2, 100)).long(), valid_length).shape
torch.Size([2, 100, 24])
4. Decoder
The decoder of the Transformer has a structure similar to the encoder. However, besides the two sub-layers introduced above, each decoder block contains an additional sub-layer: a multi-head attention layer that takes the encoder output as its keys and values and the decoder state as its queries. As in the encoder, every sub-layer's output is wired up with residual connections and layer normalization ("Add and Norm").
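The full decoder implementation, which also caches previously generated tokens for step-by-step prediction at inference time, is not included in this excerpt. Ignoring that state handling, a simplified sketch of a single decoder block with the extra encoder-decoder attention sub-layer described above could look like this; it reuses the AddNorm and PositionWiseFFN sketches from the encoder section.

# Simplified sketch of a decoder block (no incremental-decoding state).
class DecoderBlock(nn.Module):
    def __init__(self, embedding_size, ffn_hidden_size, num_heads,
                 dropout, **kwargs):
        super(DecoderBlock, self).__init__(**kwargs)
        self.self_attention = MultiHeadAttention(embedding_size, embedding_size,
                                                 num_heads, dropout)
        self.addnorm_1 = AddNorm(embedding_size, dropout)
        self.enc_attention = MultiHeadAttention(embedding_size, embedding_size,
                                                num_heads, dropout)
        self.addnorm_2 = AddNorm(embedding_size, dropout)
        self.ffn = PositionWiseFFN(embedding_size, ffn_hidden_size,
                                   embedding_size)
        self.addnorm_3 = AddNorm(embedding_size, dropout)

    def forward(self, X, enc_outputs, enc_valid_length, dec_valid_length=None):
        # Masked self-attention over the decoder inputs; dec_valid_length keeps
        # each position from attending to subsequent positions during training.
        Y = self.addnorm_1(X, self.self_attention(X, X, X, dec_valid_length))
        # Encoder-decoder attention: queries come from the decoder state,
        # keys and values from the encoder outputs.
        Z = self.addnorm_2(Y, self.enc_attention(Y, enc_outputs, enc_outputs,
                                                 enc_valid_length))
        return self.addnorm_3(Z, self.ffn(Z))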