Implementing Multi-Head Attention in Python
Full code:
import torch
import numpy as np
import torch.nn as nn
import math
import torch.nn.functional as F
# multi-head attention
class selfAttention(nn.Module):
def __init__(self, num_attention_heads, input_size, d_model, Q=None, K=None, V=None):
super(selfAttention, self).__init__()
if d_model % num_attention_heads != 0:
raise ValueError(
"the hidden size %d is not a multiple of the number of attention heads"
"%d" % (d_model, num_attention_heads)
)
self.num_attention_heads = num_attention_heads
        self.d_k = int(d_model / num_attention_heads)  # d_k = d_v (last dimension of each Q, K, V head)
        self.all_head_size = d_model  # total dimension across all heads
        self.key_layer = nn.Linear(input_size, d_model,
                                   bias=False)  # input_size: last dimension of the input; d_model: output dimension, split across the heads
self.query_layer = nn.Linear(input_size, d_model, bias=False)
self.value_layer = nn.Linear(input_size, d_model, bias=False)
self.Q = Q
self.K = K
self.V = V
def trans_to_multiple_heads(self, x):
new_size = x.size()[: -1] + (self.num_attention_heads, self.d_k)
x = x.view(new_size)
return x.permute(0, 2, 1, 3)
def forward(self, x):
if self.Q is None or self.K is None or self.V is None:
key = self.key_layer(x) # [batch_size, seq_len, d_model]
query = self.query_layer(x) # [batch_size, seq_len, d_model]
value = self.value_layer(x) # [batch_size, seq_len, d_model]
else:
key = self.K
query = self.Q
value = self.V
key_heads = self.trans_to_multiple_heads(
key) # [batch_size, num_attention_heads, seq_len, d_k]
query_heads = self.trans_to_multiple_heads(
query) # [batch_size, num_attention_heads, seq_len, d_k]
value_heads = self.trans_to_multiple_heads(
value) # [batch_size, num_attention_heads, seq_len, d_k]
attention_scores_before_normalize = torch.matmul(
query_heads, key_heads.permute(0, 1, 3, 2)) # [batch_size, num_attention_heads, seq_len, seq_len]
attention_scores = attention_scores_before_normalize / math.sqrt(self.d_k)
attention_probs = F.softmax(attention_scores, dim=-1)
        result = torch.matmul(attention_probs,
                              value_heads)  # [batch_size, num_attention_heads, seq_len, d_k], same shape as the per-head Q, K, V
result = result.permute(0, 2, 1, 3).contiguous() # [batch_size, seq_len, num_attention_heads, d_k]
new_size = result.size()[: -2] + (self.all_head_size,) # [batch_size, seq_len, d_model]
result = result.view(*new_size)
print("attention_scores(before normalize) shape: ", attention_scores_before_normalize.shape)
print("dk", self.d_k)
print("attention_scores(after normalize) shape: ", attention_scores.shape)
print("attention_probs shape(after softmax): ", attention_probs.shape)
print("result shape: ", result.shape)
return result
if __name__ == "__main__":
features = torch.rand((32, 20, 10))
attention = selfAttention(2, 10, 16)
result = attention.forward(features)
print(result.shape)
How it works
The basic structure of a self-attention layer is as follows:
1. First, each input X is encoded by an embedding layer (here a linear map w) to obtain its representation a = Xw:
batch_size = 1
seq_len = 3
x_size = 4
input_size = 4 # embedding size
hidden_size = 3 # (q,k,v) size
num_attention_heads = 1
X = torch.Tensor(
[[1,0,1,0], # X1
[0,2,0,2], # X2
[1,1,1,1]] # X3
) # [batch_size, seq_len, x_size]
x = X.view(batch_size, seq_len, x_size)
print("x: ", x)
w = torch.Tensor(
[[1, 0, 0, 0],
[0, 1, 0, 0],
[0, 0, 1, 0],
[0, 0, 0, 1], ]
) # [x_size, input_size]
w = w.view(x_size, input_size)
X = torch.matmul(X, w) # [batch_size, seq_len, input_size]
print("a: ", X)
x: tensor([[[1., 0., 1., 0.],
[0., 2., 0., 2.],
[1., 1., 1., 1.]]])
a: tensor([[1., 0., 1., 0.],
[0., 2., 0., 2.],
[1., 1., 1., 1.]])
2. The input features are then passed through three fully connected layers to obtain the Query, Key, and Value:
wq = torch.Tensor(
[[1, 0, 1],
[1, 0, 0],
[0, 0, 1],
[0, 1, 1],
]
)
wk = torch.Tensor(
[[0, 0, 1],
[1, 1, 0],
[0, 1, 0],
[1, 1, 0],
]
)
wv = torch.Tensor(
[[0, 2, 0],
[0, 3, 0],
[1, 0, 3],
[1, 1, 0],
]
) # [input_size, hidden_size]
wq = wq.view(input_size, hidden_size)
wk = wk.view(input_size, hidden_size)
wv = wv.view(input_size, hidden_size)
Q = torch.matmul(X, wq) # [batch_size, seq_len, hidden_size]
K = torch.matmul(X, wk) # [batch_size, seq_len, hidden_size]
V = torch.matmul(X, wv) # [batch_size, seq_len, hidden_size]
Q = Q.view(batch_size, seq_len, hidden_size)
K = K.view(batch_size, seq_len, hidden_size)
V = V.view(batch_size, seq_len, hidden_size)
print("Q: ", Q, "\nK: ", K, "\nV: ", V)
Q: tensor([[[1., 0., 2.],
[2., 2., 2.],
[2., 1., 3.]]])
K: tensor([[[0., 1., 1.],
[4., 4., 0.],
[2., 3., 1.]]])
V: tensor([[[1., 2., 3.],
[2., 8., 0.],
[2., 6., 3.]]])
3. Compute the attention. The attention matrix is computed from the Query and Key; there are several ways to do this, such as dot product or scaled dot product. The Value can be seen as an information extractor: each output is a weighted combination of the values according to the attention weights, i.e. how much of each feature gets extracted.
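Written out, the scaled dot-product form computed by the code below (with d_k the dimension of the keys, here hidden_size) is:

$$\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_k}}\right)V$$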
# attention
# Q k^T / sqrt(dk)
scores = torch.matmul(Q, K.transpose(-1, -2)) / math.sqrt(hidden_size) # [batch_size, seq_len, seq_len]
print("scores: \n", scores)
# softmax
attn_weights = F.softmax(scores, dim=-1) # [batch_size, seq_len, seq_len]
print("attn_weights: \n", attn_weights)
# weighted sum
attn_output = torch.matmul(attn_weights, V) # [batch_size, seq_len, hidden_size]
print("attn_output: \n", attn_output)
scores:
tensor([[[1.1547, 2.3094, 2.3094],
[2.3094, 9.2376, 6.9282],
[2.3094, 6.9282, 5.7735]]])
attn_weights:
tensor([[[1.3613e-01, 4.3194e-01, 4.3194e-01],
[8.9045e-04, 9.0884e-01, 9.0267e-02],
[7.4449e-03, 7.5471e-01, 2.3785e-01]]])
attn_output:
tensor([[[1.8639, 6.3194, 1.7042],
[1.9991, 7.8141, 0.2735],
[1.9926, 7.4796, 0.7359]]])
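Since the softmax is applied over the last dimension, each row of attn_weights is a probability distribution over the keys; a quick check on the tensors above:
# every row of the attention matrix should sum to 1 (up to floating-point rounding)
print(attn_weights.sum(dim=-1))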
4. Multi-head attention
In the self-attention above we only obtain a single attention matrix, so it can capture only one kind of relationship within the sequence. In sequential data, however, features usually relate to each other in more than one way, so we compute several attention matrices to capture more information; this is the multi-head attention mechanism (multi-heads).
Each head runs the attention above with its own projection matrices, and the head outputs are concatenated along the feature dimension:

$$\mathrm{MultiHead}(Q, K, V) = \mathrm{Concat}(\mathrm{head}_1, \dots, \mathrm{head}_h), \qquad \mathrm{head}_i = \mathrm{Attention}(QW_i^{Q}, KW_i^{K}, VW_i^{V})$$

(The standard Transformer additionally applies an output projection W^O to the concatenated heads; the implementation below omits it.)
# multi-head attention
class selfAttention(nn.Module):
def __init__(self, num_attention_heads, input_size, d_model, Q=None, K=None, V=None):
super(selfAttention, self).__init__()
if d_model % num_attention_heads != 0:
raise ValueError(
"the hidden size %d is not a multiple of the number of attention heads"
"%d" % (d_model, num_attention_heads)
)
self.num_attention_heads = num_attention_heads
        self.d_k = int(d_model / num_attention_heads)  # d_k = d_v (last dimension of each Q, K, V head)
        self.all_head_size = d_model  # total dimension across all heads
        self.key_layer = nn.Linear(input_size, d_model, bias=False)  # input_size: last dimension of the input; d_model: output dimension, split across the heads
self.query_layer = nn.Linear(input_size, d_model, bias=False)
self.value_layer = nn.Linear(input_size, d_model, bias=False)
self.Q = Q
self.K = K
self.V = V
def trans_to_multiple_heads(self, x):
new_size = x.size()[: -1] + (self.num_attention_heads, self.d_k)
x = x.view(new_size)
return x.permute(0, 2, 1, 3)
def forward(self, x):
if self.Q is None or self.K is None or self.V is None:
key = self.key_layer(x) # [batch_size, seq_len, d_model]
query = self.query_layer(x) # [batch_size, seq_len, d_model]
value = self.value_layer(x) # [batch_size, seq_len, d_model]
else:
key = self.K
query = self.Q
value = self.V
key_heads = self.trans_to_multiple_heads(
key) # [batch_size, num_attention_heads, seq_len, d_k]
query_heads = self.trans_to_multiple_heads(
query) # [batch_size, num_attention_heads, seq_len, d_k]
value_heads = self.trans_to_multiple_heads(
value) # [batch_size, num_attention_heads, seq_len, d_k]
attention_scores_before_normalize = torch.matmul(
query_heads, key_heads.permute(0, 1, 3, 2)) # [batch_size, num_attention_heads, seq_len, seq_len]
attention_scores = attention_scores_before_normalize / math.sqrt(self.d_k)
attention_probs = F.softmax(attention_scores, dim=-1)
        result = torch.matmul(attention_probs,
                              value_heads)  # [batch_size, num_attention_heads, seq_len, d_k], same shape as the per-head Q, K, V
result = result.permute(0, 2, 1, 3).contiguous() # [batch_size, seq_len, num_attention_heads, d_k]
new_size = result.size()[: -2] + (self.all_head_size,) # [batch_size, seq_len, d_model]
result = result.view(*new_size)
print("attention_scores(before normalize) shape: ", attention_scores_before_normalize.shape)
print("dk", self.d_k)
print("attention_scores(after normalize) shape: ", attention_scores.shape)
print("attention_probs shape(after softmax): ", attention_probs.shape)
print("result shape: ", result.shape)
return result
# test with a random [batch_size, seq_len, input_size] tensor
# the result should be a [batch_size, seq_len, d_model] tensor
x = torch.rand((32, 20, 10))
attention = selfAttention(2, 10, 16)
result = attention.forward(x)
print(result.shape) # [32, 20, 16]: [batch_size, seq_len, d_model]
attention_scores(before normalize) shape: torch.Size([32, 2, 20, 20])
dk 8
attention_scores(after normalize) shape: torch.Size([32, 2, 20, 20])
attention_probs shape(after softmax): torch.Size([32, 2, 20, 20])
result shape: torch.Size([32, 20, 16])
torch.Size([32, 20, 16])
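For a rough shape cross-check, PyTorch's built-in nn.MultiheadAttention can be run with a similar configuration (a minimal sketch, not a numerical equivalent of the selfAttention class above: the built-in layer adds bias terms and a final output projection, and it requires the input feature size to equal embed_dim, whereas selfAttention projects input_size to d_model internally):
import torch
import torch.nn as nn

mha = nn.MultiheadAttention(embed_dim=16, num_heads=2, batch_first=True)
x16 = torch.rand(32, 20, 16)  # the last dimension must equal embed_dim here
out, attn = mha(x16, x16, x16)
print(out.shape)   # torch.Size([32, 20, 16]), same shape as the custom layer's result
print(attn.shape)  # torch.Size([32, 20, 20]), attention weights averaged over heads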
Visualization
# apply self-attention to an 8*8*1 image
# split it into 4*4 patches: 16 pixels per patch, 4 patches in total
# 0, 0, 0, 0 | 0, 0, 0, 0 |
# 0, 1, 1, 0 | 1, 1, 1, 1 X1 | X2
# 0, 0, 0, 0 | 0, 0, 0, 0 |
# 0, 1, 1, 0 | 0, 1, 1, 0 |
# --------------------------- ------------------
# 0, 0, 0, 0 | 0, 0, 0, 0 |
# 0, 1, 1, 0 | 1, 1, 1, 1 X3 | X4
# 0, 0, 0, 0 | 0, 0, 0, 0 |
# 0, 1, 1, 0 | 0, 1, 1, 0 |
X = torch.Tensor(
[[0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0], # X1
[0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0], # X2
[0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0], # X3
[0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0]] # X4
) # [batch_size, seq_len, x_size]
batch_size = 1
seq_len = 4
x_size = 16
input_size = 16
d_model = 16
num_attention_heads = 1
x = X.view(batch_size, seq_len, x_size)
print("x: ", x)
# embedding
i = torch.eye(x_size)  # identity matrix, so the embedding leaves X unchanged
w = i
w = w.view(x_size, input_size)
X = torch.matmul(X, w) # [batch_size, seq_len, input_size]
print("a: ", X)
# QKV
i = torch.eye(d_model)  # identity matrix: wq = wk = wv = I, so Q = K = V = X
wq = i
wk = i
wv = i
wq = wq.view(input_size, d_model)
wk = wk.view(input_size, d_model)
wv = wv.view(input_size, d_model)
Q = torch.matmul(X, wq) # [batch_size, seq_len, d_model]
K = torch.matmul(X, wk) # [batch_size, seq_len, d_model]
V = torch.matmul(X, wv) # [batch_size, seq_len, d_model]
Q = Q.view(batch_size, seq_len, d_model)
K = K.view(batch_size, seq_len, d_model)
V = V.view(batch_size, seq_len, d_model)
print("Q: ", Q, "\nK: ", K, "\nV: ", V)
# self attention
attention = selfAttention(num_attention_heads=num_attention_heads,  # number of attention heads
                          input_size=input_size,  # last dimension of the input
                          d_model=d_model,  # output dimension, split across the heads
Q=Q, K=K, V=V
)
result = attention.forward(X)
print("result: ", result)
print("result shape: ", result.shape) # [batch_size, seq_len, d_model]
References
https://blog.csdn.net/weixin_53598445/article/details/125009686
https://blog.csdn.net/baidu_33000721/article/details/137880166
https://www.bilibili.com/video/BV1Rz4y1g7J7
https://zhuanlan.zhihu.com/p/420820453