Implementing Multi-Head Attention in Python

Complete Code

import torch
import torch.nn as nn
import torch.nn.functional as F
import math


# multi-head attention
class selfAttention(nn.Module):
    def __init__(self, num_attention_heads, input_size, d_model, Q=None, K=None, V=None):
        super(selfAttention, self).__init__()
        if d_model % num_attention_heads != 0:
            raise ValueError(
                "the hidden size %d is not a multiple of the number of attention heads %d"
                % (d_model, num_attention_heads)
            )

        self.num_attention_heads = num_attention_heads
        self.d_k = d_model // num_attention_heads  # d_k = d_v (last dimension of each per-head Q, K, V)
        self.all_head_size = d_model  # total dimension summed over all heads

        # input_size: last dimension of the input; d_model: output dimension, later split across the heads
        self.key_layer = nn.Linear(input_size, d_model, bias=False)
        self.query_layer = nn.Linear(input_size, d_model, bias=False)
        self.value_layer = nn.Linear(input_size, d_model, bias=False)

        self.Q = Q
        self.K = K
        self.V = V

    def trans_to_multiple_heads(self, x):
        # [batch_size, seq_len, d_model] -> [batch_size, num_attention_heads, seq_len, d_k]
        new_size = x.size()[:-1] + (self.num_attention_heads, self.d_k)
        x = x.view(new_size)
        return x.permute(0, 2, 1, 3)

    def forward(self, x):
        if self.Q is None or self.K is None or self.V is None:
            key = self.key_layer(x)      # [batch_size, seq_len, d_model]
            query = self.query_layer(x)  # [batch_size, seq_len, d_model]
            value = self.value_layer(x)  # [batch_size, seq_len, d_model]
        else:
            key = self.K
            query = self.Q
            value = self.V

        key_heads = self.trans_to_multiple_heads(key)      # [batch_size, num_attention_heads, seq_len, d_k]
        query_heads = self.trans_to_multiple_heads(query)  # [batch_size, num_attention_heads, seq_len, d_k]
        value_heads = self.trans_to_multiple_heads(value)  # [batch_size, num_attention_heads, seq_len, d_k]

        # Q K^T: [batch_size, num_attention_heads, seq_len, seq_len]
        attention_scores_before_normalize = torch.matmul(query_heads, key_heads.permute(0, 1, 3, 2))
        attention_scores = attention_scores_before_normalize / math.sqrt(self.d_k)

        attention_probs = F.softmax(attention_scores, dim=-1)

        # weighted sum: [batch_size, num_attention_heads, seq_len, d_k], same shape as the per-head Q, K, V
        result = torch.matmul(attention_probs, value_heads)
        result = result.permute(0, 2, 1, 3).contiguous()  # [batch_size, seq_len, num_attention_heads, d_k]
        new_size = result.size()[:-2] + (self.all_head_size,)  # [batch_size, seq_len, d_model]
        result = result.view(*new_size)

        print("attention_scores(before normalize) shape: ", attention_scores_before_normalize.shape)
        print("dk", self.d_k)
        print("attention_scores(after normalize) shape: ", attention_scores.shape)
        print("attention_probs shape(after softmax): ", attention_probs.shape)
        print("result shape: ", result.shape)
        return result


if __name__ == "__main__":
    # sanity check: random input [batch_size, seq_len, input_size] -> output [batch_size, seq_len, d_model]
    features = torch.rand((32, 20, 10))
    attention = selfAttention(2, 10, 16)
    result = attention(features)
    print(result.shape)  # torch.Size([32, 20, 16])

How It Works

The basic structure of a self-attention layer is as follows:

(figures: structure of the self-attention layer, shown step by step)

1. First, each input x is encoded by an Embedding layer, producing the embedded vectors a1, a2, a3, a4.

batch_size = 1
seq_len = 3
x_size = 4
input_size = 4         # embedding size
hidden_size = 3       # (q,k,v) size

num_attention_heads = 1

X = torch.Tensor(
    [[1,0,1,0],  # X1
     [0,2,0,2],  # X2
     [1,1,1,1]]  # X3
)  # [batch_size, seq_len, x_size]
x = X.view(batch_size, seq_len, x_size)
print("x: ", x)

w = torch.Tensor(
    [[1, 0, 0, 0],
     [0, 1, 0, 0],
     [0, 0, 1, 0],
     [0, 0, 0, 1], ]
)  # [x_size, input_size]
w = w.view(x_size, input_size)
X = torch.matmul(X, w)  # [batch_size, seq_len, input_size]
print("a: ", X)
x:  tensor([[[1., 0., 1., 0.],
         [0., 2., 0., 2.],
         [1., 1., 1., 1.]]])
a:  tensor([[1., 0., 1., 0.],
        [0., 2., 0., 2.],
        [1., 1., 1., 1.]])

2. The embedded features are then passed through three fully connected layers to obtain the Query, Key, and Value:

$q^i = W^q a^i$ (Query)

$k^i = W^k a^i$ (Key)

$v^i = W^v a^i$ (Value)

$W^q$, $W^k$, $W^v$ are randomly initialized and then learned as the network trains.
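In the selfAttention class from the complete code, these projections are learnable nn.Linear layers with bias=False; a minimal sketch of that correspondence (the random tensor and lowercase names here are only for illustration):

import torch
import torch.nn as nn

input_size, hidden_size = 4, 3
query_layer = nn.Linear(input_size, hidden_size, bias=False)  # its weight plays the role of Wq
key_layer = nn.Linear(input_size, hidden_size, bias=False)    # Wk
value_layer = nn.Linear(input_size, hidden_size, bias=False)  # Wv

a = torch.rand(1, 3, input_size)  # [batch_size, seq_len, input_size]
q = query_layer(a)                # a @ Wq.T per position, i.e. q^i = W^q a^i
k = key_layer(a)
v = value_layer(a)
print(q.shape, k.shape, v.shape)  # each torch.Size([1, 3, 3])

The hand-worked example below instead fixes wq, wk, wv to small integer matrices so the numbers are easy to verify by hand.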

wq = torch.Tensor(
    [[1, 0, 1],
     [1, 0, 0],
     [0, 0, 1],
     [0, 1, 1],
     ]
)
wk = torch.Tensor(
    [[0, 0, 1],
     [1, 1, 0],
     [0, 1, 0],
     [1, 1, 0],
     ]
)
wv = torch.Tensor(
    [[0, 2, 0],
     [0, 3, 0],
     [1, 0, 3],
     [1, 1, 0],
     ]
)  # [input_size, hidden_size]

wq = wq.view(input_size, hidden_size)
wk = wk.view(input_size, hidden_size)
wv = wv.view(input_size, hidden_size)

Q = torch.matmul(X, wq)  # [batch_size, seq_len, hidden_size]
K = torch.matmul(X, wk)  # [batch_size, seq_len, hidden_size]
V = torch.matmul(X, wv)  # [batch_size, seq_len, hidden_size]
Q = Q.view(batch_size, seq_len, hidden_size)
K = K.view(batch_size, seq_len, hidden_size)
V = V.view(batch_size, seq_len, hidden_size)
print("Q: ", Q, "\nK: ", K, "\nV: ", V)
Q:  tensor([[[1., 0., 2.],
         [2., 2., 2.],
         [2., 1., 3.]]]) 
K:  tensor([[[0., 1., 1.],
         [4., 4., 0.],
         [2., 3., 1.]]]) 
V:  tensor([[[1., 2., 3.],
         [2., 8., 0.],
         [2., 6., 3.]]])

3. Compute the attention. The attention matrix is computed from the Query and Key; there are several ways to do this, such as dot-product or scaled dot-product. The Value can be seen as an information extractor: for each token it extracts a value weighted by the attention, i.e. how much of each feature gets pulled out.

$\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\dfrac{QK^T}{\sqrt{d_k}}\right)V$

# attention
# Q k^T / sqrt(dk)
scores = torch.matmul(Q, K.transpose(-1, -2)) / math.sqrt(hidden_size)  # [batch_size, seq_len, seq_len]
print("scores: \n", scores)
# softmax
attn_weights = F.softmax(scores, dim=-1)  # [batch_size, seq_len, seq_len]
print("attn_weights: \n", attn_weights)
# weighted sum
attn_output = torch.matmul(attn_weights, V)  # [batch_size, seq_len, hidden_size]
print("attn_output: \n", attn_output)
scores: 
 tensor([[[1.1547, 2.3094, 2.3094],
         [2.3094, 9.2376, 6.9282],
         [2.3094, 6.9282, 5.7735]]])
attn_weights: 
 tensor([[[1.3613e-01, 4.3194e-01, 4.3194e-01],
         [8.9045e-04, 9.0884e-01, 9.0267e-02],
         [7.4449e-03, 7.5471e-01, 2.3785e-01]]])
attn_output: 
 tensor([[[1.8639, 6.3194, 1.7042],
         [1.9991, 7.8141, 0.2735],
         [1.9926, 7.4796, 0.7359]]])
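As a cross-check (not in the original post; requires PyTorch 2.0 or later), the built-in F.scaled_dot_product_attention computes the same scaled dot-product attention with its defaults (no mask, no dropout, scale 1/sqrt(d_k)), so it should reproduce attn_output from the block above:

import torch
import torch.nn.functional as F

attn_output_builtin = F.scaled_dot_product_attention(Q, K, V)  # uses the Q, K, V computed above
print(torch.allclose(attn_output_builtin, attn_output, atol=1e-5))  # expected: True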

4. Multi-head attention

In the self-attention above we end up with a single attention matrix, so it can only capture one kind of relationship between positions in the sequence. In sequential data, features usually have more than one kind of relationship, so we compute several attention matrices to capture more information. This is the multi-head attention mechanism (Multi-Head Attention).

$\mathrm{MultiHead}(Q, K, V) = \mathrm{Concat}(\mathrm{head}_1, \ldots, \mathrm{head}_h)\,W^O$

$\mathrm{head}_i = \mathrm{Attention}(Q W_i^Q,\; K W_i^K,\; V W_i^V)$

where $W_i^Q \in \mathbb{R}^{d_{\mathrm{model}} \times d_k}$, $W_i^K \in \mathbb{R}^{d_{\mathrm{model}} \times d_k}$, $W_i^V \in \mathbb{R}^{d_{\mathrm{model}} \times d_v}$, $W^O \in \mathbb{R}^{h d_v \times d_{\mathrm{model}}}$.

# multi-head attention
class selfAttention(nn.Module):
    def __init__(self, num_attention_heads, input_size, d_model, Q=None, K=None, V=None):
        super(selfAttention, self).__init__()
        if d_model % num_attention_heads != 0:
            raise ValueError(
                "the hidden size %d is not a multiple of the number of attention heads %d"
                % (d_model, num_attention_heads)
            )

        self.num_attention_heads = num_attention_heads
        self.d_k = d_model // num_attention_heads  # d_k = d_v (last dimension of each per-head Q, K, V)
        self.all_head_size = d_model  # total dimension summed over all heads

        # input_size: last dimension of the input; d_model: output dimension, later split across the heads
        self.key_layer = nn.Linear(input_size, d_model, bias=False)
        self.query_layer = nn.Linear(input_size, d_model, bias=False)
        self.value_layer = nn.Linear(input_size, d_model, bias=False)

        self.Q = Q
        self.K = K
        self.V = V

    def trans_to_multiple_heads(self, x):
        # [batch_size, seq_len, d_model] -> [batch_size, num_attention_heads, seq_len, d_k]
        new_size = x.size()[:-1] + (self.num_attention_heads, self.d_k)
        x = x.view(new_size)
        return x.permute(0, 2, 1, 3)

    def forward(self, x):
        if self.Q is None or self.K is None or self.V is None:
            key = self.key_layer(x)      # [batch_size, seq_len, d_model]
            query = self.query_layer(x)  # [batch_size, seq_len, d_model]
            value = self.value_layer(x)  # [batch_size, seq_len, d_model]
        else:
            key = self.K
            query = self.Q
            value = self.V

        key_heads = self.trans_to_multiple_heads(key)      # [batch_size, num_attention_heads, seq_len, d_k]
        query_heads = self.trans_to_multiple_heads(query)  # [batch_size, num_attention_heads, seq_len, d_k]
        value_heads = self.trans_to_multiple_heads(value)  # [batch_size, num_attention_heads, seq_len, d_k]

        # Q K^T: [batch_size, num_attention_heads, seq_len, seq_len]
        attention_scores_before_normalize = torch.matmul(query_heads, key_heads.permute(0, 1, 3, 2))
        attention_scores = attention_scores_before_normalize / math.sqrt(self.d_k)

        attention_probs = F.softmax(attention_scores, dim=-1)

        # weighted sum: [batch_size, num_attention_heads, seq_len, d_k], same shape as the per-head Q, K, V
        result = torch.matmul(attention_probs, value_heads)
        result = result.permute(0, 2, 1, 3).contiguous()  # [batch_size, seq_len, num_attention_heads, d_k]
        new_size = result.size()[:-2] + (self.all_head_size,)  # [batch_size, seq_len, d_model]
        result = result.view(*new_size)

        print("attention_scores(before normalize) shape: ", attention_scores_before_normalize.shape)
        print("dk", self.d_k)
        print("attention_scores(after normalize) shape: ", attention_scores.shape)
        print("attention_probs shape(after softmax): ", attention_probs.shape)
        print("result shape: ", result.shape)
        return result
    
# test with a random [batch_size, seq_len, input_size] tensor
# the result should be a [batch_size, seq_len, d_model] tensor
x = torch.rand((32, 20, 10))
attention = selfAttention(2, 10, 16)
result = attention(x)
print(result.shape)  # [32, 20, 16]: [batch_size, seq_len, d_model]
attention_scores(before normalize) shape:  torch.Size([32, 2, 20, 20])
dk 8
attention_scores(after normalize) shape:  torch.Size([32, 2, 20, 20])
attention_probs shape(after softmax):  torch.Size([32, 2, 20, 20])
result shape:  torch.Size([32, 20, 16])
torch.Size([32, 20, 16])
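One difference from the MultiHead formula above: the class stops after concatenating the heads and has no final $W^O$ projection. A minimal sketch of adding it (the class name selfAttentionWithOutputProj and the attribute output_layer are illustrative, not from the original code):

class selfAttentionWithOutputProj(selfAttention):
    def __init__(self, num_attention_heads, input_size, d_model, Q=None, K=None, V=None):
        super().__init__(num_attention_heads, input_size, d_model, Q, K, V)
        self.output_layer = nn.Linear(d_model, d_model, bias=False)  # W^O

    def forward(self, x):
        concat_heads = super().forward(x)        # [batch_size, seq_len, d_model], concatenated heads
        return self.output_layer(concat_heads)   # [batch_size, seq_len, d_model]


proj_attention = selfAttentionWithOutputProj(2, 10, 16)
print(proj_attention(torch.rand(32, 20, 10)).shape)  # torch.Size([32, 20, 16])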

Visualization

# apply self-attention to an 8*8*1 image
# split it into 4*4 patches: 16 pixels per patch, 4 patches in total
# 0, 0, 0, 0  |  0, 0, 0, 0              |
# 0, 1, 1, 0  |  1, 1, 1, 1        X1    |   X2
# 0, 0, 0, 0  |  0, 0, 0, 0              |
# 0, 1, 1, 0  |  0, 1, 1, 0              |
# ---------------------------   ------------------
# 0, 0, 0, 0  |  0, 0, 0, 0              |
# 0, 1, 1, 0  |  1, 1, 1, 1        X3    |   X4
# 0, 0, 0, 0  |  0, 0, 0, 0              |
# 0, 1, 1, 0  |  0, 1, 1, 0              |

X = torch.Tensor(
    [[0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0],  # X1
     [0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0],  # X2
     [0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0],  # X3
     [0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0]]  # X4
)  # [batch_size, seq_len, x_size]

batch_size = 1
seq_len = 4
x_size = 16
input_size = 16
d_model = 16
num_attention_heads = 1

x = X.view(batch_size, seq_len, x_size)
print("x: ", x)

# embedding
i = torch.eye(x_size)  # identity matrix
w = i
w = w.view(x_size, input_size)

X = torch.matmul(X, w)  # [batch_size, seq_len, input_size]
print("a: ", X)

# QKV
i = torch.eye(d_model)  # identity matrix
wq = i
wk = i
wv = i
wq = wq.view(input_size, d_model)
wk = wk.view(input_size, d_model)
wv = wv.view(input_size, d_model)

Q = torch.matmul(X, wq)  # [batch_size, seq_len, d_model]
K = torch.matmul(X, wk)  # [batch_size, seq_len, d_model]
V = torch.matmul(X, wv)  # [batch_size, seq_len, d_model]
Q = Q.view(batch_size, seq_len, d_model)
K = K.view(batch_size, seq_len, d_model)
V = V.view(batch_size, seq_len, d_model)
print("Q: ", Q, "\nK: ", K, "\nV: ", V)

# self attention
attention = selfAttention(num_attention_heads=num_attention_heads,  # number of attention heads
                          input_size=input_size,  # last dimension of the input vectors
                          d_model=d_model,  # output dimension, split across the heads
                          Q=Q, K=K, V=V
                          )
result = attention(X)
print("result: ", result)
print("result shape: ", result.shape)  # [batch_size, seq_len, d_model]

(figure: visualization of the self-attention result for the four patches)
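The selfAttention class only returns the weighted output, not the attention matrix itself. Since the embedding and the Q/K/V projections here are identity matrices and there is a single head (so d_k equals d_model), the attention weights between the four patches can be recomputed and plotted directly; a sketch (matplotlib and this recomputation are not part of the original code):

import matplotlib.pyplot as plt

# with identity projections, Q = K = X, so recompute the single-head attention weights
scores = torch.matmul(Q, K.transpose(-1, -2)) / math.sqrt(d_model)  # [1, 4, 4]
probs = F.softmax(scores, dim=-1)[0]                                # [4, 4]

plt.imshow(probs.numpy(), cmap="viridis")
plt.xticks(range(4), ["X1", "X2", "X3", "X4"])
plt.yticks(range(4), ["X1", "X2", "X3", "X4"])
plt.colorbar()
plt.title("attention weights between patches")
plt.show()

For example, the rows for X2 and X4 put more weight on each other, since those two patches share more nonzero pixels.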

References

https://blog.csdn.net/weixin_53598445/article/details/125009686
https://blog.csdn.net/baidu_33000721/article/details/137880166
https://www.bilibili.com/video/BV1Rz4y1g7J7
https://zhuanlan.zhihu.com/p/420820453
