Self-Attention Example in PyTorch

BERT模型入门系列(三):Self-Attention详解 - 知乎 (zhihu.com) (BERT primer series, part 3: Self-Attention explained)

I have read quite a few books and watched plenty of videos, and this article strikes me as the best introduction: concise, clear, and with easy-to-follow examples.

Why do we need the self-attention model? Because earlier sequence models have two weaknesses: 1. training speed is limited; 2. the ability to handle long text is weak.


Also worth reading: The Illustrated Transformer – Jay Alammar – Visualizing machine learning one concept at a time. (jalammar.github.io)

The computation proceeds in three steps:

1. Compute Q (the query vectors), K (the key vectors), and V (the value vectors).

2. Compute the attention weights; here the dot product is used as the attention scoring function.

3. Compute the output vector sequence as the attention-weighted sum of the value vectors (see the sketch right after this list).
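
In matrix form, the three steps above collapse into a few lines. A minimal sketch (the function name is mine; like the toy example in the original article, it uses the plain dot product, whereas the standard Transformer additionally divides the scores by sqrt(d_k) before the softmax):

import torch
import torch.nn.functional as F


def dot_product_attention(q, k, v):
    # step 2: dot-product score between every query and every key
    scores = torch.matmul(q, k.mT)       # shape [batch, seq_len, seq_len]
    # (a standard Transformer would divide scores by sqrt(d_k) here)
    weights = F.softmax(scores, dim=-1)  # each row sums to 1
    # step 3: attention-weighted sum of the value vectors
    return torch.matmul(weights, v)      # shape [batch, seq_len, hidden_size]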

 

For the detailed step-by-step walkthrough, see the original article: BERT模型入门系列(三):Self-Attention详解 - 知乎 (zhihu.com)

The code in the original article appears to be written in TensorFlow; below is a PyTorch version.

import torch
import torch.nn as nn
import torch.nn.functional as F


# https://blog.csdn.net/weixin_53598445/article/details/125009686
# https://zhuanlan.zhihu.com/p/345280272


class selfAttention(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(selfAttention, self).__init__()
        # three linear projections that map the input to keys, queries and values
        self.key_layer = nn.Linear(input_size, hidden_size)
        self.query_layer = nn.Linear(input_size, hidden_size)
        self.value_layer = nn.Linear(input_size, hidden_size)

    def forward(self, x, w_k, w_q, w_v):
        # load the given weights (nn.Linear stores weight as [out, in], hence the transpose);
        # they remain ordinary parameters and could still be learned afterwards
        self.key_layer.weight.data = w_k.mT
        self.key_layer.bias.data = torch.zeros_like(self.key_layer.bias)
        key = self.key_layer(x)
        self.query_layer.weight.data = w_q.mT
        self.query_layer.bias.data = torch.zeros_like(self.query_layer.bias)
        query = self.query_layer(x)
        self.value_layer.weight.data = w_v.mT
        self.value_layer.bias.data = torch.zeros_like(self.value_layer.bias)
        value = self.value_layer(x)
        print('key:\n', key)
        print('query:\n', query)
        print('value:\n', value)

        attention_scores = torch.matmul(query, key.mT)  # query times the transpose of key
        print('query @ key^T:\n', attention_scores)

        attention_softmax = F.softmax(attention_scores, dim=-1)  # softmax along the last dimension: each row sums to 1
        torch.set_printoptions(precision=2, sci_mode=False)  # control how many decimal places are printed
        print('attention weights:\n', attention_softmax)

        # each output vector is the attention-weighted sum of the three value vectors
        h1 = value[0][0] * attention_softmax[0][0][0] \
             + value[0][1] * attention_softmax[0][0][1] \
             + value[0][2] * attention_softmax[0][0][2]

        h2 = value[0][0] * attention_softmax[0][1][0] \
             + value[0][1] * attention_softmax[0][1][1] \
             + value[0][2] * attention_softmax[0][1][2]

        h3 = value[0][0] * attention_softmax[0][2][0] \
             + value[0][1] * attention_softmax[0][2][1] \
             + value[0][2] * attention_softmax[0][2][2]

        print('output vectors:')
        print(h1)
        print(h2)
        print(h3)

        return torch.stack([h1, h2, h3])  # shape [seq_len, hidden_size]: the output vector sequence


features = torch.tensor([[[1, 0, 1, 0],
                          [0, 2, 0, 2],
                          [1, 1, 1, 1]]], dtype=torch.float)

wk = torch.tensor([[0, 0, 1],
                   [1, 1, 0],
                   [0, 1, 0],
                   [1, 1, 0]], dtype=torch.float)

wq = torch.tensor([[1, 0, 1],
                   [1, 0, 0],
                   [0, 0, 1],
                   [0, 1, 1]], dtype=torch.float)
wv = torch.tensor([[0, 2, 0],
                   [0, 3, 0],
                   [1, 0, 3],
                   [1, 1, 0]], dtype=torch.float)

attention = selfAttention(4, 3)
attention(features, wk, wq, wv)  # calling the module invokes forward()

The run output below essentially matches the hand calculation illustrated in the original article (small differences come only from rounding/display precision).

key:
 tensor([[[0., 1., 1.],
         [4., 4., 0.],
         [2., 3., 1.]]], grad_fn=<AddBackward0>)
query:
 tensor([[[1., 0., 2.],
         [2., 2., 2.],
         [2., 1., 3.]]], grad_fn=<AddBackward0>)
value:
 tensor([[[1., 2., 3.],
         [2., 8., 0.],
         [2., 6., 3.]]], grad_fn=<AddBackward0>)
query @ key^T:
 tensor([[[ 2.,  4.,  4.],
         [ 4., 16., 12.],
         [ 4., 12., 10.]]], grad_fn=<UnsafeViewBackward0>)
attention weights:
 tensor([[[    0.06,     0.47,     0.47],
         [    0.00,     0.98,     0.02],
         [    0.00,     0.88,     0.12]]], grad_fn=<SoftmaxBackward0>)
output vectors:
tensor([1.94, 6.68, 1.60], grad_fn=<AddBackward0>)
tensor([2.00, 7.96, 0.05], grad_fn=<AddBackward0>)
tensor([2.00, 7.76, 0.36], grad_fn=<AddBackward0>)
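
As a cross-check, the three hand-written weighted sums h1, h2 and h3 can be collapsed into a single matrix multiplication. A small standalone sketch; the tensors below are copied from the rounded printout above, so the result only approximately reproduces the three output vectors:

import torch

# attention weights and value matrix copied from the (rounded) printout above
attention_softmax = torch.tensor([[[0.06, 0.47, 0.47],
                                   [0.00, 0.98, 0.02],
                                   [0.00, 0.88, 0.12]]])
value = torch.tensor([[[1., 2., 3.],
                       [2., 8., 0.],
                       [2., 6., 3.]]])

# one matmul does the same job as the three per-row weighted sums
outputs = torch.matmul(attention_softmax, value)
print(outputs)  # close to the three output vectors printed above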

 

Instead of loading fixed weights, wq, wk and wv can simply be left to nn.Linear's random initialization and learned during training. The code:

import torch
import torch.nn as nn
import torch.nn.functional as F


# https://blog.csdn.net/weixin_53598445/article/details/125009686
# https://zhuanlan.zhihu.com/p/345280272


class selfAttention(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(selfAttention, self).__init__()
        self.key_layer = nn.Linear(input_size, hidden_size)
        self.query_layer = nn.Linear(input_size, hidden_size)
        self.value_layer = nn.Linear(input_size, hidden_size)

    def forward(self, x):
        # here the projection weights keep nn.Linear's default random initialization;
        # training (backpropagation) would be needed to learn useful wq, wk, wv
        key = self.key_layer(x)
        query = self.query_layer(x)
        value = self.value_layer(x)
        print('key:\n', key)
        print('query:\n', query)
        print('value:\n', value)

        attention_scores = torch.matmul(query, key.mT)  # query times the transpose of key
        print('query @ key^T:\n', attention_scores)

        attention_softmax = F.softmax(attention_scores, dim=-1)  # softmax along the last dimension: each row sums to 1
        torch.set_printoptions(precision=2, sci_mode=False)  # control how many decimal places are printed
        print('attention weights:\n', attention_softmax)

        # each output vector is the attention-weighted sum of the three value vectors
        h1 = value[0][0] * attention_softmax[0][0][0] \
             + value[0][1] * attention_softmax[0][0][1] \
             + value[0][2] * attention_softmax[0][0][2]

        h2 = value[0][0] * attention_softmax[0][1][0] \
             + value[0][1] * attention_softmax[0][1][1] \
             + value[0][2] * attention_softmax[0][1][2]

        h3 = value[0][0] * attention_softmax[0][2][0] \
             + value[0][1] * attention_softmax[0][2][1] \
             + value[0][2] * attention_softmax[0][2][2]

        print('output vectors:')
        print(h1)
        print(h2)
        print(h3)

        return torch.stack([h1, h2, h3])  # shape [seq_len, hidden_size]: the output vector sequence


features = torch.tensor([[[1, 0, 1, 0],
                          [0, 2, 0, 2],
                          [1, 1, 1, 1]]], dtype=torch.float)

attention = selfAttention(4, 3)
attention(features)  # calling the module invokes forward()

Output:

key:
 tensor([[[ 0.0680,  0.2645,  0.0556],
         [-1.2327, -0.1178,  0.3482],
         [-0.6597,  0.1382,  0.1653]]], grad_fn=<AddBackward0>)
query:
 tensor([[[-0.0121, -0.0466, -0.2353],
         [-0.2424,  0.3289, -0.2127],
         [-0.1471,  0.0114, -0.2089]]], grad_fn=<AddBackward0>)
value:
 tensor([[[-0.3317, -0.3424, -0.6434],
         [ 0.4560,  1.2522, -1.7553],
         [-0.0401,  0.1366, -1.2749]]], grad_fn=<AddBackward0>)
query @ key^T:
 tensor([[[-0.0262, -0.0615, -0.0373],
         [ 0.0587,  0.1860,  0.1702],
         [-0.0186,  0.1072,  0.0641]]], grad_fn=<UnsafeViewBackward0>)
attention weights:
 tensor([[[0.34, 0.33, 0.33],
         [0.31, 0.35, 0.34],
         [0.31, 0.35, 0.34]]], grad_fn=<SoftmaxBackward0>)
output vectors:
tensor([ 0.02,  0.34, -1.22], grad_fn=<AddBackward0>)
tensor([ 0.04,  0.38, -1.25], grad_fn=<AddBackward0>)
tensor([ 0.04,  0.38, -1.25], grad_fn=<AddBackward0>)

Process finished with exit code 0
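
With random weights the attention ends up almost uniform, which is expected: nothing in this snippet actually trains the projections. For completeness, a hypothetical sketch of how wq, wk and wv would be learned in practice, reusing the selfAttention class and the features tensor defined above; the zero target and MSE loss are made up purely for illustration:

import torch
import torch.nn.functional as F

model = selfAttention(4, 3)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

target = torch.zeros(3, 3)    # dummy regression target, same shape as the stacked output
for step in range(5):         # note: the print statements inside forward() fire on every step
    optimizer.zero_grad()
    out = model(features)     # forward() returns the stacked output vectors h1, h2, h3
    loss = F.mse_loss(out, target)
    loss.backward()
    optimizer.step()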

posted on 2022-06-22 22:55  HBU_DAVID
