transformer结构-position_encoding层

1 完整代码

import math

import torch
import torch.nn as nn

class PositionEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    The encoding table is computed once in ``__init__`` and stored as the
    buffer ``pe`` with shape ``(1, max_len, d_model)``:

        pe[0, pos, 2i]     = sin(pos / 10000 ** (2i / d_model))
        pe[0, pos, 2i + 1] = cos(pos / 10000 ** (2i / d_model))
    """

    def __init__(self, d_model, dropout, max_len):
        """Build the encoding table.

        Args:
            d_model: Embedding dimension (size of the last axis of the
                input). Both even and odd values are supported.
            dropout: Drop probability handed to ``nn.Dropout``.
            max_len: Number of positions to precompute, i.e. the longest
                sequence length ``forward`` can slice out of the table.
        """
        super(PositionEncoding, self).__init__()
        self.drop = nn.Dropout(dropout)

        # Position indices 0 .. max_len - 1 as a column, shape (max_len, 1).
        # Only needed while building the table; it stays an attribute
        # because earlier versions of this class exposed it.
        self.abs_pe = torch.arange(0, max_len)[:, None]

        # Frequencies 1 / 10000 ** (2i / d_model) for every even index 2i,
        # evaluated as exp(2i * -ln(10000) / d_model).
        tha = -(math.log(10000.0) / d_model)
        div_term = torch.exp(torch.arange(0, d_model, 2) * tha)

        # Broadcasting (max_len, 1) * (ceil(d_model / 2),) yields one angle
        # per (position, frequency) pair.
        angles = self.abs_pe * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even
        # column, so the cosine half is trimmed to fit.
        pe[:, 1::2] = torch.cos(angles)[:, : d_model // 2]

        # A leading axis of size 1 lets the table broadcast against inputs
        # laid out as (batch, seq_len, d_model).
        pe = pe.unsqueeze(0)

        # A buffer is not a parameter, so the optimizer never updates it,
        # but it is saved and loaded with the module's state_dict.
        self.register_buffer("pe", pe)

    def forward(self, x):
        """Add the position encoding to ``x`` and apply dropout.

        Args:
            x: Embedding tensor whose axis 1 is the sequence axis and whose
                last axis has size ``d_model``; the demo in this file
                passes a ``(batch, seq_len, d_model)`` tensor.

        Returns:
            A tensor with the same shape as ``x``.
        """
        # Keep only the first x.size(1) positions; the size-1 leading axis
        # broadcasts over the batch.
        x = x + self.pe[:, :x.size(1)]
        return self.drop(x)


# Demo: run a random batch of 3 sequences, each 100 tokens long with
# 300-dimensional embeddings, through the position-encoding layer.
d_model = 300
dropout = 0
max_len = 5000
pe = PositionEncoding(d_model, dropout, max_len)

embedding = torch.randn(3, 100, 300, requires_grad=True)
# Show the input shape and a small slice of its values.
print(embedding.shape, embedding[1, 10:15, 3:6])

out = pe(embedding)
print("out === ", out.shape)

positionencoding

2 细节总结

2.1 形状为 3*1 的张量可以与一维张量（例如 shape 为 6）相乘：广播时一维张量被视为 1*6，结果形状为 3*6

　　3*1 张量 * 1*6 张量

>>> import torch
>>> a=torch.tensor([[1],[2],[3]])
>>> a.shape
torch.Size([3, 1])
>>> c = torch.arange(0,6)
>>> c
tensor([0, 1, 2, 3, 4, 5])
>>> c.shape
torch.Size([6])
>>> a*c
tensor([[ 0,  1,  2,  3,  4,  5],
        [ 0,  2,  4,  6,  8, 10],
        [ 0,  3,  6,  9, 12, 15]])

相乘结果

2.2 2*3*1 张量与 1*3*1 张量可以相加（2*3*7 与 1*3*7 同理）

　　相加为广播

>>> a=torch.tensor([[[1],[2],[3]],[[1],[2],[3]]])
>>> a
tensor([[[1],
         [2],
         [3]],

        [[1],
         [2],
         [3]]])
>>> a.shape
torch.Size([2, 3, 1])
>>> a[:,:1]
tensor([[[1]],

        [[1]]])
>>> aa=a[:,:1]
>>> aa.shape
torch.Size([2, 1, 1])
>>> a
tensor([[[1],
         [2],
         [3]],

        [[1],
         [2],
         [3]]])
>>> a.shape
torch.Size([2, 3, 1])
>>> c = torch.ones(1,3)
>>> c
tensor([[1., 1., 1.]])
>>> c.shape
torch.Size([1, 3])
>>> c = c[..., None]
>>> c.shape
torch.Size([1, 3, 1])
>>> a+c
tensor([[[2.],
         [3.],
         [4.]],

        [[2.],
         [3.],
         [4.]]])

相加

2.3 register_buffer

作用：把不需要梯度更新、但需要随模型一起保存的张量（例如这里的位置编码矩阵 pe）注册为模型的 buffer。buffer 不是模型参数，不会被优化器更新，但会随模型参数一起保存和加载。

　　　　注册之后，在 forward 中可以直接通过 self.pe 访问。

2.4 position encoding与embedding关系

　　两个相加计算。

2.5 position encoding

引入的公式：

　　在Transformer模型中，位置编码（Positional Encoding）是为了在没有序列顺序信息的注意力机制中引入位置信息。这样模型就可以根据位置来更好地理解输入序列中各个词之间的关系。

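　　在Transformer的位置编码中，通常使用正弦和余弦函数来表示不同位置的位置编码。具体来说，位置编码的公式如下：

$$PE_{(pos,\,2i)} = \sin\left(\frac{pos}{10000^{2i/d_{model}}}\right)$$

$$PE_{(pos,\,2i+1)} = \cos\left(\frac{pos}{10000^{2i/d_{model}}}\right)$$

　　其中 pos 是词在句子中的位置，i 是维度索引，d_model 是词嵌入维度。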

通过这个公式，可以为每个位置分配一个唯一的位置编码，这样模型就能够通过位置编码更好地理解输入序列中不同位置的词之间的关系。

posted on 2024-04-05 13:28 lexn 阅读(97) 评论(0) 编辑收藏举报

刷新页面返回顶部