Transformer Architecture in Detail with a Keras Implementation
This article is largely organized from the Transformer model tutorial in the official TensorFlow documentation. To read the original, see:
https://www.tensorflow.org/tutorials/text/transformer
Transformer model structure
Like a typical seq2seq model, the Transformer consists of two parts: an encoder and a decoder. Structurally, it abandons the RNN and CNN building blocks entirely and constructs the whole network with self-attention.
Positional encoding
Unlike an RNN, which carries position information inherently, the Transformer has to inject the position of each time step as an additional input. The positional encoding is generated by alternating sin and cos functions:
PE_{(pos,2i)} = \sin \frac{pos}{10000^{\frac{2i}{d_{model}}}}
PE_{(pos,2i+1)} = \cos \frac{pos}{10000^{\frac{2i}{d_{model}}}}
import numpy as np
import tensorflow as tf

# positional encoding: even dimensions use sin, odd dimensions use cos
def positional_embedding(maxlen, model_size):
    PE = np.zeros((maxlen, model_size))
    for i in range(maxlen):          # position
        for j in range(model_size):  # dimension
            if j % 2 == 0:
                PE[i, j] = np.sin(i / 10000 ** (j / model_size))
            else:
                PE[i, j] = np.cos(i / 10000 ** ((j - 1) / model_size))
    PE = tf.constant(PE, dtype=tf.float32)
    return PE
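A quick sanity check of this helper, as an illustrative sketch (the maxlen and model_size values below are arbitrary, chosen only for demonstration):

# Illustrative check: the encoding matrix has shape (maxlen, model_size)
pe = positional_embedding(maxlen=4, model_size=8)
print(pe.shape)       # (4, 8)
print(pe[0].numpy())  # position 0: sin(0)=0 in the even dims, cos(0)=1 in the odd dims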
For a detailed explanation of positional encoding, see:
如何理解Transformer论文中的positional encoding,和三角函数有什么关系?
Multi-head attention
An attention function can be viewed as mapping a query vector and a set of key-value vector pairs to an output vector. The output is a weighted sum of the value vectors, where the weight assigned to each value is computed from the query and the corresponding key. Attention can be written as:
att\_output = Attention(Q,K,V) = softmax(\frac{QK^T}{\sqrt{d_k}})V
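As a minimal illustrative sketch (this standalone helper is not part of the tutorial code; the MultiHeadAttention class below inlines the same computation), the formula maps directly to a few lines of TensorFlow:

# Illustrative sketch of scaled dot-product attention
def scaled_dot_product_attention(q, k, v, mask=None):
    # q: (..., len_q, d_k), k: (..., len_k, d_k), v: (..., len_k, d_v)
    matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., len_q, len_k)
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    score = matmul_qk / tf.math.sqrt(dk)           # scale by sqrt(d_k)
    if mask is not None:
        score += (1 - mask) * -1e9                 # mask out padded / future positions
    alpha = tf.nn.softmax(score, axis=-1)          # attention weights
    return tf.matmul(alpha, v)                     # weighted sum of the values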
Multi-head attention projects Q, K and V with h different learned linear transformations, computes attention for each projection, and concatenates the results:
MultiHead(Q,K,V) = Concat(head_1,...,head_h)W^o
head_i = Attention(QW_i^Q, KW_i^K, VW_i^V)
In self-attention, Q, K and V are the same. Multi-head attention allows the model to jointly attend to different feature representations at different positions.
import tensorflow as tf
from tensorflow import keras

class MultiHeadAttention(keras.Model):
    def __init__(self, model_size, num_heads, **kwargs):
        super(MultiHeadAttention, self).__init__(**kwargs)
        self.model_size = model_size
        self.num_heads = num_heads
        self.head_size = model_size // num_heads
        self.WQ = keras.layers.Dense(model_size, name="dense_query")
        self.WK = keras.layers.Dense(model_size, name="dense_key")
        self.WV = keras.layers.Dense(model_size, name="dense_value")
        self.dense = keras.layers.Dense(model_size)

    def call(self, query, key, value, mask):
        # query: (batch, maxlen, model_size)
        # key  : (batch, maxlen, model_size)
        # value: (batch, maxlen, model_size)
        batch_size = tf.shape(query)[0]

        # linear projections, shape: (batch, maxlen, model_size)
        query = self.WQ(query)
        key = self.WK(key)
        value = self.WV(value)

        def _split_heads(x):
            x = tf.reshape(x, shape=[batch_size, -1, self.num_heads, self.head_size])
            return tf.transpose(x, perm=[0, 2, 1, 3])

        # shape: (batch, num_heads, maxlen, head_size)
        query = _split_heads(query)
        key = _split_heads(key)
        value = _split_heads(value)

        # shape: (batch, num_heads, maxlen, maxlen)
        matmul_qk = tf.matmul(query, key, transpose_b=True)
        # scale matmul_qk by sqrt(d_k)
        dk = tf.cast(query.shape[-1], tf.float32)
        score = matmul_qk / tf.math.sqrt(dk)
        if mask is not None:
            # mask: 1 = keep, 0 = mask out; shape broadcastable to (batch, num_heads, maxlen, maxlen)
            score += (1 - mask) * -1e9
        alpha = tf.nn.softmax(score)
        context = tf.matmul(alpha, value)

        # merge the heads back: (batch, maxlen, model_size)
        context = tf.transpose(context, perm=[0, 2, 1, 3])
        context = tf.reshape(context, (batch_size, -1, self.model_size))
        output = self.dense(context)
        return output
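A shape-only usage sketch (the sizes are arbitrary; mask handling is covered later):

# Hypothetical usage: self-attention over a batch of 2 sequences of length 5
mha = MultiHeadAttention(model_size=64, num_heads=8)
x = tf.random.uniform((2, 5, 64))
out = mha(x, x, x, None)
print(out.shape)  # (2, 5, 64)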
Encoder
The encoder consists of N=6 identical layers; here a "layer" refers to the left half of the Transformer architecture diagram (the encoder stack).
Each layer has two sub-layers: multi-head self-attention and a fully connected feed-forward network. The feed-forward network gives the model its non-linear transformation.
Each sub-layer is wrapped with a residual connection and layer normalization:
sublayer\_output = LayerNorm(x + SubLayer(x))
So the encoder is made up of three parts:
- Input layer: token embedding + position embedding
- Multi-head attention: multi-head self-attention + add & norm (residual connection + layer normalization)
- Feed-forward layer: position-wise feed-forward network + add & norm
position-wise feed-forward network
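This sub-layer applies two linear transformations with a ReLU in between, as in the original paper:

FFN(x) = \max(0, xW_1 + b_1)W_2 + b_2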
# position-wise feed-forward network
class FeedForwardNetwork(keras.Model):
    def __init__(self, dff_size, model_size):
        super(FeedForwardNetwork, self).__init__()
        self.dense1 = keras.layers.Dense(dff_size, activation="relu")
        self.dense2 = keras.layers.Dense(model_size)

    def call(self, x):
        x = self.dense1(x)
        x = self.dense2(x)
        return x
Encoder Layer
# Encoder layer: multi-head self-attention + feed-forward, each with add & norm
class EncoderLayer(keras.layers.Layer):
    def __init__(self, model_size, num_heads, dff_size, rate=0.1):
        super(EncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(model_size, num_heads)
        self.ffn = FeedForwardNetwork(dff_size, model_size)
        # layer normalization
        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        # multi-head self-attention
        attn_output = self.attention(x, x, x, mask)
        attn_output = self.dropout1(attn_output, training=training)
        # residual connection + layer norm
        out1 = self.layernorm1(x + attn_output)
        # feed-forward network
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        # residual connection + layer norm
        out2 = self.layernorm2(out1 + ffn_output)
        return out2
Multi-layer Encoder
# Multi-layer encoder: embedding + positional encoding, then a stack of encoder layers
class Encoder(keras.Model):
    def __init__(self, num_layers, model_size, num_heads, dff_size, vocab_size, maxlen, rate=0.1):
        super(Encoder, self).__init__()
        self.model_size = model_size
        self.num_layers = num_layers
        self.embedding = keras.layers.Embedding(vocab_size, model_size)
        self.pos_embedding = positional_embedding(maxlen, model_size)
        self.encoder_layers = [EncoderLayer(model_size, num_heads, dff_size, rate) for _ in range(num_layers)]
        self.dropout = keras.layers.Dropout(rate)

    def call(self, x, training, padding_mask):
        # input embedding + positional embedding
        x = self.embedding(x) + self.pos_embedding
        x = self.dropout(x, training=training)
        for i in range(self.num_layers):
            x = self.encoder_layers[i](x, training, padding_mask)
        return x
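A quick sketch of running the encoder on dummy token ids (all sizes below are arbitrary test values):

# Hypothetical check: the encoder output keeps shape (batch, maxlen, model_size)
sample_encoder = Encoder(num_layers=2, model_size=64, num_heads=8,
                         dff_size=256, vocab_size=1000, maxlen=10)
tokens = tf.random.uniform((2, 10), minval=1, maxval=1000, dtype=tf.int32)
enc_out = sample_encoder(tokens, training=False, padding_mask=None)
print(enc_out.shape)  # (2, 10, 64)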
Decoder
The decoder is similar to the encoder, with one extra attention sub-layer: masked multi-head attention over the decoder input. So the decoder is made up of four parts:
- Input layer: token embedding + position embedding
- Masked multi-head layer: masked multi-head self-attention (look-ahead mask) + add & norm
- Multi-head attention layer: multi-head context attention (encoder-decoder attention) + add & norm
- Feed-forward layer: position-wise feed-forward network + add & norm
Decoder Layer
# Decoder layer: masked self-attention + encoder-decoder attention + feed-forward
class DecoderLayer(keras.layers.Layer):
    def __init__(self, model_size, num_heads, dff_size, rate=0.1):
        super(DecoderLayer, self).__init__()
        self.mask_attention = MultiHeadAttention(model_size, num_heads)
        self.attention = MultiHeadAttention(model_size, num_heads)
        self.ffn = FeedForwardNetwork(dff_size, model_size)
        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)
        self.dropout3 = keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        # masked multi-head self-attention (the look-ahead mask blocks future positions)
        attn_decoder = self.mask_attention(x, x, x, look_ahead_mask)
        attn_decoder = self.dropout1(attn_decoder, training=training)
        out1 = self.layernorm1(x + attn_decoder)
        # encoder-decoder attention: queries from the decoder, keys/values from the encoder output
        attn_encoder_decoder = self.attention(out1, enc_output, enc_output, padding_mask)
        attn_encoder_decoder = self.dropout2(attn_encoder_decoder, training=training)
        out2 = self.layernorm2(out1 + attn_encoder_decoder)
        # feed-forward network
        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(out2 + ffn_output)
        return out3
Multi-layer Decoder
# Multi-layer decoder: embedding + positional encoding, then a stack of decoder layers
class Decoder(keras.Model):
    def __init__(self, num_layers, model_size, num_heads, dff_size, vocab_size, maxlen, rate=0.1):
        super(Decoder, self).__init__()
        self.model_size = model_size
        self.num_layers = num_layers
        self.embedding = keras.layers.Embedding(vocab_size, model_size)
        self.pos_embedding = positional_embedding(maxlen, model_size)
        self.decoder_layers = [DecoderLayer(model_size, num_heads, dff_size, rate) for _ in range(num_layers)]
        self.dropout = keras.layers.Dropout(rate)

    def call(self, enc_output, x, training, look_ahead_mask, padding_mask):
        # input embedding + positional embedding
        x = self.embedding(x) + self.pos_embedding
        x = self.dropout(x, training=training)
        for i in range(self.num_layers):
            x = self.decoder_layers[i](x, enc_output, training, look_ahead_mask, padding_mask)
        return x
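And a matching sketch for the decoder, reusing enc_out from the encoder check above (again, all sizes are arbitrary):

# Hypothetical check: the decoder attends to its own input and to the encoder output
sample_decoder = Decoder(num_layers=2, model_size=64, num_heads=8,
                         dff_size=256, vocab_size=1000, maxlen=10)
target_tokens = tf.random.uniform((2, 10), minval=1, maxval=1000, dtype=tf.int32)
dec_out = sample_decoder(enc_out, target_tokens, training=False,
                         look_ahead_mask=None, padding_mask=None)
print(dec_out.shape)  # (2, 10, 64)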
Transformer
The model inputs require masking: a padding mask for the padded positions and a decoder (look-ahead) mask.
# padding mask: 1 for real tokens, 0 for padding (token id 0)
def padding_mask(seq):
    mask = tf.cast(tf.math.not_equal(seq, 0), dtype=tf.float32)
    # add extra dimensions so the mask broadcasts over heads and query positions
    mask = mask[:, tf.newaxis, tf.newaxis, :]
    return mask
'''
-------------------
>> inputs = tf.constant([[1,2,3,0],[4,5,0,0]])
>> mask = padding_mask(inputs)
tf.Tensor(
[[[[1. 1. 1. 0.]]]
 [[[1. 1. 0. 0.]]]], shape=(2, 1, 1, 4), dtype=float32)
-------------------
'''
# decoder (look-ahead) mask: lower-triangular, so position i can only attend to positions <= i
def look_ahead_mask(size):
    ahead_mask = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    ahead_mask = tf.cast(ahead_mask, dtype=tf.float32)
    return ahead_mask
'''
-------------------
>> inputs = tf.constant(4)
>> mask = look_ahead_mask(inputs)
<tf.Tensor: shape=(4, 4), dtype=float32, numpy=
array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]], dtype=float32)>
-------------------
'''
def create_mask(inp, tar):
    # padding mask for the encoder self-attention (built from the encoder input)
    enc_padding_mask = padding_mask(inp)
    # padding mask for the decoder's 2nd attention block; it masks the encoder output,
    # so it is also built from the encoder input (as in the TensorFlow tutorial)
    dec_padding_mask = padding_mask(inp)
    # decoder self-attention: combine the target padding mask with the look-ahead mask
    ahead_mask = look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = padding_mask(tar)
    combined_mask = tf.minimum(dec_target_padding_mask, ahead_mask)
    return enc_padding_mask, dec_padding_mask, combined_mask
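For example (hypothetical padded batches, following the padding_mask example above):

# Hypothetical example: mask shapes for a batch of 2 padded sequences
inp = tf.constant([[1, 2, 3, 0], [4, 5, 0, 0]])
tar = tf.constant([[6, 7, 0, 0], [8, 9, 1, 0]])
enc_padding_mask, dec_padding_mask, combined_mask = create_mask(inp, tar)
print(enc_padding_mask.shape)  # (2, 1, 1, 4)
print(dec_padding_mask.shape)  # (2, 1, 1, 4), used in encoder-decoder attention
print(combined_mask.shape)     # (2, 1, 4, 4): target padding mask AND look-ahead mask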
Combining the encoder and decoder into the Transformer
# Combine the encoder and decoder into the Transformer
class Transformer(keras.Model):
    def __init__(self, num_layers, model_size, num_heads, dff_size, vocab_size, maxlen, training=True, rate=0.1):
        super(Transformer, self).__init__()
        self.training = training
        self.encoder = Encoder(num_layers, model_size, num_heads, dff_size, vocab_size, maxlen, rate)
        self.decoder = Decoder(num_layers, model_size, num_heads, dff_size, vocab_size, maxlen, rate)
        self.final_dense = keras.layers.Dense(vocab_size, name="final_output")

    def call(self, all_inputs):
        sources, targets = all_inputs
        # build the padding and look-ahead masks from the raw token ids
        enc_padding_mask, dec_padding_mask, combined_mask = create_mask(sources, targets)
        enc_output = self.encoder(sources, self.training, enc_padding_mask)
        dec_output = self.decoder(enc_output, targets, self.training, combined_mask, dec_padding_mask)
        # project the decoder output to vocabulary logits
        final_output = self.final_dense(dec_output)
        return final_output
Building and testing the model
num_layers = 4
model_size = 768
num_heads = 12
dff_size = 1024
maxlen = 10
vocab_size = 10000

enc_inputs = keras.layers.Input(shape=(maxlen,), name="enc_input")
dec_inputs = keras.layers.Input(shape=(maxlen,), name="dec_input")

transformer = Transformer(num_layers=num_layers,
                          model_size=model_size,
                          num_heads=num_heads,
                          dff_size=dff_size,
                          vocab_size=vocab_size,
                          maxlen=maxlen)
final_output = transformer([enc_inputs, dec_inputs])
model = keras.models.Model(inputs=[enc_inputs, dec_inputs], outputs=final_output)
print(model.summary())
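Training is beyond the scope of this post, but as a minimal sketch the model could be compiled with a masked loss (following the masked-loss idea in the TensorFlow tutorial; a plain Adam optimizer is used here instead of the tutorial's custom learning-rate schedule, and the variable names in the fit call are placeholders):

# Minimal training sketch (assumption: targets are token ids, padded with 0)
loss_object = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")

def masked_loss(y_true, y_pred):
    # ignore the loss on padding positions (token id 0)
    mask = tf.cast(tf.math.not_equal(y_true, 0), tf.float32)
    loss = loss_object(y_true, y_pred)
    return tf.reduce_sum(loss * mask) / tf.reduce_sum(mask)

model.compile(optimizer=keras.optimizers.Adam(1e-4), loss=masked_loss)
# model.fit([enc_tokens, dec_input_tokens], dec_target_tokens, ...)  # hypothetical data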
Applications and more
Machine translation, text generation, recommender systems... to be updated.
References
[1] Transformer模型原理详解
[2] Attention机制详解(二)——Self-Attention与Transformer
[3] Attention Is All You Need
[4] 理解语言的 Transformer 模型