【文本摘要项目】6-性能提升之Transformer模型
背景
前一篇文章中,采用了在预训练模型出现之前,比较经典的一款模型PGN,本文基于上一篇文章的内容,继续对模型表现性能进行提升。本篇采用的提升模型是Transformer模型。其原理部分,已在其他文章介绍过,本文重在其代码实现部分。
核心内容
整体流程
整个项目的大体流程,如数据加载、训练流程、测试流程等结构,和前面的模型介绍基本相同,因此本文着重介绍Transformer模型的各个细节部分的编码实现,如Embedding、Encoder/Decoder,及Encoder中self-Attention、Residual Network、Layer Norm、FFN等,简要介绍Decoder中相似结构。
整体模型
先搭建模型的大体框架:Encoder、Decoder、以及输出层。
class TRANSFORMER(tf.keras.Model):
    """Encoder-decoder Transformer used for text summarisation.

    The model wires together an ``Encoder``, a ``Decoder`` and a final dense
    projection onto the vocabulary.

    Args:
        params: dict-like configuration. The keys read here are
            ``num_blocks``, ``batch_size``, ``vocab_size``, ``num_heads``,
            ``d_model``, ``dff`` and ``dropout_rate``.
    """

    def __init__(self, params):
        # The original referenced an undefined class name (PGN_TRANSFORMER)
        # here, which raised NameError as soon as the model was instantiated.
        super(TRANSFORMER, self).__init__()
        self.num_blocks = params["num_blocks"]
        self.batch_size = params["batch_size"]
        self.vocab_size = params["vocab_size"]
        self.num_heads = params["num_heads"]
        # Encoder stack.
        self.encoder = Encoder(params["num_blocks"],
                               params["d_model"],
                               params["num_heads"],
                               params["dff"],
                               params["vocab_size"],
                               params["dropout_rate"])
        # Decoder stack.
        self.decoder = Decoder(params["num_blocks"],
                               params["d_model"],
                               params["num_heads"],
                               params["dff"],
                               params["vocab_size"],
                               params["dropout_rate"])
        # Output projection onto the vocabulary.
        self.final_layer = tf.keras.layers.Dense(params["vocab_size"])

    def call(self, inp, extended_inp, max_oov_len, tar, training,
             enc_padding_mask, look_ahead_mask, dec_padding_mask):
        """Run a full encoder/decoder forward pass.

        Args:
            inp: encoder input token ids.
            extended_inp: accepted for interface compatibility; not used here.
            max_oov_len: accepted for interface compatibility; not used here.
            tar: decoder input token ids.
            training: forwarded to the encoder and decoder (dropout switch).
            enc_padding_mask: mask passed to the encoder self-attention.
            look_ahead_mask: mask passed to the decoder self-attention.
            dec_padding_mask: mask passed to the encoder-decoder attention.

        Returns:
            dict with two entries:
              * ``"logits"``: output of the final dense layer *after* softmax,
                i.e. probabilities rather than raw logits (the key name is
                kept for callers); expected shape
                (batch_size, tar_seq_len, vocab_size).
              * ``"attentions"``: encoder-decoder attention weights of the
                last decoder layer, averaged over heads.
        """
        # (batch_size, inp_seq_len, d_model)
        enc_output = self.encoder(inp, training, enc_padding_mask)

        # dec_output.shape == (batch_size, tar_seq_len, d_model).
        # The third return value (p_gens) is always None in Decoder.call and
        # is not needed here.
        dec_output, attention_weights, _ = self.decoder(tar,
                                                        enc_output,
                                                        training,
                                                        look_ahead_mask,
                                                        dec_padding_mask)

        # (batch_size, tar_seq_len, target_vocab_size)
        final_output = tf.nn.softmax(self.final_layer(dec_output))

        # Encoder-decoder attention of the last decoder layer:
        # (batch_size, num_heads, tar_seq_len, inp_seq_len).
        attn_dists = attention_weights['decoder_layer{}_block2'.format(self.num_blocks)]
        # Average over the head axis.
        attn_dists = tf.reduce_sum(attn_dists, axis=1) / self.num_heads

        outputs = dict(logits=final_output, attentions=attn_dists)
        return outputs
Encoder部分整体架构
class Encoder(tf.keras.layers.Layer):
    """Embedding followed by a stack of ``num_layers`` EncoderLayer blocks.

    Args:
        num_layers: number of stacked EncoderLayer blocks.
        d_model: model (embedding) dimension.
        num_heads: number of attention heads per layer.
        dff: width of the feed-forward hidden layer.
        input_vocab_size: size of the input vocabulary.
        rate: dropout rate.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, rate=0.1):
        super(Encoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = Embedding(input_vocab_size, d_model)
        self.enc_layers = [
            EncoderLayer(d_model, num_heads, dff, rate)
            for _ in range(num_layers)
        ]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        """Encode token ids ``x``; result is (batch_size, input_seq_len, d_model)."""
        hidden = self.dropout(self.embedding(x), training=training)
        # Feed the activations through every encoder layer in order.
        for layer in self.enc_layers:
            hidden = layer(hidden, training, mask)
        return hidden
Encoder部分又包括若干个编码层EncoderLayer,Transformer的编码和解码部分都可以由若干个层堆叠而成。其中,d_model为每个词被编码成的维度dim;num_layers表示堆叠的EncoderLayer数目;dff为前馈神经网络中间(隐藏)层的维度,前馈网络的最终输出维度仍为d_model。
EncoderLayer
class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: self-attention and feed-forward sub-layers.

    Each sub-layer is followed by dropout, a residual addition and layer
    normalisation.

    Args:
        d_model: model dimension.
        num_heads: number of attention heads.
        dff: width of the feed-forward hidden layer.
        rate: dropout rate.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(EncoderLayer, self).__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        """Apply the block to ``x``; every tensor is (batch_size, input_seq_len, d_model)."""
        # Sub-layer 1: self-attention (v, k and q are all x); weights are discarded.
        attended, _ = self.mha(x, x, x, mask)
        attended = self.dropout1(attended, training=training)
        normed_attn = self.layernorm1(x + attended)

        # Sub-layer 2: position-wise feed-forward network.
        fed_forward = self.ffn(normed_attn)
        fed_forward = self.dropout2(fed_forward, training=training)
        return self.layernorm2(normed_attn + fed_forward)
Transformer中每个EncoderLayer由(多头)注意力层、层标准化(Layer Norm)、前馈网络层组成。其中,每一个Norm层之前都使用了残差连接(Residual Network):先将子层的输入与子层的输出相加,再进行标准化。
self-Attention层
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head scaled dot-product attention.

    Args:
        d_model: model dimension; must be divisible by ``num_heads``.
        num_heads: number of attention heads.
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        # Per-head feature size.
        self.depth = d_model // self.num_heads

        # Linear projections for query, key and value, plus the output projection.
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Split the last axis into (num_heads, depth).

        The result is transposed to (batch_size, num_heads, seq_len, depth).
        """
        per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        """Attend from ``q`` over ``k``/``v``.

        Note the argument order: value, key, query, mask.

        Returns:
            (output, attention_weights) where output is
            (batch_size, seq_len_q, d_model) and attention_weights is
            (batch_size, num_heads, seq_len_q, seq_len_k).
        """
        batch_size = tf.shape(q)[0]

        # Project, each to (batch_size, seq_len, d_model).
        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)

        # Split into heads: (batch_size, num_heads, seq_len_*, depth).
        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        # per_head_out: (batch_size, num_heads, seq_len_q, depth)
        per_head_out, attention_weights = scaled_dot_product_attention(q, k, v, mask)

        # Move heads next to depth, then merge them back into d_model:
        # (batch_size, seq_len_q, num_heads, depth) -> (batch_size, seq_len_q, d_model)
        heads_last = tf.transpose(per_head_out, perm=[0, 2, 1, 3])
        merged = tf.reshape(heads_last, (batch_size, -1, self.d_model))

        # Final projection: (batch_size, seq_len_q, d_model).
        return self.dense(merged), attention_weights
其中,scaled_dot_product_attention为attention分数值计算函数,样例代码如下:
def scaled_dot_product_attention(q, k, v, mask):
    """Compute scaled dot-product attention.

    q, k and v must have matching leading dimensions, and k and v must share
    their second-to-last dimension (seq_len_k == seq_len_v). The mask may have
    different shapes depending on its kind (padding or look-ahead) but must be
    broadcastable for the addition below.

    Args:
        q: query, shape == (..., seq_len_q, depth)
        k: key, shape == (..., seq_len_k, depth)
        v: value, shape == (..., seq_len_v, depth_v)
        mask: float tensor broadcastable to (..., seq_len_q, seq_len_k),
            or None to skip masking.

    Returns:
        (output, attention_weights)
    """
    # Raw similarity scores: (..., seq_len_q, seq_len_k)
    scores = tf.matmul(q, k, transpose_b=True)

    # Scale by sqrt(depth of the keys).
    key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
    logits = scores / tf.math.sqrt(key_depth)

    # Masked positions (mask == 1) get a large negative logit so that
    # softmax assigns them ~0 weight.
    if mask is not None:
        logits = logits + (mask * -1e9)

    # Normalise over the last axis (seq_len_k) so the weights sum to 1.
    attention_weights = tf.nn.softmax(logits, axis=-1)  # (..., seq_len_q, seq_len_k)

    output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)
    return output, attention_weights
FFN
def point_wise_feed_forward_network(d_model, dff):
    """Build the two-layer position-wise feed-forward network.

    Args:
        d_model: output width of the second dense layer.
        dff: width of the first (ReLU) dense layer.

    Returns:
        A ``tf.keras.Sequential`` mapping (batch_size, seq_len, d_model)
        inputs to (batch_size, seq_len, dff) and back to
        (batch_size, seq_len, d_model).
    """
    expand = tf.keras.layers.Dense(dff, activation='relu')  # (batch_size, seq_len, dff)
    project = tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
    return tf.keras.Sequential([expand, project])
Decoder部分整体结构
class Decoder(tf.keras.layers.Layer):
    """Embedding followed by a stack of ``num_layers`` DecoderLayer blocks.

    Args:
        num_layers: number of stacked DecoderLayer blocks.
        d_model: model (embedding) dimension.
        num_heads: number of attention heads per layer.
        dff: width of the feed-forward hidden layer.
        target_vocab_size: size of the target vocabulary.
        rate: dropout rate.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.depth = self.d_model // self.num_heads

        self.embedding = Embedding(target_vocab_size, d_model)
        self.dec_layers = [
            DecoderLayer(d_model, num_heads, dff, rate)
            for _ in range(num_layers)
        ]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        """Decode ``x`` against ``enc_output``.

        Returns:
            (out, attention_weights, p_gens) where ``out`` is
            (batch_size, target_seq_len, d_model), ``attention_weights`` maps
            'decoder_layer{i}_block1' / 'decoder_layer{i}_block2' (1-based i)
            to the attention weights of layer i, and ``p_gens`` is always None.
        """
        attention_weights = {}

        out = self.dropout(self.embedding(x), training=training)

        for layer_no, layer in enumerate(self.dec_layers, start=1):
            out, self_attn, cross_attn = layer(out, enc_output, training,
                                               look_ahead_mask, padding_mask)
            attention_weights['decoder_layer{}_block1'.format(layer_no)] = self_attn
            attention_weights['decoder_layer{}_block2'.format(layer_no)] = cross_attn

        p_gens = None
        return out, attention_weights, p_gens
DecoderLayer
Decoder部分整体结构中的各个组件和Encoder基本相同。其中,Decoder也是由若干个层堆叠而成,每个层为DecoderLayer。样例代码如下:
class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block.

    It has three sub-layers: masked self-attention, encoder-decoder attention
    and a feed-forward network. Each is followed by dropout, a residual
    addition and layer normalisation.

    Args:
        d_model: model dimension.
        num_heads: number of attention heads.
        dff: width of the feed-forward hidden layer.
        rate: dropout rate.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(DecoderLayer, self).__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        """Apply the block.

        ``enc_output`` is (batch_size, input_seq_len, d_model); every
        intermediate result is (batch_size, target_seq_len, d_model).

        Returns:
            (output, self_attention_weights, encoder_decoder_attention_weights)
        """
        # Sub-layer 1: masked self-attention over the target sequence.
        self_attn, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)
        self_attn = self.dropout1(self_attn, training=training)
        normed_self = self.layernorm1(self_attn + x)

        # Sub-layer 2: values and keys come from the encoder output,
        # queries from the previous sub-layer.
        cross_attn, attn_weights_block2 = self.mha2(enc_output, enc_output,
                                                    normed_self, padding_mask)
        cross_attn = self.dropout2(cross_attn, training=training)
        normed_cross = self.layernorm2(cross_attn + normed_self)

        # Sub-layer 3: position-wise feed-forward network.
        fed_forward = self.ffn(normed_cross)
        fed_forward = self.dropout3(fed_forward, training=training)
        result = self.layernorm3(fed_forward + normed_cross)

        return result, attn_weights_block1, attn_weights_block2
DecoderLayer中每个构件也和EncoderLayer基本相同,其中在每个标准化层之前,同样使用了残差连接。
Position Embedding
Transformer主要构成中的另一部分,是对输入数据的编码;输入部分的编码,分为词向量编码和位置编码。
class Embedding(tf.keras.layers.Layer):
    """Token embedding plus sinusoidal positional encoding.

    Args:
        vocab_size: size of the vocabulary for the trainable embedding.
        d_model: embedding dimension.
        max_position_encoding: number of positions to precompute in the
            positional-encoding table. Defaults to ``vocab_size``, which is
            the original behaviour; pass the maximum expected sequence length
            to avoid building a vocabulary-sized table.
    """

    def __init__(self, vocab_size, d_model, max_position_encoding=None):
        super(Embedding, self).__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model

        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)

        # The table length is a sequence-length bound, not a vocabulary
        # property; keep vocab_size only as the backward-compatible default.
        if max_position_encoding is None:
            max_position_encoding = vocab_size
        self.pos_encoding = positional_encoding(max_position_encoding, d_model)

    def call(self, x):
        """Embed token ids ``x``; result is (batch_size, seq_len, d_model)."""
        embed_x = self.embedding(x)  # (batch_size, seq_len, d_model)
        # Scale embeddings by sqrt(d_model) before adding positions.
        embed_x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        # Add the encoding for the first seq_len positions.
        embed_x += self.pos_encoding[:, :tf.shape(x)[1], :]
        return embed_x
positional_encoding为位置编码信息;词向量编码不再使用预训练好的词向量,而是直接使用可训练的Embedding层进行表示。
def positional_encoding(position, d_model):
    """Build the sinusoidal positional-encoding table.

    Args:
        position: number of positions (rows) to generate.
        d_model: encoding dimension (columns).

    Returns:
        A float32 tensor of shape (1, position, d_model).
    """
    row_positions = np.arange(position)[:, np.newaxis]
    col_dims = np.arange(d_model)[np.newaxis, :]
    table = get_angles(row_positions, col_dims, d_model)

    # sin on even column indices (2i) ...
    table[:, 0::2] = np.sin(table[:, 0::2])
    # ... cos on odd column indices (2i + 1).
    table[:, 1::2] = np.cos(table[:, 1::2])

    # Add a leading batch axis so the table broadcasts over the batch.
    batched = table[np.newaxis, ...]
    return tf.cast(batched, dtype=tf.float32)
def get_angles(pos, i, d_model):
    """Return the positional-encoding angles ``pos / 10000^(2*(i//2)/d_model)``.

    Args:
        pos: positions (scalar or array).
        i: dimension indices (scalar or array), broadcast against ``pos``.
        d_model: encoding dimension.

    Returns:
        The element-wise product of ``pos`` and the per-dimension angle rates.
    """
    # Columns 2k and 2k+1 share the same exponent.
    exponent = (2 * (i // 2)) / np.float32(d_model)
    angle_rates = 1 / np.power(10000, exponent)
    return pos * angle_rates