# tf2.0 实现transformer小改 (TensorFlow 2.0 Transformer implementation, lightly modified)
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

# Low-level device configuration: require at least one GPU and let TensorFlow
# grow its memory allocation on the first GPU on demand.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
tf.config.experimental.set_memory_growth(physical_devices[0], True)


def my_padding(x, size):
    """Pad every sequence in a batch to ``size`` rows and add start/end rows.

    For each sample the output is, along axis 1: one "start" row filled with
    0.3, the original rows, ``size - seq_len`` rows of zeros, and one "end"
    row filled with 0.3.

    Args:
        x: array-like of shape (batch, seq_len, features).
        size: target sequence length before the two marker rows are added;
            must be >= seq_len.

    Returns:
        numpy array of shape (batch, size + 2, features).  An empty batch
        yields an empty array of shape (0, size + 2, features).

    Raises:
        ValueError: if ``size`` is smaller than the sequence length of ``x``.
    """
    x = np.asarray(x)
    batch, seq_len, n_features = x.shape
    if size < seq_len:
        raise ValueError(
            "size ({}) must be >= sequence length ({})".format(size, seq_len))

    marker = np.full((batch, 1, n_features), 0.3)
    zeros = np.zeros((batch, size - seq_len, n_features))
    # One concatenate over the whole batch instead of stacking sample by
    # sample, which copied the accumulated result on every iteration.
    return np.concatenate([marker, x, zeros, marker], axis=1)


def get_angles(pos, i, d_model):
    """Return the positional-encoding angles ``pos / 10000^(2*(i//2)/d_model)``.

    ``i // 2`` makes each pair of neighbouring feature indices (2k, 2k+1)
    share the same rate.
    """
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    return pos * angle_rates


def positional_encoding(position, d_model):
    """Build the sinusoidal positional encoding.

    The sine of the even-indexed angle columns and the cosine of the
    odd-indexed angle columns are concatenated along the last axis (all sines
    first, then all cosines); they are not interleaved.

    Args:
        position: number of positions (sequence length), a Python int.
        d_model: model dimension.

    Returns:
        float32 tensor of shape (1, position, d_model).
    """
    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                            np.arange(d_model)[np.newaxis, :],
                            d_model)
    sines = np.sin(angle_rads[:, 0::2])
    cosines = np.cos(angle_rads[:, 1::2])
    pos_encoding = np.concatenate([sines, cosines], axis=-1)
    pos_encoding = pos_encoding[np.newaxis, ...]
    return tf.cast(pos_encoding, dtype=tf.float32)


def create_padding_mark(targets):
    """Build a square padding mask from rows that consist only of zeros.

    A position counts as padding when every feature in its row equals 0.
    Entry (i, j) of the mask is 1.0 when position i or position j is padding
    and 0.0 otherwise.

    Args:
        targets: array-like of shape (..., seq_len, features).

    Returns:
        float32 tensor of shape (..., seq_len, seq_len).
    """
    targets = tf.convert_to_tensor(targets)
    # Test that all features are zero rather than "max == 0": a row such as
    # [-1, 0] has a maximum of 0 but is not padding.
    is_pad = tf.cast(tf.reduce_all(tf.equal(targets, 0), axis=-1), tf.float32)
    row_mask = is_pad[..., :, tf.newaxis]
    col_mask = is_pad[..., tf.newaxis, :]
    return tf.maximum(row_mask, col_mask)


def create_look_ahead_mark(size):
    """Return a (size, size) float32 mask with 1.0 strictly above the diagonal.

    It is computed as one minus the lower triangle (diagonal included), so the
    ones mark the later positions that each step must not attend to.
    """
    lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return tf.cast(1 - lower_triangle, tf.float32)


def create_mask(targets):
    """Build the look-ahead mask used by the decoder self-attention.

    Only the look-ahead mask is produced; combining it with a padding mask is
    disabled in this version.

    NOTE(review): the mask size is taken from ``targets.shape[0]``.  If
    ``targets`` is batched as (batch, seq_len, features) that is the batch
    size rather than the sequence length -- confirm against the caller.

    Returns:
        float32 tensor of shape (targets.shape[0], targets.shape[0]).
    """
    look_ahead_mask = create_look_ahead_mark(targets.shape[0])
    return tf.cast(look_ahead_mask, tf.float32)


def scaled_dot_product_attention(q, k, v, mask):
    """Compute scaled dot-product attention.

    Args:
        q: query tensor of shape (..., seq_len_q, depth).
        k: key tensor of shape (..., seq_len_k, depth).
        v: value tensor of shape (..., seq_len_k, depth_v).
        mask: tensor broadcastable to (..., seq_len_q, seq_len_k), or None.
            Entries equal to 1 receive a -1e9 logit offset, so they get
            (almost) zero attention weight after the softmax.

    Returns:
        Tuple ``(output, attention_weights)``.
    """
    # Match queries against keys.
    matmul_qk = tf.matmul(q, k, transpose_b=True)

    # Scale by sqrt(dk).
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    if mask is not None:
        scaled_attention_logits += (mask * -1e9)

    # Softmax over the key axis gives the attention weights.
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
    output = tf.matmul(attention_weights, v)
    return output, attention_weights


class MutilHeadAttention(tf.keras.layers.Layer):
    """Multi-head attention layer.

    ``d_model`` is split evenly across ``num_heads`` heads; each head runs
    scaled dot-product attention and the heads are concatenated and passed
    through a final dense layer.
    """

    def __init__(self, d_model, num_heads):
        super(MutilHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        # d_model must be evenly divisible into the heads.
        assert d_model % num_heads == 0
        # Per-head dimension.
        self.depth = d_model // num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Reshape (batch, seq_len, d_model) to (batch, num_heads, seq_len, depth)."""
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        """Apply multi-head attention.

        Note the argument order: value, key, query, mask.

        Returns:
            Tuple ``(output, attention_weights)`` where ``output`` has shape
            (batch, seq_len_q, d_model) and ``attention_weights`` has shape
            (batch, num_heads, seq_len_q, seq_len_k).
        """
        batch_size = tf.shape(q)[0]

        # Linear projections applied before splitting into heads.
        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)

        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        scaled_attention, attention_weights = scaled_dot_product_attention(
            q, k, v, mask)

        # Move the head axis back behind the sequence axis, then merge heads.
        scaled_attention = tf.transpose(scaled_attention, [0, 2, 1, 3])
        concat_attention = tf.reshape(scaled_attention,
                                      (batch_size, -1, self.d_model))

        output = self.dense(concat_attention)
        return output, attention_weights


def point_wise_feed_forward_network(d_model, diff):
    """Return a two-layer feed-forward network: Dense(diff, relu) -> Dense(d_model)."""
    return tf.keras.Sequential([
        tf.keras.layers.Dense(diff, activation='relu'),
        tf.keras.layers.Dense(d_model)
    ])


class LayerNormalization(tf.keras.layers.Layer):
    """Layer normalization over the last axis with learnable scale and shift.

    Computes ``gamma * (x - mean) / (std + epsilon) + beta``.
    """

    def __init__(self, epsilon=1e-6, **kwargs):
        # Initialise the Keras base class before setting attributes on self.
        super(LayerNormalization, self).__init__(**kwargs)
        self.eps = epsilon

    def build(self, input_shape):
        self.gamma = self.add_weight(name='gamma',
                                     shape=input_shape[-1:],
                                     initializer=tf.ones_initializer(),
                                     trainable=True)
        self.beta = self.add_weight(name='beta',
                                    shape=input_shape[-1:],
                                    initializer=tf.zeros_initializer(),
                                    trainable=True)
        super(LayerNormalization, self).build(input_shape)

    def call(self, x):
        mean = tf.keras.backend.mean(x, axis=-1, keepdims=True)
        std = tf.keras.backend.std(x, axis=-1, keepdims=True)
        return self.gamma * (x - mean) / (std + self.eps) + self.beta

    def compute_output_shape(self, input_shape):
        return input_shape


class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: self-attention and feed-forward, each followed by
    dropout, a residual connection and layer normalization.
    """

    def __init__(self, d_model, n_heads, ddf, dropout_rate=0.1):
        super(EncoderLayer, self).__init__()

        self.mha = MutilHeadAttention(d_model, n_heads)
        self.ffn = point_wise_feed_forward_network(d_model, ddf)

        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)

    # NOTE: one larger residual connection around the block could be added.
    def call(self, inputs, training, mask):
        """Return a tensor with the same shape as ``inputs``."""
        # Multi-head self-attention sub-layer.
        att_output, _ = self.mha(inputs, inputs, inputs, mask)
        att_output = self.dropout1(att_output, training=training)
        out1 = self.layernorm1(inputs + att_output)

        # Feed-forward sub-layer.
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)
        return out2


class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block: masked self-attention, encoder-decoder attention and
    feed-forward, each followed by dropout, a residual connection and layer
    normalization.
    """

    def __init__(self, d_model, num_heads, dff, drop_rate=0.1):
        super(DecoderLayer, self).__init__()

        self.mha1 = MutilHeadAttention(d_model, num_heads)
        self.mha2 = MutilHeadAttention(d_model, num_heads)

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.layernorm3 = LayerNormalization(epsilon=1e-6)

        self.dropout1 = layers.Dropout(drop_rate)
        self.dropout2 = layers.Dropout(drop_rate)
        self.dropout3 = layers.Dropout(drop_rate)

    def call(self, inputs, encode_out, training, look_ahead_mask, padding_mask):
        """Return ``(output, self_attention_weights, cross_attention_weights)``."""
        # Masked multi-head self-attention (arguments are v, k, q).
        att1, att_weight1 = self.mha1(inputs, inputs, inputs, look_ahead_mask)
        att1 = self.dropout1(att1, training=training)
        out1 = self.layernorm1(inputs + att1)

        # Encoder-decoder attention.  The query is out1 (the normalized
        # self-attention output); an earlier variant used the raw inputs.
        att2, att_weight2 = self.mha2(encode_out, encode_out, out1, padding_mask)
        att2 = self.dropout2(att2, training=training)
        out2 = self.layernorm2(out1 + att2)

        ffn_out = self.ffn(out2)
        ffn_out = self.dropout3(ffn_out, training=training)
        out3 = self.layernorm3(out2 + ffn_out)

        return out3, att_weight1, att_weight2


class Encoder(layers.Layer):
    """Stack of ``n_layers`` encoder layers on top of a dense input projection.

    The input is projected with ``Dense(d_model, relu)`` (no token embedding
    lookup), scaled by sqrt(d_model), and summed with a positional encoding.
    """

    def __init__(self, n_layers, d_model, n_heads, ddf, drop_rate=0.1):
        super(Encoder, self).__init__()

        self.n_layers = n_layers
        self.d_model = d_model

        self.embedding = layers.Dense(d_model, activation='relu')

        self.encode_layer = [EncoderLayer(d_model, n_heads, ddf, drop_rate)
                             for _ in range(n_layers)]

        self.dropout = layers.Dropout(drop_rate)

    def call(self, inputs, training, mark):
        """Encode ``inputs``.

        Args:
            inputs: tensor whose axis 1 is the sequence axis; its static
                length (``inputs.shape[1]``) sizes the positional encoding,
                so it must be known.
            training: forwarded to the dropout layers and encoder layers.
            mark: attention mask forwarded unchanged to every encoder layer.
        """
        seq_len = inputs.shape[1]
        word_emb = self.embedding(inputs)
        word_emb *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        emb = word_emb + positional_encoding(seq_len, self.d_model)
        x = self.dropout(emb, training=training)
        for encoder_layer in self.encode_layer:
            x = encoder_layer(x, training, mark)
        return x


class Decoder(layers.Layer):
    """Stack of ``n_layers`` decoder layers on top of a dense input projection.

    The input is projected with ``Dense(d_model, relu)``, scaled by
    sqrt(d_model), and summed with a positional encoding.
    """

    def __init__(self, n_layers, d_model, n_heads, ddf, drop_rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.n_layers = n_layers

        self.embedding = layers.Dense(d_model, activation='relu')

        self.decoder_layers = [DecoderLayer(d_model, n_heads, ddf, drop_rate)
                               for _ in range(n_layers)]

        self.dropout = layers.Dropout(drop_rate)

    def call(self, inputs, encoder_out, training, look_ahead_mark, padding_mark):
        """Decode ``inputs`` against ``encoder_out``.

        Returns:
            Tuple ``(h, attention_weights)`` where ``attention_weights`` maps
            ``'decoder_layer{n}_att_w1'`` / ``'decoder_layer{n}_att_w2'``
            (n starting at 1) to the self-attention and encoder-decoder
            attention weights of each layer.
        """
        # The static sequence length sizes the positional encoding.
        seq_len = inputs.shape[1]
        attention_weights = {}

        h = self.embedding(inputs)
        h *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        h += positional_encoding(seq_len, self.d_model)
        h = self.dropout(h, training=training)

        # Run the stacked decoder layers.
        for layer_no, decoder_layer in enumerate(self.decoder_layers, start=1):
            h, att_w1, att_w2 = decoder_layer(h, encoder_out, training,
                                              look_ahead_mark, padding_mark)
            attention_weights['decoder_layer{}_att_w1'.format(layer_no)] = att_w1
            attention_weights['decoder_layer{}_att_w2'.format(layer_no)] = att_w2

        return h, attention_weights
class Transformer(tf.keras.Model):
    """Encoder-decoder Transformer with a batch-normalized linear output head.

    The decoder output goes through ``BatchNormalization`` and then a
    ``Dense(target_vocab_size)`` layer with no activation.
    """

    def __init__(self, n_layers, d_model, n_heads, diff,
                 target_vocab_size, drop_rate=0.1):
        """Create the model.

        Args:
            n_layers: number of encoder layers and of decoder layers.
            d_model: model dimension.
            n_heads: number of attention heads.
            diff: hidden width of the feed-forward sub-layers.
            target_vocab_size: width of the final dense output layer.
            drop_rate: dropout rate passed to the encoder and decoder.
        """
        super(Transformer, self).__init__()

        self.encoder = Encoder(n_layers, d_model, n_heads, diff, drop_rate)
        self.decoder = Decoder(n_layers, d_model, n_heads, diff, drop_rate)
        self.bn = tf.keras.layers.BatchNormalization()
        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inputs, targets, training, look_ahead_mask=None,
             encode_padding_mask=None, decode_padding_mask=None):
        """Run the encoder and decoder and project the decoder output.

        Args:
            inputs: encoder input.
            targets: decoder input.
            training: training-mode flag forwarded to the encoder, the
                decoder and the batch normalization layer.
            look_ahead_mask: mask for the decoder self-attention, or None.
            encode_padding_mask: mask for the encoder self-attention, or None.
            decode_padding_mask: mask for the encoder-decoder attention, or
                None.

        Returns:
            Tuple ``(final_out, att_weights)``: the output of the final dense
            layer and the attention-weight dict returned by the decoder.
        """
        encode_out = self.encoder(inputs, training, encode_padding_mask)
        decode_out, att_weights = self.decoder(
            targets, encode_out, training, look_ahead_mask, decode_padding_mask)
        # Pass the mode explicitly so batch statistics are used and updated
        # during training and moving averages are used at inference, instead
        # of relying on implicit propagation of the flag.
        decode_out = self.bn(decode_out, training=training)
        final_out = self.final_layer(decode_out)

        return final_out, att_weights
# Module-level training state: number of steps run so far and the most
# recently sampled training accuracy.
# NOTE: train_step is intentionally not wrapped in @tf.function here.
global_num = 0
global_train_acc = 0


def train_step(inputs, targets):
    """Run one optimisation step of the global ``transformer``.

    Feature index 5 of ``targets`` is the predicted quantity: the decoder is
    fed that feature for all steps but the last and is trained against the
    same feature shifted one step ahead.

    Side effects: increments ``global_num``; every 10th call stores the
    accuracy from ``get_acc`` in ``global_train_acc``; applies gradients via
    the global ``optimizer``; records the mean MSE in ``train_loss``.
    """
    global global_num
    global global_train_acc

    global_num += 1

    # Keep a trailing feature axis of size 1 for the selected target feature.
    feature = targets[:, :, 5][:, :, np.newaxis]
    tar_inp = feature[:, :-1]
    tar_real = feature[:, 1:]

    with tf.GradientTape() as tape:
        predictions, _ = transformer(inputs, tar_inp,
                                     training=True,
                                     look_ahead_mask=look_mask,
                                     decode_padding_mask=None)
        loss = loss_fun(tar_real, predictions)

    # Sample the training accuracy only on every 10th step.
    if global_num % 10 == 0:
        global_train_acc = get_acc(predictions, tar_real).numpy()

    # Compute gradients and apply the update.
    variables = transformer.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    # Track the mean squared error separately from the optimised loss.
    mse_loss = tf.reduce_mean(tf.losses.MSE(predictions, tar_real))
    train_loss(mse_loss)