Transformer in TF 2.0 (with minor modifications)

import sys
import os

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras import layers


#   GPU setup: require at least one GPU and enable memory growth
physical_devices = tf.config.experimental.list_physical_devices('GPU')
assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
tf.config.experimental.set_memory_growth(physical_devices[0], True)
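# (Optional, an assumption not in the original: on a CPU-only machine the assert above
#  stops the script; a softer variant would only set memory growth when a GPU is found.)
# if physical_devices:
#     tf.config.experimental.set_memory_growth(physical_devices[0], True)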

def my_padding(x, size):
    # Pad each (seq_len, features) sample in the batch with zero rows up to `size`,
    # then prepend/append a start/end row filled with 0.3.
    # Input: (batch, seq_len, features) -> Output: (batch, size + 2, features)
    result = np.array([])
    bat_size = x.shape[0]
    for i in range(bat_size):
        zero_mat = np.zeros((size - x.shape[1], x.shape[2]))
        start_mat = np.ones((1, x.shape[2])) * 0.3
        end_mat = np.ones((1, x.shape[2])) * 0.3
        each_x = x[i]

        each_x = np.vstack([start_mat, each_x])
        pad_x = np.vstack([each_x, zero_mat])
        pad_x = np.vstack([pad_x, end_mat])
        pad_x = pad_x.reshape(1, size + 2, x.shape[2])
        if result.shape[0] == 0:
            result = pad_x
        else:
            result = np.vstack([result, pad_x])

    return result
# test_x = np.array([
#     [[111, 112, 113],
#      [121, 122, 123],
#      ],
#     [[211, 212, 213],
#      [221, 222, 223],
#      ],
#     [[311, 312, 313],
#      [321, 322, 323],
#      ],
# ])
# a = my_padding(test_x,6)
# print(a.shape)
# print(a)
# sys.exit(2)

def get_angles(pos, i, d_model):
    # here i plays the role of both 2i and 2i+1 in the positional-encoding formula
    angle_rates = 1 / np.power(10000, (2*(i // 2))/ np.float32(d_model))
    return pos * angle_rates

def positional_encoding(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                            np.arange(d_model)[np.newaxis, :],
                            d_model)
    # even indices (2i) use sin
    sines = np.sin(angle_rads[:, 0::2])
    # odd indices (2i+1) use cos
    cosines = np.cos(angle_rads[:, 1::2])
    # note: the sin/cos halves are concatenated rather than interleaved
    pos_encoding = np.concatenate([sines, cosines], axis=-1)
    pos_encoding = pos_encoding[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)
# pos_encoding = positional_encoding(50, 512)
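# Quick shape check (a sketch, not from the original): the leading axis broadcasts over the batch.
# pe = positional_encoding(50, 512)
# print(pe.shape)   # expected: (1, 50, 512)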

# Earlier token-id version of the padding mask, kept for reference:
# def create_padding_mark(seq):
#     # mark positions equal to 0 (padding)
#     seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
#     # add axes so the mask broadcasts over the attention matrix
#     return seq[:, np.newaxis, np.newaxis, :]  # (batch_size, 1, 1, seq_len)
def create_padding_mark(targets):
    # Mark rows whose features are all zero as padding
    zero_mask = np.max(targets, axis=-1)
    zero_mask = tf.cast(tf.math.equal(zero_mask, 0), tf.float32)
    zero_mask = tf.reshape(zero_mask, (-1, 1))
    one_mat = np.ones((targets.shape[0], targets.shape[0]))
    # mask both the rows and the columns of the padded positions
    result_mask1 = np.multiply(one_mat, zero_mask)
    result_mask2 = np.multiply(one_mat, tf.transpose(zero_mask))
    result_mask = tf.maximum(result_mask1, result_mask2)
    return result_mask  # (seq_len, seq_len)
# mask test
# create_padding_mark([[1,2,0,0,3],[3,4,5,0,0],[0,0,0,0,0]])
# sys.exit(2)
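# Runnable sanity check (a sketch, assuming feature-vector targets): rows that are all
# zero count as padding, and both their rows and columns are masked in the result.
# demo_targets = np.array([[1., 2.], [3., 4.], [0., 0.]])
# print(create_padding_mark(demo_targets))   # row 2 and column 2 are 1.0, the rest 0.0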
def create_look_ahead_mark(size):
    # 1 minus the lower triangle (band_part(a, -1, 0) keeps every lower diagonal),
    # i.e. a mask over the tokens that have not been predicted yet at each step
    mark = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)        #   band_part(a, num_lower, num_upper)
    mark = tf.cast(mark, tf.float32)
    return mark  # (seq_len, seq_len)
# temp = create_look_ahead_mark(3)
# print(temp)
# sys.exit(2)
# Build the decoder masks
def create_mask(targets):
    # disabled: per-position loss mask and padding mask
    # loss_mark = np.max(targets, axis=-1)
    # loss_mark = tf.cast(tf.math.equal(loss_mark, 0), tf.float32)
    # loss_mark = tf.reshape(loss_mark, (-1, 1))
    # decode_targets_padding_mask = create_padding_mark(targets)
    # decode_targets_padding_mask = tf.cast(decode_targets_padding_mask, tf.float32)
    # look-ahead mask: hide the tokens that have not been predicted yet
    look_ahead_mask = create_look_ahead_mark(targets.shape[0])
    look_ahead_mask = tf.cast(look_ahead_mask, tf.float32)
    # disabled: combine the padding mask with the look-ahead mask for the first decoder block
    # combine_mask = tf.maximum(decode_targets_padding_mask, look_ahead_mask)
    # return combine_mask, loss_mark
    return look_ahead_mask
# sys.exit(2)

def scaled_dot_product_attention(q, k, v, mask):
    # query-key matmul gives the raw match scores
    matmul_qk = tf.matmul(q, k, transpose_b=True)
    # scale by sqrt(dk)
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    # apply the mask (masked positions get a large negative logit)
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)

    # softmax over the last axis gives the attention weights
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)

    # weight the values by the attention weights
    output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

    return output, attention_weights
#   attention test
# def print_out(q, k, v):
#     temp_out, temp_att = scaled_dot_product_attention(
#     q, k, v, None)
#     print('attention weight:')
#     print(temp_att)
#     print('output:')
#     print(temp_out)
# np.set_printoptions(suppress=True)
# temp_k = tf.constant([[10,0,0],
#                       [0,10,0],
#                       [0,0,10],
#                       [0,0,10]], dtype=tf.float32)  # (4, 3)
#
# temp_v = tf.constant([[   1,0],
#                       [  10,0],
#                       [ 100,5],
#                       [1000,6]], dtype=tf.float32)  # (4, 2)
# # attends to the second key and returns the corresponding value
# temp_q = tf.constant([[0,10,0]], dtype=tf.float32)
# print_out(temp_q, temp_k, temp_v)


# Multi-head attention layer
class MutilHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MutilHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        # d_model must divide evenly across the heads
        assert d_model % num_heads == 0
        # per-head depth
        self.depth = d_model // num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        # split into heads and move the heads dimension in front of seq_len
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]

        # linear projections (before splitting heads) to obtain q, k, v representations
        q = self.wq(q)  # (batch_size, seq_len, d_model)
        k = self.wk(k)
        v = self.wv(v)

        # split into heads
        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)
        # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
        # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)

        # scaled dot-product attention
        scaled_attention, attention_weights = scaled_dot_product_attention(
            q, k, v, mask)
        # move the heads dimension back behind seq_len
        scaled_attention = tf.transpose(scaled_attention, [0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

        # concatenate the heads
        concat_attention = tf.reshape(scaled_attention,
                                      (batch_size, -1, self.d_model))

        # final dense projection
        output = self.dense(concat_attention)
        return output, attention_weights
# temp_mha = MutilHeadAttention(d_model=512, num_heads=8)
# # y = tf.random.uniform((1, 60, 512))
# y = tf.random.uniform((2, 60, 512))
# # y = tf.random.uniform((1, 2,60, 32))
# output, att = temp_mha(y, k=y, q=y, mask=None)
# print('x:{}'.format(y.shape))
# print("out:{},att:{}".format(output.shape, att.shape))

def point_wise_feed_forward_network(d_model, diff):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(diff, activation='relu'),
        tf.keras.layers.Dense(d_model)
    ])
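# Usage sketch (not from the original): the FFN expands to `diff` units and projects
# back to d_model, so the (batch, seq_len, d_model) shape is preserved.
# sample_ffn = point_wise_feed_forward_network(512, 2048)
# print(sample_ffn(tf.random.uniform((64, 50, 512))).shape)   # expected: (64, 50, 512)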

class LayerNormalization(tf.keras.layers.Layer):
    def __init__(self, epsilon=1e-6, **kwargs):
        self.eps = epsilon
        super(LayerNormalization, self).__init__(**kwargs)
    def build(self, input_shape):
        self.gamma = self.add_weight(name='gamma', shape=input_shape[-1:],
                                     initializer=tf.ones_initializer(), trainable=True)
        self.beta = self.add_weight(name='beta', shape=input_shape[-1:],
                                    initializer=tf.zeros_initializer(), trainable=True)
        super(LayerNormalization, self).build(input_shape)
    def call(self, x):
        mean = tf.keras.backend.mean(x, axis=-1, keepdims=True)
        std = tf.keras.backend.std(x, axis=-1, keepdims=True)
        return self.gamma * (x - mean) / (std + self.eps) + self.beta
    def compute_output_shape(self, input_shape):
        return input_shape
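# Quick check (a sketch, not from the original): this hand-rolled layer norm should
# behave close to Keras' built-in LayerNormalization for the same epsilon.
# x = tf.random.uniform((2, 5, 8))
# print(LayerNormalization()(x).shape)                              # (2, 5, 8)
# print(tf.keras.layers.LayerNormalization(epsilon=1e-6)(x).shape)  # (2, 5, 8)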

class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, n_heads, ddf, dropout_rate=0.1):
        super(EncoderLayer, self).__init__()

        self.mha = MutilHeadAttention(d_model, n_heads)             #   return (output, attention_weights)
        self.ffn = point_wise_feed_forward_network(d_model, ddf)

        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)

    ########################   a block-level residual connection could be added here   ########################
    def call(self, inputs, training, mask):
        # multi-head attention
        att_output, _ = self.mha(inputs, inputs, inputs, mask)      #   returns (output, attention_weights)
        att_output = self.dropout1(att_output, training=training)
        out1 = self.layernorm1(inputs + att_output)  # (batch_size, input_seq_len, d_model)
        # feed-forward network
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)
        return out2
# sample_encoder_layer = EncoderLayer(512, 8, 2048)   #(d_model, n_heads, ddf, dropout_rate=0.1)
# sample_encoder_layer_output = sample_encoder_layer(tf.random.uniform((64, 43, 512)), False, None)
# print('sample_encoder_layer_output.shape:',sample_encoder_layer_output.shape)

class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, drop_rate=0.1):
        super(DecoderLayer, self).__init__()

        self.mha1 = MutilHeadAttention(d_model, num_heads)
        self.mha2 = MutilHeadAttention(d_model, num_heads)

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.layernorm3 = LayerNormalization(epsilon=1e-6)

        self.dropout1 = layers.Dropout(drop_rate)
        self.dropout2 = layers.Dropout(drop_rate)
        self.dropout3 = layers.Dropout(drop_rate)

    def call(self, inputs, encode_out, training,
             look_ahead_mask, padding_mask):
        # masked multi-head attention (self-attention with the look-ahead mask)
        att1, att_weight1 = self.mha1(inputs, inputs, inputs, look_ahead_mask)     # (v, k, q)
        att1 = self.dropout1(att1, training=training)
        out1 = self.layernorm1(inputs + att1)

        # multi-head attention over the encoder output
        ########################   `inputs` was replaced with `out1` here   ########################
        # att2, att_weight2 = self.mha2(encode_out, encode_out, inputs, padding_mask)     # (v, k, q)
        att2, att_weight2 = self.mha2(encode_out, encode_out, out1, padding_mask)  # (v, k, q)
        att2 = self.dropout2(att2, training=training)
        out2 = self.layernorm2(out1 + att2)

        ffn_out = self.ffn(out2)
        ffn_out = self.dropout3(ffn_out, training=training)
        out3 = self.layernorm3(out2 + ffn_out)

        return out3, att_weight1, att_weight2
# sample_encoder_layer = EncoderLayer(512, 8, 2048)   #(d_model, n_heads, ddf, dropout_rate=0.1)
# sample_encoder_layer_output = sample_encoder_layer(tf.random.uniform((64, 43, 512)), False, None)
# sample_decoder_layer = DecoderLayer(512, 8, 2048)   #   (d_model, num_heads, dff, drop_rate=0.1)
# sample_decoder_layer_output, _, _ = sample_decoder_layer(
# tf.random.uniform((64, 50, 512)), sample_encoder_layer_output,False, None, None)
# print('sample_decoder_layer_output.shape:',sample_decoder_layer_output.shape)
class Encoder(layers.Layer):
    # def __init__(self, n_layers, d_model, n_heads, ddf,
    #              input_vocab_size, max_seq_len, drop_rate=0.1):
    def __init__(self, n_layers, d_model, n_heads, ddf,  drop_rate=0.1):
        super(Encoder, self).__init__()

        self.n_layers = n_layers
        self.d_model = d_model

        # self.embedding = layers.Embedding(input_vocab_size, d_model)
        self.embedding = layers.Dense(d_model,activation='relu')
        # self.pos_embedding = positional_encoding(max_seq_len, d_model)

        self.encode_layer = [EncoderLayer(d_model, n_heads, ddf, drop_rate)
                             for _ in range(n_layers)]

        self.dropout = layers.Dropout(drop_rate)

    def call(self, inputs, training, mark):
        seq_len = inputs.shape[1]
        word_emb = self.embedding(inputs)
        word_emb *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        # emb = word_emb + self.pos_embedding[:, :seq_len, :]
        emb = word_emb + positional_encoding(seq_len, self.d_model)
        x = self.dropout(emb, training=training)
        for i in range(self.n_layers):
            x = self.encode_layer[i](x, training, mark)

        return x
# sample_encoder = Encoder(2, 512, 8, 1024)      # (n_layers, d_model, n_heads, ddf, drop_rate=0.1)
# sample_encoder_output = sample_encoder(tf.random.uniform((64, 180,120)),False, None)
# print('sample_encoder_output.shape:',sample_encoder_output.shape)
# sys.exit(2)

class Decoder(layers.Layer):
    def __init__(self, n_layers, d_model, n_heads, ddf, drop_rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.n_layers = n_layers

        # self.embedding = layers.Embedding(target_vocab_size, d_model)
        self.embedding = layers.Dense(d_model, activation='relu')
        # self.pos_embedding = positional_encoding(max_seq_len, d_model)

        self.decoder_layers = [DecoderLayer(d_model, n_heads, ddf, drop_rate)
                               for _ in range(n_layers)]

        self.dropout = layers.Dropout(drop_rate)

    def call(self, inputs, encoder_out, training,
             look_ahead_mark, padding_mark):
        # seq_len = tf.shape(inputs)[1]
        seq_len = inputs.shape[1]
        attention_weights = {}
        h = self.embedding(inputs)
        h *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        # h += self.pos_embedding[:, :seq_len, :]
        h += positional_encoding(seq_len, self.d_model)

        h = self.dropout(h, training=training)
        # stack the decoder layers
        for i in range(self.n_layers):
            h, att_w1, att_w2 = self.decoder_layers[i](h, encoder_out,
                                                       training, look_ahead_mark,
                                                       padding_mark)
            attention_weights['decoder_layer{}_att_w1'.format(i + 1)] = att_w1
            attention_weights['decoder_layer{}_att_w2'.format(i + 1)] = att_w2

        return h, attention_weights
# sample_encoder = Encoder(2, 512, 8, 1024)      # (n_layers, d_model, n_heads, ddf, drop_rate=0.1)
# sample_encoder_output = sample_encoder(tf.random.uniform((64, 180,120)),False, None)
# # print('sample_encoder_output.shape:',sample_encoder_output.shape)
# sample_decoder = Decoder(2, 512,8,1024)       # (n_layers, d_model, n_heads, ddf, drop_rate=0.1)
# sample_decoder_output, attn = sample_decoder(tf.random.uniform((64, 180,100)),sample_encoder_output, False, None, None)
# print(sample_decoder_output.shape, attn['decoder_layer1_att_w1'].shape)
# sys.exit(2)

class Transformer(tf.keras.Model):
    def __init__(self, n_layers, d_model, n_heads, diff, target_vocab_size, drop_rate=0.1):
        super(Transformer, self).__init__()
        # self.bn1 = layers.BatchNormalization()
        self.encoder = Encoder(n_layers, d_model, n_heads, diff,drop_rate)
        self.decoder = Decoder(n_layers, d_model, n_heads, diff, drop_rate)
        self.bn = tf.keras.layers.BatchNormalization()
        self.final_layer = tf.keras.layers.Dense(target_vocab_size)
        # self.final_layer = tf.keras.layers.Dense(target_vocab_size,activation='tanh')

    # def call(self, inputs, targets, training, encode_padding_mask,
    #          look_ahead_mask, decode_padding_mask):
    def call(self, inputs, targets, training,look_ahead_mask = None, encode_padding_mask = None,
              decode_padding_mask = None):
        # inputs = self.bn1(inputs)
        encode_out = self.encoder(inputs, training, encode_padding_mask)
        # print(encode_out.shape)
        decode_out, att_weights = self.decoder(targets, encode_out, training,
                                               look_ahead_mask, decode_padding_mask)
        # print('decode_out.shape:',decode_out.shape)
        decode_out = self.bn(decode_out)
        final_out = self.final_layer(decode_out)
        # final_out = self.final_layer(decode_out) *10

        return final_out, att_weights
# sample_transformer = Transformer(n_layers=2, d_model=512, n_heads=8, diff=1024,target_vocab_size=20)
# temp_input = tf.random.uniform((64,180, 62))
# temp_target = tf.random.uniform((64, 180,26))
# fn_out, _ = sample_transformer(temp_input, temp_target, training=False,
#                               encode_padding_mask=None,
#                                look_ahead_mask=None,
#                                decode_padding_mask=None,
#                               )
# print('fn_out.shape:',fn_out.shape)


# @tf.function

global_num = 0
global_train_acc = 0

def train_step(inputs, targets):
    global global_num
    global global_train_acc
    global_num += 1
    # teacher forcing: targets shifted right as decoder input, shifted left as labels
    # (only feature column 5 is predicted here)
    # tar_inp = targets[:, :-1, :6]
    # tar_real = targets[:, 1:, :6]
    tar_inp = targets[:, :-1, 5][:, :, np.newaxis]
    tar_real = targets[:, 1:, 5][:, :, np.newaxis]
    with tf.GradientTape() as tape:
        predictions, _ = transformer(inputs, tar_inp,training = True,look_ahead_mask=look_mask,decode_padding_mask = None)
        # print("pre:{}".format(predictions[:2,:5]))
        # print("True:{}".format(tar_real[:2,:5]))
        # predictions, _ = transformer(inputs, tar_inp, training=True, look_ahead_mask=None,decode_padding_mask=None)
        # loss = loss_fun(tar_real, predictions,loss_mask)
        loss = loss_fun(tar_real, predictions)
    if global_num % 10 == 0:
        acc = get_acc(predictions,tar_real)
        global_train_acc = acc.numpy()
        # print('train_acc:{:.2f}'.format(acc.numpy()))

    # compute gradients
    gradients = tape.gradient(loss, transformer.trainable_variables)
    # apply gradients (backprop)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
    mse_loss = tf.reduce_mean(tf.losses.MSE(predictions,tar_real))
    train_loss(mse_loss)
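# The globals referenced in train_step (transformer, optimizer, loss_fun, get_acc,
# train_loss, look_mask) are defined outside this excerpt. A minimal sketch of what
# that setup might look like -- every value below is an assumption, not the author's:
# transformer = Transformer(n_layers=2, d_model=128, n_heads=8, diff=512, target_vocab_size=1)
# optimizer = tf.keras.optimizers.Adam(1e-4)
# train_loss = tf.keras.metrics.Mean(name='train_loss')
# look_mask = create_look_ahead_mark(179)          # seq_len of tar_inp (targets length - 1); placeholder value
#
# def loss_fun(y_true, y_pred):
#     return tf.reduce_mean(tf.losses.MSE(y_true, y_pred))
#
# def get_acc(predictions, targets, tol=0.1):
#     # placeholder metric: fraction of predictions within `tol` of the target
#     return tf.reduce_mean(tf.cast(tf.abs(predictions - targets) < tol, tf.float32))
#
# for epoch in range(10):
#     for batch_x, batch_y in dataset:             # dataset: an assumed tf.data pipeline
#         train_step(batch_x, batch_y)
#     print('epoch {} loss {:.4f}'.format(epoch, train_loss.result()))
#     train_loss.reset_states()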

 
