Transformer source code
At training time:
1. The ground-truth target tokens are fed in and the whole sequence is decoded in a single pass (teacher forcing).
At inference time:
1. In the first step, a single token is fed in and one token is decoded.
2. In the second step, the token fed in at step 1 and the token decoded at step 1 are fed in together, and the third token is decoded; decoding continues this way, step by step, until the maximum length is reached or a <pad> token is produced, at which point it stops.
Thanks to the causal mask, feeding the full target sequence at training time is equivalent to feeding tokens one by one at inference time (a sketch of the inference loop follows).
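For reference, a minimal greedy-decoding sketch of that inference loop, assuming a built graph g with placeholders g.x / g.y and prediction op g.preds, a running sess, a source batch x, and hyperparameters hp.batch_size / hp.maxlen (all names here are assumptions, not code from this post):

import numpy as np

# Start from an all-<pad> target and fill in one position per step.
preds = np.zeros((hp.batch_size, hp.maxlen), np.int32)
for j in range(hp.maxlen):
    # Re-run the decoder on everything predicted so far; thanks to the
    # causal mask, position j only attends to positions <= j.
    _preds = sess.run(g.preds, {g.x: x, g.y: preds})
    preds[:, j] = _preds[:, j]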
1. Token embeddings need to be passed in
def __init__(self, hp):
    self.hp = hp
    # In a real application, pass in your own vocabulary here
    self.token2idx, self.idx2token = load_vocab(hp.vocab)
    # The author trains the token embeddings as an ordinary TF variable; in
    # production they could be replaced with pre-trained vectors such as
    # word2vec or BERT embeddings
    self.embeddings = get_token_embeddings(self.hp.vocab_size, self.hp.d_model, zero_pad=True)
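For context, a minimal sketch of what get_token_embeddings amounts to: a trainable (vocab_size, d_model) lookup table whose row 0 is zeroed so that <pad> (id = 0) embeds to all zeros. The exact implementation in the repo may differ; this is only an illustration.

import tensorflow as tf  # TF 1.x, as in the rest of this post

def get_token_embeddings_sketch(vocab_size, num_units, zero_pad=True):
    # Trainable (vocab_size, num_units) embedding matrix.
    embeddings = tf.get_variable('weight_mat',
                                 dtype=tf.float32,
                                 shape=(vocab_size, num_units),
                                 initializer=tf.glorot_uniform_initializer())
    if zero_pad:
        # Force the <pad> (id = 0) row to zeros.
        embeddings = tf.concat((tf.zeros(shape=[1, num_units]),
                                embeddings[1:, :]), 0)
    return embeddings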
2. positional_encoding
def positional_encoding(inputs,
                        num_units,
                        zero_pad=True,
                        scale=True,
                        scope="positional_encoding",
                        reuse=None):
    '''Sinusoidal Positional_Encoding.

    Args:
      inputs: A 2d Tensor with shape of (N, T).
      num_units: Output dimensionality
      zero_pad: Boolean. If True, all the values of the first row (id = 0) are constant zeros
      scale: Boolean. If True, the output is multiplied by sqrt(num_units) (check details in the paper)
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
      A 'Tensor' with one more rank than inputs, whose last dimension is 'num_units'
    '''

    N, T = inputs.get_shape().as_list()
    with tf.variable_scope(scope, reuse=reuse):
        position_ind = tf.tile(tf.expand_dims(tf.range(T), 0), [N, 1])

        # First part of the PE function: sin and cos argument
        position_enc = np.array([
            [pos / np.power(10000, 2.*i/num_units) for i in range(num_units)]
            for pos in range(T)])

        # Second part, apply the cosine to even columns and sin to odds.
        position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])  # dim 2i
        position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])  # dim 2i+1

        # Convert to a tensor
        lookup_table = tf.convert_to_tensor(position_enc)

        if zero_pad:
            lookup_table = tf.concat((tf.zeros(shape=[1, num_units]),
                                      lookup_table[1:, :]), 0)

        outputs = tf.nn.embedding_lookup(lookup_table, position_ind)

        if scale:
            outputs = outputs * num_units**0.5

        return outputs
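A hypothetical usage sketch; the batch size 32, length 10, and model width 512 are made-up numbers, and only the static shape of inputs matters (its values are never read):

# inputs only supplies the static shape (N, T) = (32, 10).
inputs = tf.placeholder(tf.int32, shape=(32, 10))
pe = positional_encoding(inputs, num_units=512, zero_pad=False, scale=False)
# pe has shape (32, 10, 512): one sinusoidal vector per position,
# identical across the batch, added to the token embeddings later.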
3. multihead_attention
def multihead_attention(queries,
                        keys,
                        num_units=None,
                        num_heads=8,
                        dropout_rate=0,
                        is_training=True,
                        causality=False,
                        scope="multihead_attention",
                        reuse=None):
    '''Applies multihead attention.

    Args:
      queries: A 3d tensor with shape of [N, T_q, C_q].
      keys: A 3d tensor with shape of [N, T_k, C_k].
      num_units: A scalar. Attention size.
      dropout_rate: A floating point number.
      is_training: Boolean. Controller of mechanism for dropout.
      causality: Boolean. If true, units that reference the future are masked.
      num_heads: An int. Number of heads.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
      A 3d tensor with shape of (N, T_q, C)
    '''
    with tf.variable_scope(scope, reuse=reuse):
        # Set the fall back option for num_units
        if num_units is None:
            num_units = queries.get_shape().as_list()[-1]

        # Linear projections
        Q = tf.layers.dense(queries, num_units, activation=tf.nn.relu)  # (N, T_q, C); C is num_units, which falls back to C_q here
        K = tf.layers.dense(keys, num_units, activation=tf.nn.relu)     # (N, T_k, C)
        V = tf.layers.dense(keys, num_units, activation=tf.nn.relu)     # (N, T_k, C)

        # Split and concat
        Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0)  # (h*N, T_q, C/h)
        K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0)  # (h*N, T_k, C/h)
        V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0)  # (h*N, T_k, C/h)

        # Multiplication
        outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))  # (h*N, T_q, T_k)

        # Scale
        outputs = outputs / (K_.get_shape().as_list()[-1] ** 0.5)

        # Key Masking
        key_masks = tf.sign(tf.reduce_sum(tf.abs(keys), axis=-1))  # (N, T_k)
        key_masks = tf.tile(key_masks, [num_heads, 1])  # (h*N, T_k)
        key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1])  # (h*N, T_q, T_k)

        # paddings has the same shape as outputs and every entry is a very large
        # negative number. tf.where replaces the attention score with that value
        # wherever key_masks is 0 (i.e. the key position is padding) and keeps the
        # original score otherwise. The shape stays (h*N, T_q, T_k); only the
        # scores of masked keys become extremely small.
        paddings = tf.ones_like(outputs)*(-2**32+1)
        outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs)  # (h*N, T_q, T_k)

        # Causality = Future blinding
        if causality:  # whether to block attention to future positions
            diag_vals = tf.ones_like(outputs[0, :, :])  # (T_q, T_k)
            tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense()  # (T_q, T_k)
            masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(outputs)[0], 1, 1])  # (h*N, T_q, T_k)

            paddings = tf.ones_like(masks)*(-2**32+1)
            outputs = tf.where(tf.equal(masks, 0), paddings, outputs)  # (h*N, T_q, T_k)

        # Activation
        outputs = tf.nn.softmax(outputs)  # (h*N, T_q, T_k)

        # Query Masking
        query_masks = tf.sign(tf.reduce_sum(tf.abs(queries), axis=-1))  # (N, T_q)
        query_masks = tf.tile(query_masks, [num_heads, 1])  # (h*N, T_q)
        query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]])  # (h*N, T_q, T_k)
        outputs *= query_masks  # broadcasting. (h*N, T_q, T_k); the upstream comment said C, which should be T_k

        # Dropouts
        outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=tf.convert_to_tensor(is_training))

        # Weighted sum
        outputs = tf.matmul(outputs, V_)  # (h*N, T_q, C/h)

        # Restore shape
        outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)  # (N, T_q, C)

        # Residual connection
        outputs += queries

        # Normalize
        outputs = normalize(outputs)  # (N, T_q, C)

    return outputs
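To see what the causality branch does, here is a small NumPy illustration of the lower-triangular mask for T_q = T_k = 4 (position i may only attend to positions <= i); the values and size are made up for illustration:

import numpy as np

T = 4
tril = np.tril(np.ones((T, T)))  # plays the role of LinearOperatorLowerTriangular(...).to_dense()
print(tril)
# [[1. 0. 0. 0.]
#  [1. 1. 0. 0.]
#  [1. 1. 1. 0.]
#  [1. 1. 1. 1.]]
# Scores where tril == 0 are replaced with a huge negative number, so after
# the softmax the attention weight on future positions is effectively zero.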
4. feedforward
def feedforward(inputs,
                num_units=[2048, 512],
                scope="multihead_attention",
                reuse=None):
    '''Point-wise feed forward net.

    Args:
      inputs: A 3d tensor with shape of [N, T, C].
      num_units: A list of two integers.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
      A 3d tensor with the same shape and dtype as inputs
    '''
    with tf.variable_scope(scope, reuse=reuse):
        # Inner layer
        params = {"inputs": inputs, "filters": num_units[0], "kernel_size": 1,
                  "activation": tf.nn.relu, "use_bias": True}
        outputs = tf.layers.conv1d(**params)

        # Readout layer
        params = {"inputs": outputs, "filters": num_units[1], "kernel_size": 1,
                  "activation": None, "use_bias": True}
        outputs = tf.layers.conv1d(**params)

        # Residual connection
        outputs += inputs

        # Normalize
        outputs = normalize(outputs)

    return outputs
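A conv1d with kernel_size=1 is just a position-wise dense layer applied independently at every time step. An equivalent sketch using tf.layers.dense (the helper name is made up; this is not code from the repo):

def feedforward_dense_sketch(inputs, num_units=[2048, 512]):
    # Same computation as the conv1d version: each time step is projected
    # C -> num_units[0] -> num_units[1] with a ReLU in between.
    outputs = tf.layers.dense(inputs, num_units[0], activation=tf.nn.relu)
    outputs = tf.layers.dense(outputs, num_units[1], activation=None)
    outputs += inputs           # residual connection, requires num_units[1] == C
    return normalize(outputs)   # layer norm, as in the original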
5. normalize
def normalize(inputs,
              epsilon=1e-8,
              scope="ln",
              reuse=None):
    '''Applies layer normalization.

    Args:
      inputs: A tensor with 2 or more dimensions, where the first dimension has `batch_size`.
      epsilon: A floating number. A very small number for preventing ZeroDivision Error.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
      A tensor with the same shape and data dtype as `inputs`.
    '''
    with tf.variable_scope(scope, reuse=reuse):
        inputs_shape = inputs.get_shape()
        params_shape = inputs_shape[-1:]

        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
        beta = tf.Variable(tf.zeros(params_shape))
        gamma = tf.Variable(tf.ones(params_shape))
        normalized = (inputs - mean) / ((variance + epsilon) ** (.5))
        outputs = gamma * normalized + beta

    return outputs
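The same computation in plain NumPy for a single feature vector x, with gamma = 1 and beta = 0 (the numbers are just an example):

import numpy as np

x = np.array([1.0, 2.0, 3.0, 4.0])
mean, var = x.mean(), x.var()
normalized = (x - mean) / np.sqrt(var + 1e-8)
# mean = 2.5, var = 1.25 -> normalized ~ [-1.342, -0.447, 0.447, 1.342]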
6. encoder-decoder
with tf.variable_scope("encoder"):
    ## Embedding
    self.enc = embedding(self.x,
                         vocab_size=len(de2idx),
                         num_units=hp.hidden_units,
                         scale=True,
                         scope="enc_embed")

    # key_masks = tf.expand_dims(tf.sign(tf.reduce_sum(tf.abs(self.enc), axis=-1)), -1)

    ## Positional Encoding
    if hp.sinusoid:
        self.enc += tf.cast(positional_encoding(self.x,
                                                num_units=hp.hidden_units,
                                                zero_pad=False,
                                                scale=False,
                                                scope="enc_pe"), tf.float32)
    else:
        self.enc += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0), [tf.shape(self.x)[0], 1]),
                              vocab_size=hp.maxlen,
                              num_units=hp.hidden_units,
                              zero_pad=False,
                              scale=False,
                              scope="enc_pe")

    # self.enc *= key_masks

    ## Dropout
    self.enc = tf.layers.dropout(self.enc,
                                 rate=hp.dropout_rate,
                                 training=tf.convert_to_tensor(is_training))

    ## Blocks
    for i in range(hp.num_blocks):
        with tf.variable_scope("num_blocks_{}".format(i)):
            ### Multihead Attention
            self.enc = multihead_attention(queries=self.enc,
                                           keys=self.enc,
                                           num_units=hp.hidden_units,
                                           num_heads=hp.num_heads,
                                           dropout_rate=hp.dropout_rate,
                                           is_training=is_training,
                                           causality=False)

            ### Feed Forward
            self.enc = feedforward(self.enc, num_units=[4*hp.hidden_units, hp.hidden_units])

# Decoder
with tf.variable_scope("decoder"):
    ## Embedding
    self.dec = embedding(self.decoder_inputs,
                         vocab_size=len(en2idx),
                         num_units=hp.hidden_units,
                         scale=True,
                         scope="dec_embed")
    self.dec_ = self.dec

    # key_masks = tf.expand_dims(tf.sign(tf.reduce_sum(tf.abs(self.dec), axis=-1)), -1)

    ## Positional Encoding
    if hp.sinusoid:
        self.dec += tf.cast(positional_encoding(self.decoder_inputs,
                                                num_units=hp.hidden_units,
                                                zero_pad=False,
                                                scale=False,
                                                scope="dec_pe"), tf.float32)
    else:
        self.dec += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]), 0), [tf.shape(self.decoder_inputs)[0], 1]),
                              vocab_size=hp.maxlen,
                              num_units=hp.hidden_units,
                              zero_pad=False,
                              scale=False,
                              scope="dec_pe")

    # self.dec *= key_masks

    ## Dropout
    self.dec = tf.layers.dropout(self.dec,
                                 rate=hp.dropout_rate,
                                 training=tf.convert_to_tensor(is_training))

    ## Blocks
    for i in range(hp.num_blocks):
        with tf.variable_scope("num_blocks_{}".format(i)):
            ## Multihead Attention ( self-attention)
            self.dec = multihead_attention(queries=self.dec,
                                           keys=self.dec,
                                           num_units=hp.hidden_units,
                                           num_heads=hp.num_heads,
                                           dropout_rate=hp.dropout_rate,
                                           is_training=is_training,
                                           causality=True,
                                           scope="self_attention")

            ## Multihead Attention ( vanilla attention)
            self.dec = multihead_attention(queries=self.dec,
                                           keys=self.enc,
                                           num_units=hp.hidden_units,
                                           num_heads=hp.num_heads,
                                           dropout_rate=hp.dropout_rate,
                                           is_training=is_training,
                                           causality=False,
                                           scope="vanilla_attention")

            ## Feed Forward
            self.dec = feedforward(self.dec, num_units=[4*hp.hidden_units, hp.hidden_units])
# Final linear projection
self.logits = tf.layers.dense(self.dec, len(en2idx))
self.preds = tf.to_int32(tf.arg_max(self.logits, dimension=-1))
self.istarget = tf.to_float(tf.not_equal(self.y, 0))
self.acc = tf.reduce_sum(tf.to_float(tf.equal(self.preds, self.y))*self.istarget) / (tf.reduce_sum(self.istarget))
tf.summary.scalar('acc', self.acc)
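The graph above assumes a hyperparameter object hp roughly like the sketch below. The names are the ones referenced in the code; the values are purely illustrative (the real ones come from the project's hyperparameter file):

class Hyperparams:
    batch_size = 32
    lr = 0.0001
    maxlen = 10           # maximum sequence length
    hidden_units = 512    # d_model
    num_blocks = 6        # number of encoder/decoder blocks
    num_heads = 8
    dropout_rate = 0.1
    sinusoid = False      # True: fixed sinusoidal PE, False: learned positional embeddings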
7. train
if is_training:
    # Loss
    self.y_smoothed = label_smoothing(tf.one_hot(self.y, depth=len(en2idx)))
    self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y_smoothed)
    self.mean_loss = tf.reduce_sum(self.loss*self.istarget) / (tf.reduce_sum(self.istarget))

    # Training Scheme
    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8)
    self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step)

    # Summary
    tf.summary.scalar('mean_loss', self.mean_loss)
    self.merged = tf.summary.merge_all()
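label_smoothing applies the standard label-smoothing trick to the one-hot targets. A minimal sketch of that formula, assuming inputs of shape (N, T, V); the helper name and epsilon value here are illustrative:

def label_smoothing_sketch(inputs, epsilon=0.1):
    # Each 1 in the one-hot targets becomes 1 - epsilon + epsilon/V and each 0
    # becomes epsilon/V, which keeps the model from becoming over-confident
    # on the gold token.
    V = inputs.get_shape().as_list()[-1]  # vocabulary size
    return ((1 - epsilon) * inputs) + (epsilon / V)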