对抗训练简述&应用

本文简述对抗训练在图像领域的实践方法；关于对抗训练的理论及其在NLP中的应用，请参阅参考文献【1】。

可以采用两种方法：

1. 先在原始样本上训练模型，再在对抗样本上微调。
2. 混合正常样本和对抗样本一起训练。

完成模型训练后，构造对抗样本

from tensorflow.keras.losses import MSE
import tensorflow as tf

def generate_image_adversary(model, image, label, eps=2 / 255.0):
    """Perturb ``image`` along the sign of the loss gradient.

    The input is cast to float32, the MSE loss between ``label`` and the
    model's prediction is differentiated with respect to the input, and the
    input is shifted by ``eps`` in the direction of the gradient's sign
    (a fast-gradient-sign style perturbation).

    Args:
        model: Callable invoked as ``model(image)`` to produce predictions.
        image: Input to perturb; cast to ``tf.float32`` before use.
        label: Target passed to the MSE loss together with the prediction.
        eps: Size of the step taken along the signed gradient.

    Returns:
        The perturbed input, converted with ``.numpy()``.
    """
    image = tf.cast(image, tf.float32)

    with tf.GradientTape() as grad_tape:
        # The input is not a trainable variable, so it has to be watched
        # explicitly for the tape to track gradients with respect to it.
        grad_tape.watch(image)
        prediction = model(image)
        loss = tf.keras.losses.MSE(label, prediction)

    # Only the direction (sign) of the gradient is used; eps sets the step.
    perturbation = tf.sign(grad_tape.gradient(loss, image)) * eps
    return (image + perturbation).numpy()

def generate_adversarial_batch(model, dataset, eps=0.01):
    """Yield ``(adversarial_images, labels)`` batches indefinitely.

    ``dataset`` is iterated over and over; each ``(images, labels)`` element
    is passed through ``generate_image_adversary`` and the perturbed images
    are yielded together with the untouched labels. The generator never
    terminates on its own, so the consumer decides when to stop.

    Args:
        model: Model forwarded to ``generate_image_adversary``.
        dataset: Re-iterable source of ``(images, labels)`` pairs.
        eps: Perturbation step size forwarded to ``generate_image_adversary``.
    """
    while True:
        for batch in dataset:
            batch_images, batch_labels = batch
            perturbed = generate_image_adversary(
                model, batch_images, batch_labels, eps=eps
            )
            yield perturbed, batch_labels

混合样本，一起训练

做法与上述方法相同，只需在构造批样本时混合正常样本和对抗样本即可。

train

# Endless generator of (adversarial images, labels) batches built from
# train_ds with eps=0.1. `model` and `train_ds` are not defined in this
# snippet -- presumably created earlier; TODO confirm.
adv_ds = generate_adversarial_batch(model, train_ds, eps=0.1)

@tf.function
def train_loop(features, labels, training=False):
    """Run one optimization step on a single batch.

    Relies on the module-level ``model``, ``loss_func`` and ``optimizer``.

    Args:
        features: Batch of inputs fed to ``model``.
        labels: Targets passed to ``loss_func`` alongside the model output.
        training: Forwarded positionally as the second argument of ``model``.

    Returns:
        A ``(loss, predictions)`` tuple for the batch.
    """
    with tf.GradientTape() as grad_tape:
        # Forward pass and loss are recorded on the tape for differentiation.
        outputs = model(features, training)
        batch_loss = loss_func(labels, outputs)

    # Read the variables only after the forward pass, as the original does.
    trainable = model.trainable_variables
    grads = grad_tape.gradient(batch_loss, trainable)
    optimizer.apply_gradients(zip(grads, trainable))
    return batch_loss, outputs

# Training setup: binary cross-entropy loss, binary accuracy metric, Adam
# optimizer, and a running mean of the loss. loss_func and optimizer are the
# globals that train_loop reads.
loss_func = tf.keras.losses.BinaryCrossentropy()
accuracy = tf.keras.metrics.BinaryAccuracy()
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
# Despite the name, this metric is never reset in the visible code, so it
# averages the loss over every step seen so far.
epoch_loss_avg = tf.keras.metrics.Mean()

# NOTE(review): adv_ds is produced by generate_adversarial_batch, which loops
# over the dataset forever, so this loop has no stopping condition of its own
# and must be interrupted externally (or bounded by the caller).
for step, (x, y) in enumerate(adv_ds):
    # Debug aid: uncomment to inspect the input batch.
    # print("Input: {}".format(x))

    # One optimization step on the adversarial batch; True is the training flag.
    loss, y_ = train_loop(x, y, True)

    # Track progress: accumulate the running loss mean and accuracy.
    epoch_loss_avg(loss)
    accuracy(y, y_)

    # Report the accumulated metrics every 100 steps.
    if step % 100 == 0:
        print("Iteration step: {}; Loss: {:.3f}, Accuracy: {:.3%}".format(step,
                                                                      epoch_loss_avg.result(),
                                                                      accuracy.result()))