flax 04 MNIST的例子
这里通过一个完整的MNIST例子,整体地看一下flax是如何工作的。
导包
import jax
import jax.numpy as jnp # JAX NumPy
from flax import linen as nn # The Linen API
from flax.training import train_state # Useful dataclass to keep train state
import numpy as np # Ordinary NumPy
import optax # Optimizers
import tensorflow_datasets as tfds # TFDS for MNIST
这个没啥好说的,几个没见过的在后面看着怎么用就行了
定义网络
class CNN(nn.Module):
    """A small convolutional network that outputs 10 values per example.

    Two conv -> ReLU -> average-pool stages (32 then 64 features, 3x3
    kernels, 2x2 pooling with stride 2) feed a flatten step, a 256-unit
    dense layer with ReLU, and a final 10-unit dense layer.
    """

    @nn.compact
    def __call__(self, x):
        """Apply the network to ``x`` and return the final dense output.

        The first axis of ``x`` is preserved when flattening; all remaining
        axes are merged into one.
        """
        # Two identical stages that differ only in the number of features.
        for conv_features in (32, 64):
            x = nn.Conv(features=conv_features, kernel_size=(3, 3))(x)
            x = nn.relu(x)
            x = nn.avg_pool(x, window_shape=(2, 2), strides=(2, 2))

        leading_dim = x.shape[0]
        x = x.reshape((leading_dim, -1))  # flatten everything but axis 0

        hidden = nn.relu(nn.Dense(features=256)(x))
        return nn.Dense(features=10)(hidden)
之前的文章说过了,不再赘述
定义损失
def cross_entropy_loss(*, logits, labels):
    """Return the mean softmax cross-entropy between ``logits`` and ``labels``.

    Both arguments are keyword-only.

    Args:
      logits: Per-class scores; the last axis indexes the classes.
      labels: Integer class ids; they are one-hot encoded before the loss
        is computed.

    Returns:
      The mean of the per-example cross-entropy values.
    """
    # Take the class count from the logits instead of hard-coding 10, so the
    # one-hot width always matches the logits' last axis.
    num_classes = logits.shape[-1]
    labels_onehot = jax.nn.one_hot(labels, num_classes=num_classes)
    return optax.softmax_cross_entropy(
        logits=logits, labels=labels_onehot).mean()
应该也都看得懂。参数列表开头的 `*` 表示它后面的参数都是仅限关键字参数(keyword-only),调用时必须写成 `cross_entropy_loss(logits=..., labels=...)`,不能按位置传参。相反,如果在参数列表最后加上 `/`,变成 `def cross_entropy_loss(logits, labels, /):`,
那就只能按位置传参,不能使用参数名传参;`/` 这种写法需要python3.8及以上。
计算
def compute_metrics(*, logits, labels):
    """Return a dict holding the ``'loss'`` and ``'accuracy'`` of a batch.

    The loss comes from ``cross_entropy_loss``; the accuracy is the mean of
    the element-wise comparison between the argmax of ``logits`` over the
    last axis and ``labels``.
    """
    predicted_classes = jnp.argmax(logits, -1)
    return {
        'loss': cross_entropy_loss(logits=logits, labels=labels),
        'accuracy': jnp.mean(predicted_classes == labels),
    }
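指标的计算没有和上面的损失函数写在一起:损失函数之后要交给jax.value_and_grad求导,它需要返回一个标量损失(其余输出只能作为辅助输出),而准确率这类指标只是用来记录的。把两者合并成一步也是可以的,但需要配合has_aux=True把指标作为辅助输出一起返回。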
数据
def get_datasets():
    """Load the MNIST train and test splits into memory.

    Each split is fetched as one batch (``batch_size=-1``) and converted
    with ``tfds.as_numpy``. The ``'image'`` entry is then cast to float32
    and divided by 255.

    Returns:
      A ``(train_ds, test_ds)`` tuple.
    """
    builder = tfds.builder('mnist')
    builder.download_and_prepare()

    def _load(split_name):
        # Fetch the whole split at once as NumPy data.
        return tfds.as_numpy(
            builder.as_dataset(split=split_name, batch_size=-1))

    train_ds = _load('train')
    test_ds = _load('test')

    for dataset in (train_ds, test_ds):
        dataset['image'] = jnp.float32(dataset['image']) / 255.

    return train_ds, test_ds
这个没啥好说的,TF的东西记得需要转换为jnp的数据
创建训练状态
在flax的模式中,整个训练过程的状态是用一个dataclass来表示的,其中包括了:step number, parameters, 和optimizer state。(它和torch中的model.train()并不是一回事,更接近于把模型参数和优化器状态打包在一起的一个对象。)
如果想要添加更多的训练所要持续使用的对象便于跟踪,flax给的建议是创建一个flax.training.train_state.TrainState
的子类,如果只是使用一些基本的对象的话,那么直接使用就可以。
def create_train_state(rng, learning_rate, momentum):
    """Build the initial `TrainState` for a freshly initialised `CNN`.

    Parameters are initialised by calling the model's ``init`` with ``rng``
    and an all-ones input of shape ``[1, 28, 28, 1]``; the optimizer is
    ``optax.sgd(learning_rate, momentum)``.
    """
    model = CNN()
    dummy_input = jnp.ones([1, 28, 28, 1])
    variables = model.init(rng, dummy_input)
    optimizer = optax.sgd(learning_rate, momentum)
    return train_state.TrainState.create(
        apply_fn=model.apply,
        params=variables['params'],
        tx=optimizer,
    )
训练的过程就是对这个TrainState对象操作的过程,创建这个对象需要三个参数:前向传播函数(apply_fn)、模型参数(params)和优化器(tx)。
训练
@jax.jit
def train_step(state, batch):
    """Train for a single step.

    Args:
      state: The current train state; its ``apply_fn``, ``params`` and
        ``apply_gradients`` are used.
      batch: A mapping with ``'image'`` and ``'label'`` entries.

    Returns:
      A ``(state, metrics)`` tuple: the state after the gradient update and
      the metrics dict computed from this step's logits.
    """
    def loss_fn(params):
        # Use the forward function stored on the state instead of building
        # a new CNN() here, so the step runs the model the state was
        # created with.
        logits = state.apply_fn({'params': params}, batch['image'])
        loss = cross_entropy_loss(logits=logits, labels=batch['label'])
        # The logits are returned as auxiliary output so the metrics can be
        # computed without a second forward pass.
        return loss, logits

    grad_fn = jax.value_and_grad(loss_fn, has_aux=True)
    (_, logits), grads = grad_fn(state.params)
    state = state.apply_gradients(grads=grads)
    metrics = compute_metrics(logits=logits, labels=batch['label'])
    return state, metrics
这个函数:
- 用当前的模型参数对一个batch做前向传播,得到logits
- 计算损失函数
- 计算梯度
- 使用梯度优化模型参数
同时使用jit(just-in-time)来对整个过程进行加速。
评估
@jax.jit
def eval_step(params, batch):
    """Compute the metrics of ``params`` on one batch, without any update.

    ``batch`` must provide ``'image'`` and ``'label'`` entries.
    """
    variables = {'params': params}
    predictions = CNN().apply(variables, batch['image'])
    return compute_metrics(logits=predictions, labels=batch['label'])
训练函数
def train_epoch(state, train_ds, batch_size, epoch, rng):
    """Train for a single epoch.

    The dataset indices are shuffled with ``rng`` and split into full
    batches of ``batch_size``; a trailing incomplete batch is skipped.
    ``train_step`` runs on every batch, and the mean of each metric over
    the batches is printed.

    Args:
      state: The train state passed to, and updated by, ``train_step``.
      train_ds: A mapping of arrays that includes an ``'image'`` entry;
        every entry is indexed along its first axis.
      batch_size: Number of examples per step.
      epoch: Epoch number, used only in the printed summary.
      rng: PRNG key used to shuffle the dataset.

    Returns:
      The train state after the last step of the epoch.

    Raises:
      ValueError: If ``batch_size`` is not positive, or is larger than the
        dataset so that not even one full batch can be formed.
    """
    train_ds_size = len(train_ds['image'])
    if batch_size <= 0:
        raise ValueError(
            f'batch_size must be positive, got {batch_size}')
    steps_per_epoch = train_ds_size // batch_size
    if steps_per_epoch == 0:
        # Without this check no step runs and averaging the (empty) metrics
        # list below fails with an unhelpful IndexError.
        raise ValueError(
            f'batch_size ({batch_size}) exceeds the dataset size '
            f'({train_ds_size}); no full batch can be formed')

    perms = jax.random.permutation(rng, train_ds_size)
    perms = perms[:steps_per_epoch * batch_size]  # skip incomplete batch
    perms = perms.reshape((steps_per_epoch, batch_size))

    batch_metrics = []
    for perm in perms:
        batch = {k: v[perm, ...] for k, v in train_ds.items()}
        state, metrics = train_step(state, batch)
        batch_metrics.append(metrics)

    # Compute the mean of each metric across the batches of this epoch.
    batch_metrics_np = jax.device_get(batch_metrics)
    epoch_metrics_np = {
        k: np.mean([metrics[k] for metrics in batch_metrics_np])
        for k in batch_metrics_np[0]}

    print('train epoch: %d, loss: %.4f, accuracy: %.2f' % (
        epoch, epoch_metrics_np['loss'], epoch_metrics_np['accuracy'] * 100))

    return state
整个的完整代码github