Using Differential Learning Rates
Differential learning rates across model layers
Typically used when fine-tuning pretrained models such as BERT; it can be combined with warmup (see the warmup note).
tensorflow
Just a few lines are needed; see the official documentation [1]:
import tensorflow as tf
import tensorflow_addons as tfa

# One optimizer per group of layers: a smaller lr for the first layer,
# a larger lr for the remaining layers.
optimizers = [
    tf.keras.optimizers.Adam(learning_rate=1e-4),
    tf.keras.optimizers.Adam(learning_rate=1e-2)
]
optimizers_and_layers = [(optimizers[0], model.layers[0]),
                         (optimizers[1], model.layers[1:])]
optimizer = tfa.optimizers.MultiOptimizer(optimizers_and_layers)
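As a quick sanity check, the resulting multi-optimizer drops into an ordinary compile/fit call. A minimal sketch; the two-layer Sequential model and the random data below are purely hypothetical placeholders:

import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa

# Hypothetical two-layer model, just to exercise per-layer optimizers.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation="relu", input_shape=(8,)),
    tf.keras.layers.Dense(1),
])
optimizers_and_layers = [
    (tf.keras.optimizers.Adam(learning_rate=1e-4), model.layers[0]),   # first layer: small lr
    (tf.keras.optimizers.Adam(learning_rate=1e-2), model.layers[1:]),  # remaining layers: larger lr
]
model.compile(optimizer=tfa.optimizers.MultiOptimizer(optimizers_and_layers), loss="mse")
model.fit(np.random.rand(32, 8), np.random.rand(32, 1), epochs=1, verbose=0)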
pytorch
Several different ways to set this up. The first builds one parameter group per encoder layer, decaying the learning rate from the top layer downwards, with a separate (usually larger) lr for the task-specific head:
from torch.optim import AdamW  # or: from transformers import AdamW

def get_parameters(model, model_init_lr, multiplier, classifier_lr):
    parameters = []
    lr = model_init_lr
    # Walk from the top encoder layer down to the bottom one
    # (BERT-base has encoder layers 0-11); each lower layer's lr
    # is the one above it multiplied by `multiplier`.
    for layer in range(11, -1, -1):
        layer_params = {
            'params': [p for n, p in model.named_parameters()
                       if f'encoder.layer.{layer}.' in n],
            'lr': lr
        }
        parameters.append(layer_params)
        lr *= multiplier
    # Task-specific head parameters get their own lr; the name filters
    # below depend on how the head modules are actually named.
    classifier_params = {
        'params': [p for n, p in model.named_parameters()
                   if 'layer_norm' in n or 'linear' in n or 'pooling' in n],
        'lr': classifier_lr
    }
    parameters.append(classifier_params)
    return parameters

parameters = get_parameters(model, 2e-5, 0.95, 1e-4)
optimizer = AdamW(parameters)
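As noted at the top, this combines naturally with warmup. A minimal sketch using transformers' get_linear_schedule_with_warmup; the step counts are hypothetical, and every parameter group keeps its own base lr while the scheduler scales them all by the same factor:

from transformers import get_linear_schedule_with_warmup

num_training_steps = 1000  # hypothetical: epochs * steps_per_epoch
num_warmup_steps = 100     # hypothetical: e.g. 10% of the total steps

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

# In the training loop: loss.backward(); optimizer.step(); scheduler.step(); optimizer.zero_grad()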
The second uses the same parameter-group mechanism to disable weight decay for bias and LayerNorm parameters:

param_optimizer = list(model.named_parameters())  # (name, parameter) pairs
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        # Regular weights: apply weight decay.
        "params": [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.01,
    },
    {
        # Biases and LayerNorm parameters: no weight decay.
        "params": [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)
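The two groupings above can also be combined, so that every encoder layer contributes a decayed and a non-decayed group at its own learning rate. A rough sketch reusing `model`, `no_decay` and AdamW from above (the helper name here is made up):

def get_grouped_parameters(model, model_init_lr=2e-5, multiplier=0.95):
    groups = []
    lr = model_init_lr
    for layer in range(11, -1, -1):
        layer_named = [(n, p) for n, p in model.named_parameters()
                       if f'encoder.layer.{layer}.' in n]
        # Split each layer into decayed / non-decayed parameters,
        # both sharing the layer-specific lr.
        groups.append({'params': [p for n, p in layer_named
                                  if not any(nd in n for nd in no_decay)],
                       'lr': lr, 'weight_decay': 0.01})
        groups.append({'params': [p for n, p in layer_named
                                  if any(nd in n for nd in no_decay)],
                       'lr': lr, 'weight_decay': 0.0})
        lr *= multiplier
    return groups

optimizer = AdamW(get_grouped_parameters(model), lr=2e-5)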
The third simply passes explicit parameter groups to the optimizer for an ordinary nn.Module:

import torch.nn as nn
import torch.optim as optim

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.net1 = nn.Linear(2, 10)
        self.net2 = nn.Linear(10, 1)

    def forward(self, x):
        x = self.net1(x)
        x = self.net2(x)
        return x

net = Net()
# Define the options for each parameter group as a dict:
# net1 falls back to the default lr, net2 overrides it with its own lr.
optimizer = optim.SGD([
        {"params": net.net1.parameters()},
        {"params": net.net2.parameters(), "lr": 1e-5},
    ],
    lr=1e-2,  # default lr
)
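A quick way to confirm the groups took effect is to inspect optimizer.param_groups, where each group carries its own hyperparameters:

for i, group in enumerate(optimizer.param_groups):
    print(f"group {i}: lr={group['lr']}, num_params={len(group['params'])}")
# group 0: lr=0.01, num_params=2   (net1 weight + bias)
# group 1: lr=1e-05, num_params=2  (net2 weight + bias)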
references
[1] tfa.optimizers.MultiOptimizer. https://www.tensorflow.org/addons/api_docs/python/tfa/optimizers/MultiOptimizer
[2] Setting different learning rates for different layers in PyTorch. https://blog.csdn.net/qq_41554005/article/details/119898464
[3] A complete summary of the many tricks for training neural networks. https://juejin.cn/post/6844903711123193870