Deep Learning 32: A custom Keras callback to display the learning rate at each epoch in real time
I. The problem
Keras does not show the learning rate at each epoch in real time, which makes debugging harder. I actually needed this while debugging the problem described in Deep Learning 31: a summary of why different versions of Keras give different results for the same code.
II. A first solution
1. Add the following code to the Keras file callbacks.py:
class DisplayLearningRate(Callback):
    '''Display learning rate.
    '''
    def __init__(self):
        super(DisplayLearningRate, self).__init__()

    def on_epoch_begin(self, epoch, logs={}):
        assert hasattr(self.model.optimizer, 'lr'), \
            'Optimizer must have a "lr" attribute.'
        lr_now = K.get_value(self.model.optimizer.lr)
        print('Epoch %05d: Learning rate is %s' % (epoch, lr_now))
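Alternatively, you do not have to patch the installed Keras source at all: the same callback can be defined directly in your own training script by subclassing keras.callbacks.Callback. A minimal standalone sketch (it only assumes the optimizer exposes an `lr` backend variable, which all built-in Keras optimizers do):

from keras import backend as K
from keras.callbacks import Callback


class DisplayLearningRate(Callback):
    '''Print the value stored in optimizer.lr at the start of every epoch.'''

    def on_epoch_begin(self, epoch, logs={}):
        assert hasattr(self.model.optimizer, 'lr'), \
            'Optimizer must have a "lr" attribute.'
        print('Epoch %05d: learning rate is %s'
              % (epoch, K.get_value(self.model.optimizer.lr)))

With this standalone version, pass DisplayLearningRate() to model.fit directly instead of keras.callbacks.DisplayLearningRate().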
2. Use it as follows:
history = model.fit(X_train,
                    Y_train,
                    batch_size=batch_size,
                    nb_epoch=nb_epoch,
                    show_accuracy=False,
                    verbose=2,
                    validation_data=(X_test, Y_test),
                    callbacks=[
                        keras.callbacks.DisplayLearningRate(),
                        # saves the model to filepath after each epoch (with save_best_only=True,
                        # only when val_loss improves)
                        keras.callbacks.ModelCheckpoint(filepath, monitor='val_loss', verbose=0,
                                                        save_best_only=True, mode='auto'),
                        # stops training when the monitored value stops improving: once triggered
                        # (e.g. val_loss has not decreased compared with the previous epoch),
                        # training is aborted after `patience` further epochs without improvement
                        # keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=0, mode='auto'),
                    ])
III. Summary
After trying the method above, I found that the learning rate printed at every epoch is identical. The reason is that the callback prints the value stored in optimizer.lr, i.e. the learning rate set at initialization; when the learning rate is decayed during training, the decayed value is never assigned back to optimizer.lr, so the printed number never changes. To show the learning rate actually used in each epoch, we have to reproduce the computation the optimizer performs when it builds its updates (in get_updates).
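For reference, the quantity SGD actually uses for update number t (see the SGD.get_updates source pasted at the end of this post) is lr * 1/(1 + decay * iterations), where iterations counts parameter updates (batches), not epochs, and is never written back into optimizer.lr. A quick check of the arithmetic in plain Python, assuming lr=0.1 and decay=1e-1 as in the next section:

def decayed_lr(lr, decay, iterations):
    """Effective SGD learning rate after `iterations` parameter updates (batches)."""
    return lr * 1. / (1. + decay * iterations)


print(decayed_lr(0.1, 1e-1, 0))    # 0.1       (what the section II callback keeps printing)
print(decayed_lr(0.1, 1e-1, 10))   # 0.05
print(decayed_lr(0.1, 1e-1, 100))  # 0.00909...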
IV. The final solution
Instead, recompute the decayed learning rate inside the callback, exactly the way the optimizer does internally:

from keras import backend as K
from keras.callbacks import Callback
from keras.optimizers import SGD

# set decay to 1e-1 so the change in lr between epochs is easy to see
sgd = SGD(lr=0.1, decay=1e-1, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy',
              optimizer=sgd,
              metrics=['accuracy'])


class LossHistory(Callback):
    def on_epoch_begin(self, epoch, logs={}):
        optimizer = self.model.optimizer
        # recompute the decayed rate the same way SGD.get_updates does:
        # lr * (1 / (1 + decay * iterations)), where iterations counts batches
        lr = K.get_value(optimizer.lr) * (1. / (1. + K.get_value(optimizer.decay) *
                                                K.get_value(optimizer.iterations)))
        print('lr:', lr)


history = LossHistory()
model.fit(X_train, Y_train,
          batch_size=batch_size,
          nb_epoch=nb_epoch,
          callbacks=[history])
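An equivalent variant evaluates the decayed rate as a single symbolic expression instead of pulling the three values out separately; a sketch, assuming the backend's K.eval is available and the optimizer is the SGD instance above (the class name here is just illustrative):

from keras import backend as K
from keras.callbacks import Callback


class SGDLearningRateTracker(Callback):
    def on_epoch_end(self, epoch, logs={}):
        optimizer = self.model.optimizer
        # evaluate the same symbolic expression that SGD.get_updates builds internally
        lr = K.eval(optimizer.lr * (1. / (1. + optimizer.decay * optimizer.iterations)))
        print('lr at end of epoch %d: %.6f' % (epoch, lr))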
Reference: http://stackoverflow.com/questions/40144805/print-learning-rate-evary-epoch-in-sgd
For comparison, below I paste optimizers.py from keras==0.3.3 and from keras==1.2.0.
optimizers.py in keras==0.3.3:
from __future__ import absolute_import
from . import backend as K
import numpy as np
from .utils.generic_utils import get_from_module
from six.moves import zip


def clip_norm(g, c, n):
    if c > 0:
        g = K.switch(n >= c, g * c / n, g)
    return g


def kl_divergence(p, p_hat):
    return p_hat - p + p * K.log(p / p_hat)


class Optimizer(object):
    '''Abstract optimizer base class.

    Note: this is the parent class of all optimizers, not an actual optimizer
    that can be used for training models.

    All Keras optimizers support the following keyword arguments:

        clipnorm: float >= 0. Gradients will be clipped
            when their L2 norm exceeds this value.
        clipvalue: float >= 0. Gradients will be clipped
            when their absolute value exceeds this value.
    '''
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)
        self.updates = []

    def get_state(self):
        return [K.get_value(u[0]) for u in self.updates]

    def set_state(self, value_list):
        assert len(self.updates) == len(value_list)
        for u, v in zip(self.updates, value_list):
            K.set_value(u[0], v)

    def get_updates(self, params, constraints, loss):
        raise NotImplementedError

    def get_gradients(self, loss, params):
        grads = K.gradients(loss, params)
        if hasattr(self, 'clipnorm') and self.clipnorm > 0:
            norm = K.sqrt(sum([K.sum(K.square(g)) for g in grads]))
            grads = [clip_norm(g, self.clipnorm, norm) for g in grads]
        if hasattr(self, 'clipvalue') and self.clipvalue > 0:
            grads = [K.clip(g, -self.clipvalue, self.clipvalue) for g in grads]
        return grads

    def get_config(self):
        return {"name": self.__class__.__name__}


class SGD(Optimizer):
    '''Stochastic gradient descent, with support for momentum,
    decay, and Nesterov momentum.

    # Arguments
        lr: float >= 0. Learning rate.
        momentum: float >= 0. Parameter updates momentum.
        decay: float >= 0. Learning rate decay over each update.
        nesterov: boolean. Whether to apply Nesterov momentum.
    '''
    def __init__(self, lr=0.01, momentum=0., decay=0., nesterov=False,
                 *args, **kwargs):
        super(SGD, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.iterations = K.variable(0.)
        self.lr = K.variable(lr)
        self.momentum = K.variable(momentum)
        self.decay = K.variable(decay)

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        lr = self.lr * (1.0 / (1.0 + self.decay * self.iterations))
        self.updates = [(self.iterations, self.iterations + 1.)]

        for p, g, c in zip(params, grads, constraints):
            m = K.variable(np.zeros(K.get_value(p).shape))  # momentum
            v = self.momentum * m - lr * g  # velocity
            self.updates.append((m, v))

            if self.nesterov:
                new_p = p + self.momentum * v - lr * g
            else:
                new_p = p + v

            self.updates.append((p, c(new_p)))  # apply constraints
        return self.updates

    def get_config(self):
        return {"name": self.__class__.__name__,
                "lr": float(K.get_value(self.lr)),
                "momentum": float(K.get_value(self.momentum)),
                "decay": float(K.get_value(self.decay)),
                "nesterov": self.nesterov}


class RMSprop(Optimizer):
    '''RMSProp optimizer.

    It is recommended to leave the parameters of this optimizer
    at their default values.

    This optimizer is usually a good choice for recurrent
    neural networks.

    # Arguments
        lr: float >= 0. Learning rate.
        rho: float >= 0.
        epsilon: float >= 0. Fuzz factor.
    '''
    def __init__(self, lr=0.001, rho=0.9, epsilon=1e-6, *args, **kwargs):
        super(RMSprop, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.lr = K.variable(lr)
        self.rho = K.variable(rho)

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        accumulators = [K.variable(np.zeros(K.get_value(p).shape)) for p in params]
        self.updates = []

        for p, g, a, c in zip(params, grads, accumulators, constraints):
            # update accumulator
            new_a = self.rho * a + (1 - self.rho) * K.square(g)
            self.updates.append((a, new_a))

            new_p = p - self.lr * g / K.sqrt(new_a + self.epsilon)
            self.updates.append((p, c(new_p)))  # apply constraints
        return self.updates

    def get_config(self):
        return {"name": self.__class__.__name__,
                "lr": float(K.get_value(self.lr)),
                "rho": float(K.get_value(self.rho)),
                "epsilon": self.epsilon}


class Adagrad(Optimizer):
    '''Adagrad optimizer.

    It is recommended to leave the parameters of this optimizer
    at their default values.

    # Arguments
        lr: float >= 0. Learning rate.
        epsilon: float >= 0.
    '''
    def __init__(self, lr=0.01, epsilon=1e-6, *args, **kwargs):
        super(Adagrad, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.lr = K.variable(lr)

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        accumulators = [K.variable(np.zeros(K.get_value(p).shape)) for p in params]
        self.updates = []

        for p, g, a, c in zip(params, grads, accumulators, constraints):
            new_a = a + K.square(g)  # update accumulator
            self.updates.append((a, new_a))
            new_p = p - self.lr * g / K.sqrt(new_a + self.epsilon)
            self.updates.append((p, c(new_p)))  # apply constraints
        return self.updates

    def get_config(self):
        return {"name": self.__class__.__name__,
                "lr": float(K.get_value(self.lr)),
                "epsilon": self.epsilon}


class Adadelta(Optimizer):
    '''Adadelta optimizer.

    It is recommended to leave the parameters of this optimizer
    at their default values.

    # Arguments
        lr: float >= 0. Learning rate. It is recommended to leave it at the default value.
        rho: float >= 0.
        epsilon: float >= 0. Fuzz factor.

    # References
        - [Adadelta - an adaptive learning rate method](http://arxiv.org/abs/1212.5701)
    '''
    def __init__(self, lr=1.0, rho=0.95, epsilon=1e-6, *args, **kwargs):
        super(Adadelta, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.lr = K.variable(lr)

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        accumulators = [K.variable(np.zeros(K.get_value(p).shape)) for p in params]
        delta_accumulators = [K.variable(np.zeros(K.get_value(p).shape)) for p in params]
        self.updates = []

        for p, g, a, d_a, c in zip(params, grads, accumulators,
                                   delta_accumulators, constraints):
            # update accumulator
            new_a = self.rho * a + (1 - self.rho) * K.square(g)
            self.updates.append((a, new_a))

            # use the new accumulator and the *old* delta_accumulator
            update = g * K.sqrt(d_a + self.epsilon) / K.sqrt(new_a + self.epsilon)

            new_p = p - self.lr * update
            self.updates.append((p, c(new_p)))  # apply constraints

            # update delta_accumulator
            new_d_a = self.rho * d_a + (1 - self.rho) * K.square(update)
            self.updates.append((d_a, new_d_a))
        return self.updates

    def get_config(self):
        return {"name": self.__class__.__name__,
                "lr": float(K.get_value(self.lr)),
                "rho": self.rho,
                "epsilon": self.epsilon}


class Adam(Optimizer):
    '''Adam optimizer.

    Default parameters follow those provided in the original paper.

    # Arguments
        lr: float >= 0. Learning rate.
        beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor.

    # References
        - [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8)
    '''
    def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8,
                 *args, **kwargs):
        super(Adam, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.iterations = K.variable(0)
        self.lr = K.variable(lr)
        self.beta_1 = K.variable(beta_1)
        self.beta_2 = K.variable(beta_2)

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        self.updates = [(self.iterations, self.iterations+1.)]

        t = self.iterations + 1
        lr_t = self.lr * K.sqrt(1 - K.pow(self.beta_2, t)) / (1 - K.pow(self.beta_1, t))

        for p, g, c in zip(params, grads, constraints):
            # zero init of moment
            m = K.variable(np.zeros(K.get_value(p).shape))
            # zero init of velocity
            v = K.variable(np.zeros(K.get_value(p).shape))

            m_t = (self.beta_1 * m) + (1 - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1 - self.beta_2) * K.square(g)
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append((m, m_t))
            self.updates.append((v, v_t))
            self.updates.append((p, c(p_t)))  # apply constraints
        return self.updates

    def get_config(self):
        return {"name": self.__class__.__name__,
                "lr": float(K.get_value(self.lr)),
                "beta_1": float(K.get_value(self.beta_1)),
                "beta_2": float(K.get_value(self.beta_2)),
                "epsilon": self.epsilon}


class Adamax(Optimizer):
    '''Adamax optimizer from Adam paper's Section 7. It is a variant
    of Adam based on the infinity norm.

    Default parameters follow those provided in the paper.

    # Arguments
        lr: float >= 0. Learning rate.
        beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor.

    # References
        - [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8)
    '''
    def __init__(self, lr=0.002, beta_1=0.9, beta_2=0.999, epsilon=1e-8,
                 *args, **kwargs):
        super(Adamax, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.iterations = K.variable(0)
        self.lr = K.variable(lr)
        self.beta_1 = K.variable(beta_1)
        self.beta_2 = K.variable(beta_2)

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        self.updates = [(self.iterations, self.iterations+1.)]

        t = self.iterations + 1
        lr_t = self.lr / (1 - K.pow(self.beta_1, t))

        for p, g, c in zip(params, grads, constraints):
            # zero init of 1st moment
            m = K.variable(np.zeros(K.get_value(p).shape))
            # zero init of exponentially weighted infinity norm
            u = K.variable(np.zeros(K.get_value(p).shape))

            m_t = (self.beta_1 * m) + (1 - self.beta_1) * g
            u_t = K.maximum(self.beta_2 * u, K.abs(g))
            p_t = p - lr_t * m_t / (u_t + self.epsilon)

            self.updates.append((m, m_t))
            self.updates.append((u, u_t))
            self.updates.append((p, c(p_t)))  # apply constraints
        return self.updates

    def get_config(self):
        return {"name": self.__class__.__name__,
                "lr": float(K.get_value(self.lr)),
                "beta_1": float(K.get_value(self.beta_1)),
                "beta_2": float(K.get_value(self.beta_2)),
                "epsilon": self.epsilon}


# aliases
sgd = SGD
rmsprop = RMSprop
adagrad = Adagrad
adadelta = Adadelta
adam = Adam
adamax = Adamax


def get(identifier, kwargs=None):
    return get_from_module(identifier, globals(), 'optimizer',
                           instantiate=True, kwargs=kwargs)
optimizers.py in keras==1.2.0:
from __future__ import absolute_import
from . import backend as K
from .utils.generic_utils import get_from_module
from six.moves import zip


def clip_norm(g, c, n):
    if c > 0:
        g = K.switch(n >= c, g * c / n, g)
    return g


def optimizer_from_config(config, custom_objects={}):
    all_classes = {
        'sgd': SGD,
        'rmsprop': RMSprop,
        'adagrad': Adagrad,
        'adadelta': Adadelta,
        'adam': Adam,
        'adamax': Adamax,
        'nadam': Nadam,
        'tfoptimizer': TFOptimizer,
    }
    class_name = config['class_name']
    if class_name in custom_objects:
        cls = custom_objects[class_name]
    else:
        if class_name.lower() not in all_classes:
            raise ValueError('Optimizer class not found:', class_name)
        cls = all_classes[class_name.lower()]
    return cls.from_config(config['config'])


class Optimizer(object):
    '''Abstract optimizer base class.

    Note: this is the parent class of all optimizers, not an actual optimizer
    that can be used for training models.

    All Keras optimizers support the following keyword arguments:

        clipnorm: float >= 0. Gradients will be clipped
            when their L2 norm exceeds this value.
        clipvalue: float >= 0. Gradients will be clipped
            when their absolute value exceeds this value.
    '''
    def __init__(self, **kwargs):
        allowed_kwargs = {'clipnorm', 'clipvalue'}
        for k in kwargs:
            if k not in allowed_kwargs:
                raise TypeError('Unexpected keyword argument '
                                'passed to optimizer: ' + str(k))
        self.__dict__.update(kwargs)
        self.updates = []
        self.weights = []

    def get_updates(self, params, constraints, loss):
        raise NotImplementedError

    def get_gradients(self, loss, params):
        grads = K.gradients(loss, params)
        if hasattr(self, 'clipnorm') and self.clipnorm > 0:
            norm = K.sqrt(sum([K.sum(K.square(g)) for g in grads]))
            grads = [clip_norm(g, self.clipnorm, norm) for g in grads]
        if hasattr(self, 'clipvalue') and self.clipvalue > 0:
            grads = [K.clip(g, -self.clipvalue, self.clipvalue) for g in grads]
        return grads

    def set_weights(self, weights):
        '''Sets the weights of the optimizer, from Numpy arrays.

        Should only be called after computing the gradients
        (otherwise the optimizer has no weights).

        # Arguments
            weights: a list of Numpy arrays. The number
                of arrays and their shape must match
                number of the dimensions of the weights
                of the optimizer (i.e. it should match the
                output of `get_weights`).
        '''
        params = self.weights
        weight_value_tuples = []
        param_values = K.batch_get_value(params)
        for pv, p, w in zip(param_values, params, weights):
            if pv.shape != w.shape:
                raise ValueError('Optimizer weight shape ' +
                                 str(pv.shape) +
                                 ' not compatible with '
                                 'provided weight shape ' + str(w.shape))
            weight_value_tuples.append((p, w))
        K.batch_set_value(weight_value_tuples)

    def get_weights(self):
        '''Returns the current weights of the optimizer,
        as a list of numpy arrays.
        '''
        return K.batch_get_value(self.weights)

    def get_config(self):
        config = {}
        if hasattr(self, 'clipnorm'):
            config['clipnorm'] = self.clipnorm
        if hasattr(self, 'clipvalue'):
            config['clipvalue'] = self.clipvalue
        return config

    @classmethod
    def from_config(cls, config):
        return cls(**config)


class SGD(Optimizer):
    '''Stochastic gradient descent, with support for momentum,
    learning rate decay, and Nesterov momentum.

    # Arguments
        lr: float >= 0. Learning rate.
        momentum: float >= 0. Parameter updates momentum.
        decay: float >= 0. Learning rate decay over each update.
        nesterov: boolean. Whether to apply Nesterov momentum.
    '''
    def __init__(self, lr=0.01, momentum=0., decay=0.,
                 nesterov=False, **kwargs):
        super(SGD, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.iterations = K.variable(0.)
        self.lr = K.variable(lr)
        self.momentum = K.variable(momentum)
        self.decay = K.variable(decay)
        self.inital_decay = decay

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        self.updates = []

        lr = self.lr
        if self.inital_decay > 0:
            lr *= (1. / (1. + self.decay * self.iterations))
            self.updates.append(K.update_add(self.iterations, 1))

        # momentum
        shapes = [K.get_variable_shape(p) for p in params]
        moments = [K.zeros(shape) for shape in shapes]
        self.weights = [self.iterations] + moments
        for p, g, m in zip(params, grads, moments):
            v = self.momentum * m - lr * g  # velocity
            self.updates.append(K.update(m, v))

            if self.nesterov:
                new_p = p + self.momentum * v - lr * g
            else:
                new_p = p + v

            # apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates

    def get_config(self):
        config = {'lr': float(K.get_value(self.lr)),
                  'momentum': float(K.get_value(self.momentum)),
                  'decay': float(K.get_value(self.decay)),
                  'nesterov': self.nesterov}
        base_config = super(SGD, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


class RMSprop(Optimizer):
    '''RMSProp optimizer.

    It is recommended to leave the parameters of this optimizer
    at their default values
    (except the learning rate, which can be freely tuned).

    This optimizer is usually a good choice for recurrent
    neural networks.

    # Arguments
        lr: float >= 0. Learning rate.
        rho: float >= 0.
        epsilon: float >= 0. Fuzz factor.
        decay: float >= 0. Learning rate decay over each update.
    '''
    def __init__(self, lr=0.001, rho=0.9, epsilon=1e-8, decay=0.,
                 **kwargs):
        super(RMSprop, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.lr = K.variable(lr)
        self.rho = K.variable(rho)
        self.decay = K.variable(decay)
        self.inital_decay = decay
        self.iterations = K.variable(0.)

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        shapes = [K.get_variable_shape(p) for p in params]
        accumulators = [K.zeros(shape) for shape in shapes]
        self.weights = accumulators
        self.updates = []

        lr = self.lr
        if self.inital_decay > 0:
            lr *= (1. / (1. + self.decay * self.iterations))
            self.updates.append(K.update_add(self.iterations, 1))

        for p, g, a in zip(params, grads, accumulators):
            # update accumulator
            new_a = self.rho * a + (1. - self.rho) * K.square(g)
            self.updates.append(K.update(a, new_a))
            new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon)

            # apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)
            self.updates.append(K.update(p, new_p))
        return self.updates

    def get_config(self):
        config = {'lr': float(K.get_value(self.lr)),
                  'rho': float(K.get_value(self.rho)),
                  'decay': float(K.get_value(self.decay)),
                  'epsilon': self.epsilon}
        base_config = super(RMSprop, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


class Adagrad(Optimizer):
    '''Adagrad optimizer.

    It is recommended to leave the parameters of this optimizer
    at their default values.

    # Arguments
        lr: float >= 0. Learning rate.
        epsilon: float >= 0.

    # References
        - [Adaptive Subgradient Methods for Online Learning and Stochastic Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
    '''
    def __init__(self, lr=0.01, epsilon=1e-8, decay=0., **kwargs):
        super(Adagrad, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.lr = K.variable(lr)
        self.decay = K.variable(decay)
        self.inital_decay = decay
        self.iterations = K.variable(0.)

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        shapes = [K.get_variable_shape(p) for p in params]
        accumulators = [K.zeros(shape) for shape in shapes]
        self.weights = accumulators
        self.updates = []

        lr = self.lr
        if self.inital_decay > 0:
            lr *= (1. / (1. + self.decay * self.iterations))
            self.updates.append(K.update_add(self.iterations, 1))

        for p, g, a in zip(params, grads, accumulators):
            new_a = a + K.square(g)  # update accumulator
            self.updates.append(K.update(a, new_a))
            new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon)
            # apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)
            self.updates.append(K.update(p, new_p))
        return self.updates

    def get_config(self):
        config = {'lr': float(K.get_value(self.lr)),
                  'decay': float(K.get_value(self.decay)),
                  'epsilon': self.epsilon}
        base_config = super(Adagrad, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


class Adadelta(Optimizer):
    '''Adadelta optimizer.

    It is recommended to leave the parameters of this optimizer
    at their default values.

    # Arguments
        lr: float >= 0. Learning rate.
            It is recommended to leave it at the default value.
        rho: float >= 0.
        epsilon: float >= 0. Fuzz factor.

    # References
        - [Adadelta - an adaptive learning rate method](http://arxiv.org/abs/1212.5701)
    '''
    def __init__(self, lr=1.0, rho=0.95, epsilon=1e-8, decay=0.,
                 **kwargs):
        super(Adadelta, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.lr = K.variable(lr)
        self.decay = K.variable(decay)
        self.inital_decay = decay
        self.iterations = K.variable(0.)

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        shapes = [K.get_variable_shape(p) for p in params]
        accumulators = [K.zeros(shape) for shape in shapes]
        delta_accumulators = [K.zeros(shape) for shape in shapes]
        self.weights = accumulators + delta_accumulators
        self.updates = []

        lr = self.lr
        if self.inital_decay > 0:
            lr *= (1. / (1. + self.decay * self.iterations))
            self.updates.append(K.update_add(self.iterations, 1))

        for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators):
            # update accumulator
            new_a = self.rho * a + (1. - self.rho) * K.square(g)
            self.updates.append(K.update(a, new_a))

            # use the new accumulator and the *old* delta_accumulator
            update = g * K.sqrt(d_a + self.epsilon) / K.sqrt(new_a + self.epsilon)

            new_p = p - lr * update
            # apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)
            self.updates.append(K.update(p, new_p))

            # update delta_accumulator
            new_d_a = self.rho * d_a + (1 - self.rho) * K.square(update)
            self.updates.append(K.update(d_a, new_d_a))
        return self.updates

    def get_config(self):
        config = {'lr': float(K.get_value(self.lr)),
                  'rho': self.rho,
                  'decay': float(K.get_value(self.decay)),
                  'epsilon': self.epsilon}
        base_config = super(Adadelta, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


class Adam(Optimizer):
    '''Adam optimizer.

    Default parameters follow those provided in the original paper.

    # Arguments
        lr: float >= 0. Learning rate.
        beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor.

    # References
        - [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8)
    '''
    def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999,
                 epsilon=1e-8, decay=0., **kwargs):
        super(Adam, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.iterations = K.variable(0)
        self.lr = K.variable(lr)
        self.beta_1 = K.variable(beta_1)
        self.beta_2 = K.variable(beta_2)
        self.decay = K.variable(decay)
        self.inital_decay = decay

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.inital_decay > 0:
            lr *= (1. / (1. + self.decay * self.iterations))

        t = self.iterations + 1
        lr_t = lr * K.sqrt(1. - K.pow(self.beta_2, t)) / (1. - K.pow(self.beta_1, t))

        shapes = [K.get_variable_shape(p) for p in params]
        ms = [K.zeros(shape) for shape in shapes]
        vs = [K.zeros(shape) for shape in shapes]
        self.weights = [self.iterations] + ms + vs

        for p, g, m, v in zip(params, grads, ms, vs):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))

            new_p = p_t
            # apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)
            self.updates.append(K.update(p, new_p))
        return self.updates

    def get_config(self):
        config = {'lr': float(K.get_value(self.lr)),
                  'beta_1': float(K.get_value(self.beta_1)),
                  'beta_2': float(K.get_value(self.beta_2)),
                  'decay': float(K.get_value(self.decay)),
                  'epsilon': self.epsilon}
        base_config = super(Adam, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


class Adamax(Optimizer):
    '''Adamax optimizer from Adam paper's Section 7. It is a variant
    of Adam based on the infinity norm.

    Default parameters follow those provided in the paper.

    # Arguments
        lr: float >= 0. Learning rate.
        beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor.

    # References
        - [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8)
    '''
    def __init__(self, lr=0.002, beta_1=0.9, beta_2=0.999,
                 epsilon=1e-8, decay=0., **kwargs):
        super(Adamax, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.iterations = K.variable(0.)
        self.lr = K.variable(lr)
        self.beta_1 = K.variable(beta_1)
        self.beta_2 = K.variable(beta_2)
        self.decay = K.variable(decay)
        self.inital_decay = decay

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.inital_decay > 0:
            lr *= (1. / (1. + self.decay * self.iterations))

        t = self.iterations + 1
        lr_t = lr / (1. - K.pow(self.beta_1, t))

        shapes = [K.get_variable_shape(p) for p in params]
        # zero init of 1st moment
        ms = [K.zeros(shape) for shape in shapes]
        # zero init of exponentially weighted infinity norm
        us = [K.zeros(shape) for shape in shapes]
        self.weights = [self.iterations] + ms + us

        for p, g, m, u in zip(params, grads, ms, us):

            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            u_t = K.maximum(self.beta_2 * u, K.abs(g))
            p_t = p - lr_t * m_t / (u_t + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(u, u_t))

            new_p = p_t
            # apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)
            self.updates.append(K.update(p, new_p))
        return self.updates

    def get_config(self):
        config = {'lr': float(K.get_value(self.lr)),
                  'beta_1': float(K.get_value(self.beta_1)),
                  'beta_2': float(K.get_value(self.beta_2)),
                  'decay': float(K.get_value(self.decay)),
                  'epsilon': self.epsilon}
        base_config = super(Adamax, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


class Nadam(Optimizer):
    '''
    Nesterov Adam optimizer: Much like Adam is essentially RMSprop with momentum,
    Nadam is Adam RMSprop with Nesterov momentum.

    Default parameters follow those provided in the paper.
    It is recommended to leave the parameters of this optimizer
    at their default values.

    # Arguments
        lr: float >= 0. Learning rate.
        beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor.

    # References
        - [Nadam report](http://cs229.stanford.edu/proj2015/054_report.pdf)
        - [On the importance of initialization and momentum in deep learning](http://www.cs.toronto.edu/~fritz/absps/momentum.pdf)
    '''
    def __init__(self, lr=0.002, beta_1=0.9, beta_2=0.999,
                 epsilon=1e-8, schedule_decay=0.004, **kwargs):
        super(Nadam, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.iterations = K.variable(0.)
        self.m_schedule = K.variable(1.)
        self.lr = K.variable(lr)
        self.beta_1 = K.variable(beta_1)
        self.beta_2 = K.variable(beta_2)
        self.schedule_decay = schedule_decay

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        t = self.iterations + 1

        # Due to the recommendations in [2], i.e. warming momentum schedule
        momentum_cache_t = self.beta_1 * (1. - 0.5 * (K.pow(0.96, t * self.schedule_decay)))
        momentum_cache_t_1 = self.beta_1 * (1. - 0.5 * (K.pow(0.96, (t + 1) * self.schedule_decay)))
        m_schedule_new = self.m_schedule * momentum_cache_t
        m_schedule_next = self.m_schedule * momentum_cache_t * momentum_cache_t_1
        self.updates.append((self.m_schedule, m_schedule_new))

        shapes = [K.get_variable_shape(p) for p in params]
        ms = [K.zeros(shape) for shape in shapes]
        vs = [K.zeros(shape) for shape in shapes]

        self.weights = [self.iterations] + ms + vs

        for p, g, m, v in zip(params, grads, ms, vs):
            # the following equations given in [1]
            g_prime = g / (1. - m_schedule_new)
            m_t = self.beta_1 * m + (1. - self.beta_1) * g
            m_t_prime = m_t / (1. - m_schedule_next)
            v_t = self.beta_2 * v + (1. - self.beta_2) * K.square(g)
            v_t_prime = v_t / (1. - K.pow(self.beta_2, t))
            m_t_bar = (1. - momentum_cache_t) * g_prime + momentum_cache_t_1 * m_t_prime

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))

            p_t = p - self.lr * m_t_bar / (K.sqrt(v_t_prime) + self.epsilon)
            new_p = p_t

            # apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)
            self.updates.append(K.update(p, new_p))
        return self.updates

    def get_config(self):
        config = {'lr': float(K.get_value(self.lr)),
                  'beta_1': float(K.get_value(self.beta_1)),
                  'beta_2': float(K.get_value(self.beta_2)),
                  'epsilon': self.epsilon,
                  'schedule_decay': self.schedule_decay}
        base_config = super(Nadam, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


class TFOptimizer(Optimizer):

    def __init__(self, optimizer):
        self.optimizer = optimizer
        self.iterations = K.variable(0.)
        self.updates = []

    def get_updates(self, params, constraints, loss):
        if constraints:
            raise ValueError('TF optimizers do not support '
                             'weights constraints. Either remove '
                             'all weights constraints in your model, '
                             'or use a Keras optimizer.')
        grads = self.optimizer.compute_gradients(loss, params)
        opt_update = self.optimizer.apply_gradients(
            grads, global_step=self.iterations)
        self.updates.append(opt_update)
        return self.updates

    @property
    def weights(self):
        raise NotImplementedError

    def get_config(self):
        raise NotImplementedError

    def from_config(self, config):
        raise NotImplementedError


# aliases
sgd = SGD
rmsprop = RMSprop
adagrad = Adagrad
adadelta = Adadelta
adam = Adam
adamax = Adamax
nadam = Nadam


def get(identifier, kwargs=None):
    if K.backend() == 'tensorflow':
        # Wrap TF optimizer instances
        import tensorflow as tf
        if isinstance(identifier, tf.train.Optimizer):
            return TFOptimizer(identifier)
    # Instantiate a Keras optimizer
    return get_from_module(identifier, globals(), 'optimizer',
                           instantiate=True, kwargs=kwargs)
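Comparing the two files: in both 0.3.3 and 1.2.0, SGD.get_updates builds the decayed rate symbolically as lr * (1 / (1 + decay * iterations)) and never assigns it back to self.lr; the 1.2.0 version only adds the `inital_decay > 0` guard and switches to K.update/K.update_add for the state updates. That is exactly why the callback from section II prints a constant value. Since both versions expose lr, decay and iterations as backend variables on the SGD instance, the recomputation from section IV can be wrapped in a small helper that works under either version; a sketch (the helper name is my own, not part of Keras):

from keras import backend as K


def current_sgd_lr(optimizer):
    """Decayed learning rate an SGD instance will use for its next update
    (mirrors SGD.get_updates in both keras 0.3.3 and 1.2.0)."""
    return (K.get_value(optimizer.lr)
            / (1. + K.get_value(optimizer.decay) * K.get_value(optimizer.iterations)))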