NumPy implementations of various gradient descent algorithms

---
layout: post
title: Deep Learning
subtitle: Gradient descent algorithm implementations
description: Gradient descent algorithm implementations
date: 2022-10-25
categories: deeplearning
tags: code pytorch
comments: true
---

Excerpted from CS231N. Each update rule takes the current weights `w`, the gradient `dw`, and a `config` dict of hyperparameters and state, and returns the updated weights together with the (possibly updated) `config`.

```python
import numpy as np


def sgd(w, dw, config=None):
    """
    Performs vanilla stochastic gradient descent.

    config format:
    - learning_rate: Scalar learning rate.
    """
    if config is None:
        config = {}
    config.setdefault("learning_rate", 1e-2)

    w -= config["learning_rate"] * dw
    return w, config
```
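
As a quick sanity check (a minimal sketch of my own, not part of the CS231N excerpt; the toy quadratic objective and variable names are assumptions), vanilla SGD on f(w) = 0.5 * ||w||^2, whose gradient is simply w, should shrink every entry of w toward zero:

```python
import numpy as np

# Toy objective: f(w) = 0.5 * ||w||^2, whose gradient at w is simply w.
w = np.array([1.0, -2.0, 3.0])
config = {"learning_rate": 0.1}
for _ in range(100):
    dw = w.copy()                  # gradient of the toy objective at the current w
    w, config = sgd(w, dw, config)
print(w)                           # every entry has decayed toward 0 (scaled by 0.9**100)
```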

```python
def sgd_momentum(w, dw, config=None):
    """
    Performs stochastic gradient descent with momentum.

    config format:
    - learning_rate: Scalar learning rate.
    - momentum: Scalar between 0 and 1 giving the momentum value.
      Setting momentum = 0 reduces to sgd.
    - velocity: A numpy array of the same shape as w and dw used to store a
      moving average of the gradients.
    """
    if config is None:
        config = {}
    config.setdefault("learning_rate", 1e-2)
    config.setdefault("momentum", 0.9)
    v = config.get("velocity", np.zeros_like(w))

    # Exponential moving average of the gradient, then a step along it.
    v = config["momentum"] * v + (1 - config["momentum"]) * dw
    next_w = w - config["learning_rate"] * v

    config["velocity"] = v
    return next_w, config
```
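
This version keeps an exponential moving average of the raw gradient and then takes a step of size `learning_rate` along it. The more common textbook ("heavy-ball") formulation instead folds the learning rate into the velocity itself; a hedged sketch of that variant for comparison (the function name is mine, not from the original post):

```python
import numpy as np

def sgd_momentum_classic(w, dw, config=None):
    """Heavy-ball momentum: the learning rate is folded into the accumulated velocity."""
    if config is None:
        config = {}
    config.setdefault("learning_rate", 1e-2)
    config.setdefault("momentum", 0.9)
    v = config.get("velocity", np.zeros_like(w))

    v = config["momentum"] * v - config["learning_rate"] * dw  # decaying sum of past steps
    next_w = w + v

    config["velocity"] = v
    return next_w, config
```

Both variants reduce to vanilla `sgd` when `momentum = 0`; they differ in how the effective step size grows with the momentum coefficient (the heavy-ball form reaches a steady-state step roughly `1 / (1 - momentum)` times larger for a constant gradient).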

```python
def rmsprop(w, dw, config=None):
    """
    Uses the RMSProp update rule, which uses a moving average of squared
    gradient values to set adaptive per-parameter learning rates.

    config format:
    - learning_rate: Scalar learning rate.
    - decay_rate: Scalar between 0 and 1 giving the decay rate for the squared
      gradient cache.
    - epsilon: Small scalar used for smoothing to avoid dividing by zero.
    - cache: Moving average of second moments of gradients.
    """
    if config is None:
        config = {}
    config.setdefault("learning_rate", 1e-2)
    config.setdefault("decay_rate", 0.99)
    config.setdefault("epsilon", 1e-8)
    config.setdefault("cache", np.zeros_like(w))

    # Decaying average of squared gradients, then a per-parameter scaled step.
    config["cache"] = config["decay_rate"] * config["cache"] + (1 - config["decay_rate"]) * dw ** 2
    next_w = w - config["learning_rate"] * dw / (np.sqrt(config["cache"]) + config["epsilon"])

    return next_w, config
```
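
The cache is what makes the step size adaptive per parameter: coordinates with persistently large gradients accumulate a large cache and get their steps shrunk, while small-gradient coordinates do not. A small, hypothetical illustration (the constant-gradient setup below is my own, not from the original):

```python
import numpy as np

w = np.zeros(2)
config = None
for _ in range(50):
    dw = np.array([100.0, 0.1])   # constant gradients that differ by a factor of 1000
    w, config = rmsprop(w, dw, config)

# With a constant gradient the cache approaches dw**2, so each per-parameter step
# approaches learning_rate in magnitude regardless of the gradient's scale.
print(w)  # both coordinates have moved by (almost) the same amount
```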

```python
def adam(w, dw, config=None):
    """
    Uses the Adam update rule, which incorporates moving averages of both the
    gradient and its square and a bias correction term.

    config format:
    - learning_rate: Scalar learning rate.
    - beta1: Decay rate for moving average of first moment of gradient.
    - beta2: Decay rate for moving average of second moment of gradient.
    - epsilon: Small scalar used for smoothing to avoid dividing by zero.
    - m: Moving average of gradient.
    - v: Moving average of squared gradient.
    - t: Iteration number.
    """
    if config is None:
        config = {}
    config.setdefault("learning_rate", 1e-3)
    config.setdefault("beta1", 0.9)
    config.setdefault("beta2", 0.999)
    config.setdefault("epsilon", 1e-8)
    config.setdefault("m", np.zeros_like(w))
    config.setdefault("v", np.zeros_like(w))
    config.setdefault("t", 0)

    # Increment t first, then update the first and second moment estimates.
    config["t"] += 1
    config["m"] = config["beta1"] * config["m"] + (1 - config["beta1"]) * dw
    config["v"] = config["beta2"] * config["v"] + (1 - config["beta2"]) * dw ** 2

    m_hat = config["m"] / (1 - config["beta1"] ** config["t"])  # bias-corrected first moment
    v_hat = config["v"] / (1 - config["beta2"] ** config["t"])  # bias-corrected second moment

    next_w = w - config["learning_rate"] * m_hat / (np.sqrt(v_hat) + config["epsilon"])
    return next_w, config
```
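
Since all four update rules share the `(w, dw, config) -> (next_w, config)` interface, they can be driven from one loop. A minimal comparison harness on the same toy quadratic as above (the `run` helper is my own scaffolding, not part of the CS231N excerpt):

```python
import numpy as np

def run(update_rule, steps=200):
    """Minimize f(w) = 0.5 * ||w||^2 with the given update rule; return the final loss."""
    w = np.array([1.0, -2.0, 3.0])
    config = None
    for _ in range(steps):
        dw = w.copy()                        # gradient of the toy quadratic at w
        w, config = update_rule(w, dw, config)
    return 0.5 * np.sum(w ** 2)

for rule in (sgd, sgd_momentum, rmsprop, adam):
    print(rule.__name__, run(rule))
```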