A study of different gradient descent optimizers and a custom optimizer, compared side by side (SGD, Momentum, AdaGrad, Adam)
Reference: Deep Learning from Scratch (《深度学习入门:基于Python的理论与实现》)
import matplotlib.pyplot as plt
import numpy as np
import latexify
from collections import OrderedDict
from common.optimizer import *
Optimization of a one-dimensional function
Define the function
def J(x):
    return x**6/6 - 5.5*x**5/5 + 6.5*x**4/4 + 5.5*x**3/3 - 7.5*x**2/2

def dJ(x):
    return x**5 - 5.5*x**4 + 6.5*x**3 + 5.5*x**2 - 7.5*x

def ddJ(x):
    return 5*x**4 - 22*x**3 + 19.5*x**2 + 11*x - 7.5
x = np.linspace(-2, 5, 100)
plt.figure(figsize=(8, 6))
plt.plot(x, J(x), label='J(x)')
plt.plot(x, dJ(x), label='dJ(x)')
plt.plot(x, ddJ(x), label='ddJ(x)')
plt.legend()
plt.xlabel('x')
plt.ylabel('y')
plt.title('J(x) and dJ(x) and ddJ(x)')
plt.ylim(-5, 20)
plt.show()
Visualizing the iteration process
# Create an animation that visualizes the gradient descent process, using matplotlib's animation module
from matplotlib import animation
from IPython.display import HTML

def show_animation(x_history):
    fig = plt.figure(figsize=(8, 6))
    ax = plt.axes(xlim=(-2, 5), ylim=(-5, 30))
    line, = ax.plot([], [], 'bo', lw=2, label='point')
    x = np.linspace(-2, 5, 100)
    ax.plot(x, J(x), 'b-', lw=1, label='J(x)')  # draw the objective curve as background (x was otherwise unused)
    plt.legend()

    def init():
        line.set_data([], [])
        return line,

    def animate(frame):
        # wrap the scalars in lists so set_data works on current matplotlib versions
        line.set_data([x_history[frame]], [J(x_history[frame])])
        plt.title('iteration = {}'.format(frame))
        return line,

    anim = animation.FuncAnimation(fig, animate, init_func=init,
                                   frames=len(x_history),  # number of frames in the animation
                                   interval=40,            # delay between frames, in ms
                                   blit=True               # redraw only the parts that changed
                                   )
    return HTML(anim.to_html5_video())
# test
x_history = {
'AdaGrad':[1,2,3],
'SGD':[1,2,3],
'Momentum':[1,2,3],
'Nesterov':[1,2,3],
'Adam': [1,2,3]
}
print(len(x_history.keys()))
print(len(x_history))
5
5
SGD
Simple implementation
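The update rule implemented in the loop below is plain gradient descent with a fixed learning rate $\alpha$:

$$x \leftarrow x - \alpha\, J'(x)$$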
alpha = 0.05
x = 4
x_history = [x]
for i in range(200):
    x = x - alpha * dJ(x)
    x_history.append(x)
print(x_history[-1])
show_animation(x_history)
-1.0
Optimizer-class implementation
init_pos = 4
params = {}
params['x'] = init_pos
grads = {}
x_history = []
optimizer = SGD(lr=0.05)
for i in range(200):
    x_history.append(params['x'])
    grads['x'] = dJ(params['x'])
    optimizer.update(params, grads)
print(params['x'])
show_animation(x_history)
-1.0
Momentum
Simple implementation
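The loop below implements the momentum update: the velocity $v$ accumulates an exponentially decaying sum of past gradients (decay factor $\beta$), and the position moves by $v$:

$$v \leftarrow \beta v - \alpha\, J'(x), \qquad x \leftarrow x + v$$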
v = 0
alpha = 0.02
beta = 0.9
x = 4
x_history = [x]
for i in range(200):
    v = beta * v - alpha * dJ(x)
    x = x + v
    x_history.append(x)
print(x_history[-1])
show_animation(x_history)
-0.9999435339008631
Optimizer-class implementation
init_pos = 4
params = {}
params['x'] = init_pos
grads = {}
x_history = []
optimizer = Momentum(lr=0.02)
for i in range(200):
    x_history.append(params['x'])
    grads['x'] = dJ(params['x'])
    optimizer.update(params, grads)
print(params['x'])
show_animation(x_history)
-0.9999435339008631
AdaGrad
Simple implementation
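The loop below implements AdaGrad: $h$ accumulates the squared gradients, so the effective step size $\alpha/(\sqrt{h}+\varepsilon)$ keeps shrinking as training proceeds (here $\varepsilon = 10^{-7}$, as in the code):

$$h \leftarrow h + J'(x)^2, \qquad x \leftarrow x - \frac{\alpha}{\sqrt{h}+\varepsilon}\, J'(x)$$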
alpha = 0.5
x = 4
h = 0
x_history = [x]
for i in range(200):
    h = h + dJ(x)**2
    x = x - alpha * dJ(x) / (np.sqrt(h)+1e-7)
    x_history.append(x)
print(x_history[-1])
show_animation(x_history)
3.0000003436659832
Optimizer-class implementation
init_pos = np.float64(4)
params = {}
params['x'] = init_pos
grads = {}
x_history = []
optimizer = AdaGrad(lr=0.5)
for i in range(200):
    x_history.append(params['x'])
    grads['x'] = dJ(params['x'])
    optimizer.update(params, grads)
print(params['x'])
show_animation(x_history)
3.0000003436659832
Interim summary
We can see that AdaGrad's effective learning rate keeps shrinking during the iterations: AdaGrad accumulates the squared gradient of each parameter, so the step size decreases over time. This lets us start with a fairly large learning rate and speed up the early phase of learning. But it also has a drawback: as the video shows, it converges very quickly to a local optimum. This raises a question worth thinking about: can we keep AdaGrad's advantage while avoiding this drawback, i.e. use a large learning rate to accelerate the early phase while also introducing momentum so the update keeps moving toward the global optimum? That is the idea behind Adam.
Adam
Simple implementation
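The loop below uses the same incremental form as the code above it would suggest: exponential moving averages $m$ (of the gradient) and $v$ (of the squared gradient), with the bias correction folded into a per-step learning rate $\hat{\alpha}_t$ (here $t = i+1$):

$$m \leftarrow m + (1-\beta_1)\bigl(J'(x) - m\bigr), \qquad v \leftarrow v + (1-\beta_2)\bigl(J'(x)^2 - v\bigr)$$

$$\hat{\alpha}_t = \alpha\,\frac{\sqrt{1-\beta_2^{\,t}}}{1-\beta_1^{\,t}}, \qquad x \leftarrow x - \frac{\hat{\alpha}_t}{\sqrt{v}+\varepsilon}\, m$$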
alpha = 0.5
beta1 = 0.9
beta2 = 0.999
m = 0
v = 0
x = 4
x_history = [x]
for i in range(200):
    m += (1-beta1) * (dJ(x) - m)
    v += (1-beta2) * (dJ(x)**2 - v)
    lr_t = alpha * np.sqrt(1-beta2**(i+1)) / (1-beta1**(i+1))
    x = x - lr_t * m / (np.sqrt(v)+1e-7)
    x_history.append(x)
print(x_history[-1])
show_animation(x_history)
0.9999331921434963
Optimizer-class implementation
init_pos = np.float64(4)
params = {}
params['x'] = init_pos
grads = {}
x_history = []
optimizer = Adam(lr=0.5)
for i in range(200):
    x_history.append(params['x'])
    grads['x'] = dJ(params['x'])
    optimizer.update(params, grads)
print(params['x'])
show_animation(x_history)
0.9999331921434963
We can see that with the same learning rate as AdaGrad (lr = 0.5), Adam gets over the first hill whereas AdaGrad does not. If we now raise the learning rate to 0.8, the gap between Adam and AdaGrad becomes obvious.
init_pos = np.float64(4)
params = {}
params['x'] = init_pos
grads = {}
x_history = []
optimizer = Adam(lr=0.8)
for i in range(200):
    x_history.append(params['x'])
    grads['x'] = dJ(params['x'])
    optimizer.update(params, grads)
print(params['x'])
show_animation(x_history)
-0.9999825571870972
From the video we can see that Adam converges to the global optimum, while AdaGrad does not.
Overall remarks and comparison
For each optimization method above, the simple implementation and the optimizer-class implementation give exactly the same result; all we did was wrap the update rule in a class, which makes it easy to swap optimizers in and out.
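As an illustration, here is a minimal sketch of the optimizer interface assumed by the code above, modeled on the SGD class from the reference book's common/optimizer.py (your local copy may differ in details):

class SGD:
    """Plain gradient descent: params[key] -= lr * grads[key]."""
    def __init__(self, lr=0.01):
        self.lr = lr

    def update(self, params, grads):
        # one in-place update step for every parameter in the dict
        for key in params.keys():
            params[key] -= self.lr * grads[key]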
Below we animate the iteration process of all four optimizers side by side, so you can get an intuitive feel for how they compare.
optimizers = {}
optimizers['SGD'] = SGD(lr=0.05)
optimizers['Momentum'] = Momentum(lr=0.02)
optimizers['AdaGrad'] = AdaGrad(lr=0.5)
optimizers['Adam'] = Adam(lr=0.8)

x_history = {}
for key in optimizers:
    params = {}
    params['x'] = init_pos
    grads = {}
    x_history[key] = []
    optimizer = optimizers[key]
    for i in range(200):
        x_history[key].append(params['x'])
        grads['x'] = dJ(params['x'])
        optimizer.update(params, grads)
    print(key, params['x'])
SGD -1.0
Momentum -0.9999435339008631
AdaGrad 3.0000003436659832
Adam -0.9999825571870972
# Create an animation that visualizes the gradient descent process, using matplotlib's animation module
from matplotlib import animation
from IPython.display import HTML

def show_mult_animation(x_history):
    # @param x_history: a dict whose keys are optimizer names and whose values are that optimizer's x_history
    # @return: an animation visualizing the gradient descent process
    fig = plt.figure(figsize=(20, 10))
    ax1 = fig.add_subplot(2, 2, 1)
    ax2 = fig.add_subplot(2, 2, 2)
    ax3 = fig.add_subplot(2, 2, 3)
    ax4 = fig.add_subplot(2, 2, 4)
    axlist = [ax1, ax2, ax3, ax4]
    linelist = []
    for i in range(4):
        ax = axlist[i]
        ax.set_xlim(-2, 5)
        ax.set_ylim(-5, 30)
        ax.set_xlabel('x')
        ax.set_ylabel('J(x)')
        x = np.linspace(-2, 5, 100)
        ax.plot(x, J(x), 'b-', lw=1, label='J(x)')
        line, = ax.plot([], [], 'bo', lw=2, label='point')
        linelist.append(line)
        ax.legend()
        ax.set_title(list(x_history.keys())[i] + ' optimizer')

    def init():
        for line in linelist:
            line.set_data([], [])
        return linelist

    def animate(frame):
        line_index = 0
        for key in x_history.keys():
            x = x_history[key][frame]
            y = J(x)
            line = linelist[line_index]
            line_index += 1
            # wrap the scalars in lists so set_data works on current matplotlib versions
            line.set_data([x], [y])
        fig.suptitle('iterations: ' + str(frame))
        return linelist

    anim = animation.FuncAnimation(fig, animate, init_func=init,
                                   frames=200,   # number of frames in the animation
                                   interval=40,  # delay between frames, in ms
                                   blit=True     # redraw only the parts that changed
                                   )
    return HTML(anim.to_html5_video())
show_mult_animation(x_history)
A custom optimizer: PGD (Physical Gradient Descent)
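As a reading of the update rule implemented in physisc_grad_descent below (the picture of a ball sliding on the curve under gravity with friction is inferred from the parameters g and mu and the kinetic-energy printout), each step first moves the point horizontally, then updates the squared speed from the drop in $J$ minus a friction loss, and finally recovers the horizontal velocity from the local slope:

$$x \leftarrow x + v_x\,\Delta t$$

$$v^2 \leftarrow v^2 + 2g\bigl(J(x_{\mathrm{prev}}) - J(x)\bigr) - 2\mu\left(g + \frac{v^2\,J''(x)}{1 + J'(x)^2}\right)\lvert x - x_{\mathrm{prev}}\rvert$$

$$v_x = \pm\,\frac{\sqrt{v^2}}{\sqrt{1 + J'(x)^2}}$$

The sign of $v_x$ follows the current direction of motion, and whenever $v^2$ drops to zero the point is restarted downhill with $v_x = -g\,J'(x)\,\Delta t / \bigl(1 + J'(x)^2\bigr)$.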
def physisc_grad_descent(J, grad_J, gradgrad_J, x0=4, dt=0.05, g=9.8, mu=0.1, n_iters=75, is_print=False):
    x = x0
    v_square = 0   # squared speed along the curve
    vx = 0         # horizontal velocity component
    x_history = np.array([x])
    for i in range(n_iters):
        if is_print:
            print("horizontal position:", x, end=' ')
            print("horizontal velocity:", vx, end=' ')
            print("kinetic energy:", 0.5 * v_square)
        prex = x
        # update x
        x = x + dt * vx
        # update the squared speed: potential-energy drop minus the friction loss
        v_square = v_square + 2 * g * (J(prex)-J(x)) - 2 * mu * (g + v_square * gradgrad_J(x) / (1 + grad_J(x)**2))*abs(x-prex)
        if v_square <= 0:
            # the ball has stopped: restart it along the downhill direction
            v_square = 0
            vx = g * (-grad_J(x)) / (1 + (grad_J(x))**2) * dt
        else:
            # recover the horizontal velocity from the speed and the local slope
            if x - prex < 0:
                vx = -np.sqrt(v_square) / np.sqrt(1 + (grad_J(x))**2)
            else:
                vx = np.sqrt(v_square) / np.sqrt(1 + (grad_J(x))**2)
        x_history = np.append(x_history, x)
    return x_history
x_history = physisc_grad_descent(J, dJ, ddJ, x0=4, dt = 0.05, g = 9.8, mu = 0.2, n_iters=200, is_print=False)
show_animation(x_history)
In summary, an optimizer written this way is still too limited: it also needs second-derivative information, which is not available in most practical learning settings, so this method is not really viable.
Optimization of a two-dimensional function
Define the function
def f(x, y):
    return x**2 / 20.0 + y**2

def df(x, y):
    return x / 10.0, 2.0*y
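For reference, this is the elongated bowl $f(x, y) = \frac{x^2}{20} + y^2$, with gradient

$$\nabla f(x, y) = \left(\frac{x}{10},\; 2y\right),$$

so for the same displacement the gradient along $y$ is 20 times larger than along $x$; it is this anisotropy that the four trajectories below respond to so differently.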
init_pos = (-7.0, 2.0)
params = {}
params['x'], params['y'] = init_pos[0], init_pos[1]
grads = {}
grads['x'], grads['y'] = 0, 0

optimizers = OrderedDict()
optimizers["SGD"] = SGD(lr=0.95)
optimizers["Momentum"] = Momentum(lr=0.1)
optimizers["AdaGrad"] = AdaGrad(lr=1.5)
optimizers["Adam"] = Adam(lr=0.3)

idx = 1
plt.figure(figsize=(20, 15))

for key in optimizers:
    optimizer = optimizers[key]
    x_history = []
    y_history = []
    params['x'], params['y'] = init_pos[0], init_pos[1]

    for i in range(200):
        x_history.append(params['x'])
        y_history.append(params['y'])
        grads['x'], grads['y'] = df(params['x'], params['y'])
        optimizer.update(params, grads)

    # print the final position
    print(key + ":" + str(params['x']) + "," + str(params['y']))

    x = np.arange(-10, 10, 0.01)
    y = np.arange(-5, 5, 0.01)
    X, Y = np.meshgrid(x, y)
    Z = f(X, Y)

    # for simple contour lines
    mask = Z > 7
    Z[mask] = 0

    # plot
    plt.subplot(2, 2, idx)
    idx += 1
    plt.plot(x_history, y_history, 'o-', color="red")
    plt.contour(X, Y, Z)
    plt.ylim(-10, 10)
    plt.xlim(-10, 10)
    plt.plot(0, 0, '+')
    #colorbar()
    #spring()
    plt.title(key)
    plt.xlabel("x")
    plt.ylabel("y")

plt.show()
SGD:-1.4955945746380926e-08,1.4110158217310404e-09
Momentum:4.525745158046888e-05,-2.984927273613453e-05
AdaGrad:-0.00021640235526721696,1.0811148417095901e-41
Adam:-0.00011915420080310103,3.3543852840193107e-05
Visualizing the iteration process
# Create an animation that visualizes the gradient descent process, using matplotlib's animation module
from matplotlib import animation
from IPython.display import HTML

def show_animation(x_history, y_history, key):
    fig = plt.figure(figsize=(8, 6))
    ax = plt.axes(xlim=(-2, 5), ylim=(-5, 30))
    line, = ax.plot([], [], 'ro', lw=2, label='point')
    x = np.arange(-10, 10, 0.01)
    y = np.arange(-5, 5, 0.01)
    X, Y = np.meshgrid(x, y)
    Z = f(X, Y)
    # for simple contour lines
    mask = Z > 7
    Z[mask] = 0
    plt.contour(X, Y, Z)
    plt.ylim(-10, 10)
    plt.xlim(-10, 10)
    plt.plot(0, 0, '+')
    plt.xlabel("x")
    plt.ylabel("y")
    plt.title(key)
    plt.legend()

    def init():
        line.set_data([], [])
        return line,

    def animate(frame):
        # wrap the scalars in lists so set_data works on current matplotlib versions
        line.set_data([x_history[frame]], [y_history[frame]])
        ax.set_title("frame: {}".format(frame))
        return line,

    anim = animation.FuncAnimation(fig, animate, init_func=init,
                                   frames=len(x_history),  # number of frames in the animation
                                   interval=40,            # delay between frames, in ms
                                   blit=True               # redraw only the parts that changed
                                   )
    return anim
anim_list = []
# note: the optimizer instances are reused from the previous cell, so the stateful ones
# (Momentum, AdaGrad, Adam) keep their accumulated internal state; this is why their
# final positions differ slightly from the run above
for key in optimizers:
    optimizer = optimizers[key]
    x_history = []
    y_history = []
    params['x'], params['y'] = init_pos[0], init_pos[1]
    for i in range(200):
        x_history.append(params['x'])
        y_history.append(params['y'])
        grads['x'], grads['y'] = df(params['x'], params['y'])
        optimizer.update(params, grads)
    # print the final position
    print(key + ":" + str(params['x']) + "," + str(params['y']))
    anim_list.append(show_animation(x_history, y_history, key))
SGD:-1.4955945746380926e-08,1.4110158217310404e-09
Momentum:3.0175123211357402e-05,-2.6841550736447172e-05
AdaGrad:-0.02598841542479064,8.887975802048801e-24
Adam:-0.00018076469204204358,5.470947760432499e-05
HTML(anim_list[0].to_html5_video())
HTML(anim_list[1].to_html5_video())
HTML(anim_list[2].to_html5_video())
HTML(anim_list[3].to_html5_video())
# export the animations as GIFs
anim_list[0].save('SGD.gif', writer='imagemagick', fps=30)
anim_list[1].save('Momentum.gif', writer='imagemagick', fps=30)
anim_list[2].save('AdaGrad.gif', writer='imagemagick', fps=30)
anim_list[3].save('Adam.gif', writer='imagemagick', fps=30)
MovieWriter imagemagick unavailable; using Pillow instead.
MovieWriter imagemagick unavailable; using Pillow instead.
MovieWriter imagemagick unavailable; using Pillow instead.
MovieWriter imagemagick unavailable; using Pillow instead.
In summary, each of the four optimizers has its own strengths, and in practice we choose one according to our needs. In deep learning Adam is a common default because it converges quickly and works well in most cases; in some situations we still use SGD, which converges more slowly but can nevertheless reach the global optimum in those cases.
%%HTML
<img src="SGD.gif">
<img src="Momentum.gif">
<img src="AdaGrad.gif">
<img src="Adam.gif">