Gradient Descent
(Boyd & Vandenberghe, 2004)
%matplotlib inline
import numpy as np
import torch
import time
from torch import nn, optim
import math
import sys
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (9, 6)
Gradient Descent in One Dimension
Claim: moving the variable in the direction opposite to the gradient decreases the function value.
Taylor expansion:
\[f(x+\epsilon)=f(x)+\epsilon f^{\prime}(x)+\mathcal{O}\left(\epsilon^{2}\right)
\]
Substituting a step of \(-\eta f^{\prime}(x)\) along the negative gradient direction, i.e. \(\epsilon = -\eta f^{\prime}(x)\):
\[f\left(x-\eta f^{\prime}(x)\right)=f(x)-\eta f^{\prime 2}(x)+\mathcal{O}\left(\eta^{2} f^{\prime 2}(x)\right)
\]
For a sufficiently small learning rate \(\eta > 0\) the first-order term dominates, hence
\[f\left(x-\eta f^{\prime}(x)\right) \lesssim f(x)
\]
This suggests the gradient descent update rule:
\[x \leftarrow x-\eta f^{\prime}(x)
\]
e.g.
\[f(x) = x^2
\]
def f(x):
    return x**2  # objective function

def gradf(x):
    return 2 * x  # gradient of f

def gd(eta):
    # run 10 iterations of gradient descent starting from x = 10
    x = 10
    results = [x]
    for i in range(10):
        x -= eta * gradf(x)
        results.append(x)
    print('epoch 10, x:', x)
    return results

res = gd(0.2)
def show_trace(res):
    n = max(abs(min(res)), abs(max(res)))
    f_line = np.arange(-n, n, 0.01)
    plt.plot(f_line, [f(x) for x in f_line], '-')
    plt.plot(res, [f(x) for x in res], '-o')
    plt.xlabel('x')
    plt.ylabel('f(x)')

show_trace(res)
Learning Rate
A learning rate that is too small makes progress slowly, while one that is too large overshoots and diverges:
show_trace(gd(0.05))
show_trace(gd(1.1))
Local Minima
e.g.
\[f(x) = x\cos(cx)
\]
c = 0.15 * np.pi

def f(x):
    return x * np.cos(c * x)

def gradf(x):
    return np.cos(c * x) - c * x * np.sin(c * x)

show_trace(gd(2))
Gradient Descent in Multiple Dimensions
\[\nabla f(\mathbf{x})=\left[\frac{\partial f(\mathbf{x})}{\partial x_{1}}, \frac{\partial f(\mathbf{x})}{\partial x_{2}}, \dots, \frac{\partial f(\mathbf{x})}{\partial x_{d}}\right]^{\top}
\]
\[f(\mathbf{x}+\epsilon)=f(\mathbf{x})+\epsilon^{\top} \nabla f(\mathbf{x})+\mathcal{O}\left(\|\epsilon\|^{2}\right)
\]
\[\mathbf{x} \leftarrow \mathbf{x}-\eta \nabla f(\mathbf{x})
\]
def train_2d(trainer, steps=20):
    # optimize a 2D objective with a customized trainer
    x1, x2 = -5, -2  # initial point
    results = [(x1, x2)]
    for i in range(steps):
        x1, x2 = trainer(x1, x2)
        results.append((x1, x2))
    print('epoch %d, x1 %f, x2 %f' % (i + 1, x1, x2))
    return results

def show_trace_2d(f, results):
    # show the trace of the 2D variables during optimization
    plt.plot(*zip(*results), '-o', color='#ff7f0e')
    x1, x2 = np.meshgrid(np.arange(-5.5, 1.0, 0.1), np.arange(-3.0, 1.0, 0.1))
    plt.contour(x1, x2, f(x1, x2), colors='#1f77b4')
    plt.xlabel('x1')
    plt.ylabel('x2')
\[f(x_1, x_2) = x_1^2 + 2x_2^2
\]
eta = 0.1

def f_2d(x1, x2):
    return x1 ** 2 + 2 * x2 ** 2

def gd_2d(x1, x2):
    return (x1 - eta * 2 * x1, x2 - eta * 4 * x2)

show_trace_2d(f_2d, train_2d(gd_2d))
Adaptive Methods
Newton's Method
Taylor expansion of \(f\) at \(\mathbf{x} + \epsilon\):
\[f(\mathbf{x}+\epsilon)=f(\mathbf{x})+\epsilon^{\top} \nabla f(\mathbf{x})+\frac{1}{2} \epsilon^{\top} \nabla \nabla^{\top} f(\mathbf{x}) \epsilon+\mathcal{O}\left(\|\epsilon\|^{3}\right)
\]
At a minimum, \(\nabla f(\mathbf{x})=0\); that is, we want \(\nabla f(\mathbf{x} + \epsilon)=0\). Differentiating the expansion above with respect to \(\epsilon\) and dropping higher-order terms gives:
\[\nabla f(\mathbf{x})+\boldsymbol{H}_{f} \boldsymbol{\epsilon}=0 \text { and hence } \epsilon=-\boldsymbol{H}_{f}^{-1} \nabla f(\mathbf{x})
\]
c = 0.5

def f(x):
    return np.cosh(c * x)  # objective

def gradf(x):
    return c * np.sinh(c * x)  # first derivative

def hessf(x):
    return c**2 * np.cosh(c * x)  # second derivative

def newton(eta=1):
    x = 10
    results = [x]
    for i in range(10):
        x -= eta * gradf(x) / hessf(x)  # Newton update
        results.append(x)
    print('epoch 10, x:', x)
    return results

show_trace(newton())
c = 0.15 * np.pi

def f(x):
    return x * np.cos(c * x)

def gradf(x):
    return np.cos(c * x) - c * x * np.sin(c * x)

def hessf(x):
    return - 2 * c * np.sin(c * x) - x * c**2 * np.cos(c * x)

show_trace(newton())

show_trace(newton(0.5))
Convergence Analysis
We consider only the convergence rate when the function is convex and \(f''(x^*) > 0\) at the minimizer.
Let \(x_k\) be the value of \(x\) after the \(k\)-th iteration and \(e_{k}:=x_{k}-x^{*}\) the distance from \(x_k\) to the minimizer \(x^{*}\). Since \(f'(x^{*}) = 0\), expanding \(f'\) around \(x_k\) gives:
\[0=f^{\prime}\left(x_{k}-e_{k}\right)=f^{\prime}\left(x_{k}\right)-e_{k} f^{\prime \prime}\left(x_{k}\right)+\frac{1}{2} e_{k}^{2} f^{\prime \prime \prime}\left(\xi_{k}\right) \quad \text{for some } \xi_{k} \in\left[x_{k}-e_{k}, x_{k}\right]
\]
Dividing both sides by \(f''(x_k)\) gives:
\[e_{k}-f^{\prime}\left(x_{k}\right) / f^{\prime \prime}\left(x_{k}\right)=\frac{1}{2} e_{k}^{2} f^{\prime \prime \prime}\left(\xi_{k}\right) / f^{\prime \prime}\left(x_{k}\right)
\]
Substituting the update equation \(x_{k+1} = x_{k} - f^{\prime}\left(x_{k}\right) / f^{\prime \prime}\left(x_{k}\right)\) yields:
\[x_k - x^{*} - f^{\prime}\left(x_{k}\right) / f^{\prime \prime}\left(x_{k}\right) =\frac{1}{2} e_{k}^{2} f^{\prime \prime \prime}\left(\xi_{k}\right) / f^{\prime \prime}\left(x_{k}\right)
\]
\[x_{k+1} - x^{*} = e_{k+1} = \frac{1}{2} e_{k}^{2} f^{\prime \prime \prime}\left(\xi_{k}\right) / f^{\prime \prime}\left(x_{k}\right)
\]
When \(\frac{1}{2} f^{\prime \prime \prime}\left(\xi_{k}\right) / f^{\prime \prime}\left(x_{k}\right) \leq c\) for some constant \(c\), we have:
\[e_{k+1} \leq c e_{k}^{2}
\]
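This bound says that once the iterate is close to \(x^*\), the error is at least squared at every step. Below is a minimal numerical sketch of this behaviour on the \(f(x)=\cosh(cx)\) example above (minimizer \(x^* = 0\), so \(e_k = |x_k|\)); the names c_demo, grad_cosh and hess_cosh are mine, chosen so as not to overwrite the globals used elsewhere:
# Illustrative sketch: track the error e_k = |x_k - x*| for Newton's method on
# f(x) = cosh(c*x), whose minimizer is x* = 0.  Once x_k is close to x*, the error
# collapses extremely fast, consistent with the bound e_{k+1} <= c * e_k^2.
c_demo = 0.5
def grad_cosh(x):
    return c_demo * np.sinh(c_demo * x)     # f'(x)
def hess_cosh(x):
    return c_demo**2 * np.cosh(c_demo * x)  # f''(x)

x = 10.0
for k in range(10):
    x -= grad_cosh(x) / hess_cosh(x)        # Newton update
    print('iter %d, e_k = %.3e' % (k + 1, abs(x)))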
Preconditioning (gradient descent aided by the Hessian diagonal)
\[\mathbf{x} \leftarrow \mathbf{x}-\eta \operatorname{diag}\left(H_{f}\right)^{-1} \nabla f(\mathbf{x})
\]
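A minimal sketch of this preconditioned update on the quadratic \(f(x_1, x_2) = x_1^2 + 2x_2^2\) used above, whose Hessian is \(\operatorname{diag}(2, 4)\) everywhere; the name precond_gd_2d is mine, for illustration only:
# Illustrative sketch: divide each gradient coordinate by the corresponding diagonal
# entry of the Hessian, so both coordinates contract at the same rate.
eta = 0.5  # a larger step is safe once the curvature is divided out
def precond_gd_2d(x1, x2):
    g1, g2 = 2 * x1, 4 * x2   # gradient of f_2d
    h1, h2 = 2.0, 4.0         # diagonal of the Hessian of f_2d
    return (x1 - eta * g1 / h1, x2 - eta * g2 / h2)

show_trace_2d(f_2d, train_2d(precond_gd_2d))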
Gradient Descent with Line Search (Conjugate Gradient)
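As a partial illustration, here is a sketch of steepest descent with an exact line search on the same quadratic: for a quadratic with Hessian \(H\), the step size that exactly minimizes \(f\) along the direction \(-\mathbf{g}\) is \(\mathbf{g}^{\top}\mathbf{g} / \mathbf{g}^{\top} H \mathbf{g}\). Only the line-search part is shown; the conjugate-gradient refinement (combining the current gradient with previous search directions) is not implemented here, and the name line_search_gd_2d is mine:
# Illustrative sketch: steepest descent with an exact line search on f(x1, x2) = x1^2 + 2*x2^2.
def line_search_gd_2d(x1, x2):
    g1, g2 = 2 * x1, 4 * x2               # gradient
    gg = g1 * g1 + g2 * g2                # g^T g
    gHg = 2 * g1 * g1 + 4 * g2 * g2       # g^T H g, with H = diag(2, 4)
    eta_t = gg / gHg if gHg > 0 else 0.0  # exact minimizer along -g
    return (x1 - eta_t * g1, x2 - eta_t * g2)

show_trace_2d(f_2d, train_2d(line_search_gd_2d))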
Stochastic Gradient Descent
Stochastic Gradient Descent Parameter Update
For a training dataset with \(n\) examples, let \(f_i(\mathbf{x})\) be the loss function of the \(i\)-th example. The objective function is then:
\[f(\mathbf{x})=\frac{1}{n} \sum_{i=1}^{n} f_{i}(\mathbf{x})
\]
Its gradient is:
\[\nabla f(\mathbf{x})=\frac{1}{n} \sum_{i=1}^{n} \nabla f_{i}(\mathbf{x})
\]
A single update using this gradient costs \(\mathcal{O}(n)\) time.
Stochastic gradient descent instead samples an index \(i\) uniformly at random and updates at cost \(\mathcal{O}(1)\):
\[\mathbf{x} \leftarrow \mathbf{x}-\eta \nabla f_{i}(\mathbf{x})
\]
Moreover, the sampled gradient is an unbiased estimate of the full gradient:
\[\mathbb{E}_{i} \nabla f_{i}(\mathbf{x})=\frac{1}{n} \sum_{i=1}^{n} \nabla f_{i}(\mathbf{x})=\nabla f(\mathbf{x})
\]
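A quick numerical sanity check of this identity, using a toy objective \(f_i(x)=\tfrac{1}{2}(x-a_i)^2\) with hypothetical data a introduced only for the check:
# Illustrative sketch: averaging randomly sampled per-example gradients approximates
# the full gradient, i.e. the stochastic gradient is unbiased.
a = np.random.normal(size=100)                  # toy data; f_i(x) = 0.5 * (x - a_i)^2
x0 = 3.0
full_grad = np.mean(x0 - a)                     # (1/n) * sum_i f_i'(x0)
sampled = [x0 - a[np.random.randint(len(a))] for _ in range(10000)]
print(full_grad, np.mean(sampled))              # the two numbers should be close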
e.g.
\[f(x_1, x_2) = x_1^2 + 2 x_2^2
\]
def f(x1, x2):
    return x1 ** 2 + 2 * x2 ** 2

def gradf(x1, x2):
    return (2 * x1, 4 * x2)

def sgd(x1, x2):
    global lr
    (g1, g2) = gradf(x1, x2)
    # simulate the noise of a stochastic gradient by perturbing the true gradient
    (g1, g2) = (g1 + np.random.normal(0.1), g2 + np.random.normal(0.1))
    eta_t = eta * lr()  # time-dependent learning rate
    return (x1 - eta_t * g1, x2 - eta_t * g2)

eta = 0.1
lr = (lambda: 1)  # constant learning rate
show_trace_2d(f, train_2d(sgd, steps=50))
Dynamic Learning Rate
\[\begin{array}{ll}{\eta(t)=\eta_{i} \text { if } t_{i} \leq t \leq t_{i+1}} & {\text { piecewise constant }} \\ {\eta(t)=\eta_{0} \cdot e^{-\lambda t}} & {\text { exponential }} \\ {\eta(t)=\eta_{0} \cdot(\beta t+1)^{-\alpha}} & {\text { polynomial }}\end{array}
\]
def exponential():
    global ctr
    ctr += 1
    return math.exp(-0.1 * ctr)

ctr = 1
lr = exponential
show_trace_2d(f, train_2d(sgd, steps=1000))
def polynomial():
    global ctr
    ctr += 1
    return (1 + 0.1 * ctr)**(-0.5)

ctr = 1
lr = polynomial
show_trace_2d(f, train_2d(sgd, steps=50))
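A minimal sketch of the piecewise-constant schedule in the same style, halving the rate every 20 updates (the segment length and the factor are my choice):
# Illustrative sketch: piecewise-constant schedule, halving the learning rate every 20 updates.
def piecewise():
    global ctr
    ctr += 1
    return 0.5 ** (ctr // 20)   # constant within each 20-step segment

ctr = 1
lr = piecewise
show_trace_2d(f, train_2d(sgd, steps=50))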
Mini-batch Stochastic Gradient Descent
Reading the Data
def get_data_ch7():
    # load the airfoil self-noise dataset and standardize each column
    data = np.genfromtxt('../../inputs/airfoil_self_noise.dat', delimiter='\t')
    data = (data - data.mean(axis=0)) / data.std(axis=0)
    return torch.tensor(data[:1500, :-1], dtype=torch.float32), \
           torch.tensor(data[:1500, -1], dtype=torch.float32)

features, labels = get_data_ch7()
features.shape

torch.Size([1500, 5])
import pandas as pd
df = pd.read_csv('../../inputs/airfoil_self_noise.dat', delimiter='\t', header=None)
df.head(10)
       0    1       2     3         4        5
0    800  0.0  0.3048  71.3  0.002663  126.201
1   1000  0.0  0.3048  71.3  0.002663  125.201
2   1250  0.0  0.3048  71.3  0.002663  125.951
3   1600  0.0  0.3048  71.3  0.002663  127.591
4   2000  0.0  0.3048  71.3  0.002663  127.461
5   2500  0.0  0.3048  71.3  0.002663  125.571
6   3150  0.0  0.3048  71.3  0.002663  125.201
7   4000  0.0  0.3048  71.3  0.002663  123.061
8   5000  0.0  0.3048  71.3  0.002663  121.301
9   6300  0.0  0.3048  71.3  0.002663  119.541
Implementation from Scratch
def linreg(X, w, b):
    # linear regression model
    return torch.mm(X, w) + b

def squared_loss(y_hat, y):
    # squared loss, element-wise
    return ((y_hat - y.view(y_hat.size())) ** 2) / 2

def sgd(params, states, hyperparams):
    # mini-batch SGD update; 'states' is unused for plain SGD
    for p in params:
        p.data -= hyperparams['lr'] * p.grad.data
def train_ch7(optimizer_fn, states, hyperparams, features, labels,
              batch_size=10, num_epochs=2):
    net, loss = linreg, squared_loss
    w = torch.nn.Parameter(torch.tensor(np.random.normal(0, 0.01, size=(features.shape[1], 1)),
                                        dtype=torch.float32), requires_grad=True)
    b = torch.nn.Parameter(torch.zeros(1, dtype=torch.float32), requires_grad=True)

    def eval_loss():
        return loss(net(features, w, b), labels).mean().item()

    ls = [eval_loss()]
    data_iter = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(features, labels), batch_size, shuffle=True)
    for _ in range(num_epochs):
        start = time.time()
        for batch_i, (X, y) in enumerate(data_iter):
            l = loss(net(X, w, b), y).mean()  # average loss over the mini-batch
            # reset gradients before the backward pass
            if w.grad is not None:
                w.grad.data.zero_()
                b.grad.data.zero_()
            l.backward()
            optimizer_fn([w, b], states, hyperparams)  # update the parameters
            if (batch_i + 1) * batch_size % 100 == 0:
                ls.append(eval_loss())  # record the loss every 100 examples
    print('loss: %f, %f sec per epoch' % (ls[-1], time.time() - start))
    plt.plot(np.linspace(0, num_epochs, len(ls)), ls)
    plt.xlabel('epoch')
    plt.ylabel('loss')
def train_sgd(lr, batch_size, num_epochs=2):
    train_ch7(sgd, None, {'lr': lr}, features, labels, batch_size, num_epochs)
Comparison
With batch_size = 1500 (the whole dataset) the procedure is batch gradient descent, with batch_size = 1 it is stochastic gradient descent, and with batch_size = 10 it is mini-batch stochastic gradient descent:
train_sgd(1, 1500, 6)
train_sgd(0.005, 1)
train_sgd(0.05, 10)
Concise Implementation
def train_pytorch_ch7(optimizer_fn, optimizer_hyperparams, features, labels,
                      batch_size=10, num_epochs=2):
    net = nn.Sequential(
        nn.Linear(features.shape[-1], 1)
    )
    loss = nn.MSELoss()
    optimizer = optimizer_fn(net.parameters(), **optimizer_hyperparams)

    def eval_loss():
        # divide by 2 so the value is comparable with squared_loss above
        return loss(net(features).view(-1), labels).item() / 2

    ls = [eval_loss()]
    data_iter = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(features, labels), batch_size, shuffle=True)
    for _ in range(num_epochs):
        start = time.time()
        for batch_i, (X, y) in enumerate(data_iter):
            l = loss(net(X).view(-1), y) / 2
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            if (batch_i + 1) * batch_size % 100 == 0:
                ls.append(eval_loss())
    print('loss: %f, %f sec per epoch' % (ls[-1], time.time() - start))
    plt.plot(np.linspace(0, num_epochs, len(ls)), ls)
    plt.xlabel('epoch')
    plt.ylabel('loss')
train_pytorch_ch7(optim.SGD, {'lr': 0.05}, features, labels, 10)