Pytorch 3.4.4 Softmax 基础知识
Softmax 基础
1.计算Softmax的导数
定义Softmax函数:\(S_j = \frac{e^{a_j}}{\sum_{k=1}^{N}e^{a_k}} \quad \forall j \in \{1,2,\cdots,N\}\)
输入\(a \in \mathbb{R}^{N\times 1}\) , 用Softmax映射: \(S(a): \begin{bmatrix} a_1\\ a_2\\ \vdots \\ a_N \end{bmatrix} \rightarrow \begin{bmatrix} S_1\\ S_2\\ \vdots \\ S_N \end{bmatrix} ,\; S(a) \in \mathbb{R}^{N\times 1}\) 其中的一个元素\(S_j = \frac{e^{a_j}}{\sum_{k=1}^{N}e^{a_k}}\)
计算导数: (分母布局)
\(\frac{\partial S(a)}{\partial a} = \begin{bmatrix} \frac{\partial S_1}{\partial a_1}& \frac{\partial S_2}{\partial a_1}& \cdots & \frac{\partial S_N}{\partial a_1} \\ \frac{\partial S_1}{\partial a_2}& \frac{\partial S_2}{\partial a_2}& \cdots & \frac{\partial S_N}{\partial a_2} \\ \vdots & \vdots & \ddots & \vdots \\ \frac{\partial S_1}{\partial a_N}& \frac{\partial S_2}{\partial a_N}& \cdots & \frac{\partial S_N}{\partial a_N} \end{bmatrix} = DS ;\) \(\qquad D_jS_i = \frac{\partial S_i}{\partial a_j}\) , \(\qquad DS = \begin{bmatrix} D_1S_1& D_1S_2& \cdots & D_1S_N \\ D_2S_1& D_2S_2& \cdots & D_2S_N \\ \vdots & \vdots & \ddots & \vdots \\ D_NS_1& D_NS_2& \cdots & D_NS_N \end{bmatrix}_{N\times N}\)
我们将使用商的求导法则(quotient rule)来计算导数,即对于 \(f(x)=\frac{g(x)}{h(x)} \qquad\) \(\begin{equation*} {f}'(x)=\frac{{g}'(x)h(x)-{h}'(x)g(x)}{[h(x)]^2} \end{equation*}\qquad\) \(\begin{equation*} \begin{split} g_i=e^{a_i} \\ h_i=\sum_{k=1}^{N}e^{a_k} \end{split} \end{equation*}\)
对于 \(g_i\) ,只有 \(i=j\) 的时候它对 \(a_j\) 的导数才不为0(此时为 \(e^{a_j}\))。 而 \(h_i\) 对 \(a_j\) 求导,任何时候都为 \(e^{a_j}\) 。
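\(i=j\) 的时候,用\(\sum\) 代替 \(\sum_{k=1}^{N}e^{a_k}\) ,简化计算: \(D_jS_i = \frac{e^{a_i}\sum - e^{a_j}e^{a_i}}{\sum^2} = \frac{e^{a_i}}{\sum}\cdot\frac{\sum - e^{a_j}}{\sum} = S_i(1-S_j)\)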
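\(i\neq j\) 的情况: \(D_jS_i = \frac{0\cdot\sum - e^{a_j}e^{a_i}}{\sum^2} = -\frac{e^{a_j}}{\sum}\cdot\frac{e^{a_i}}{\sum} = -S_jS_i\)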
总结:\(\begin{equation*} D_jS_i=\left\{\begin{matrix} S_i(1-S_j) & i=j\\ -S_jS_i & i\neq j \end{matrix}\right. \end{equation*}\)
2.Softmax计算过程中遇到的一些问题
我们已经知道 \(softmax(x)_j = \frac{e^{x_j}}{\sum_{i=1}^{n}e^{x_i}}\) ,那么当我们的\(x = \begin{pmatrix} x_{1} \\ x_{2} \\ \cdots \\ x_{n} \end{pmatrix}\) 的时候,如果x的某个元素过大,指数运算就有可能因为数字过大导致计算溢出,比如\(x = \begin{pmatrix} x_{1} = 200000 \\ x_{2}=1 \\ x_{n}=10 \end{pmatrix}\) 这种情况。因此在计算之前我们需要先将x的所有元素都减去x的最大值,再进行softmax的运算: \(softmax(x)_j = \frac{e^{x_j - \max(x)}}{\sum_{i=1}^{n}e^{x_i - \max(x)}}\)
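将所有的值都减去最大值并不会改变最后的结果,因为对任意常数 \(c\) 都有 \(\frac{e^{x_j-c}}{\sum_{i=1}^{n}e^{x_i-c}} = \frac{e^{-c}e^{x_j}}{e^{-c}\sum_{i=1}^{n}e^{x_i}} = \frac{e^{x_j}}{\sum_{i=1}^{n}e^{x_i}}\) 。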
参看文章/文献:
Softmax函数及其导数 这篇文章十分有深度,可以当做模版复习。
3.用Softmax实现FashionMNIST分类
step1.引入必要的库函数
import torch
from d2l import torch as d2l
from torchvision import transforms
import torchvision
from torch.utils import data
step2.读取数据
def get_dataloader_workers():  #@save
    """Return the number of worker processes used to read the data."""
    num_workers = 4
    return num_workers
def load_Data(batch_size, resize=None):
    """Download the Fashion-MNIST dataset and wrap it in two data loaders.

    Args:
        batch_size: Number of samples per mini-batch for both loaders.
        resize: Optional size forwarded to ``transforms.Resize``; when falsy
            the images are not resized.

    Returns:
        A ``(train_loader, test_loader)`` tuple of ``data.DataLoader`` objects.
        Only the training loader shuffles its samples.
    """
    trans = [transforms.ToTensor()]
    if resize:
        # Inserted at position 0 so the resize runs before ToTensor.
        trans.insert(0, transforms.Resize(resize))
    # Compose chains the transforms into a single callable.
    trans = transforms.Compose(trans)
    mnist_train = torchvision.datasets.FashionMNIST(
        root='../data', train=True, transform=trans, download=True)
    mnist_test = torchvision.datasets.FashionMNIST(
        root="../data", train=False, transform=trans, download=True)
    num_workers = get_dataloader_workers()
    # The test split is not shuffled: evaluation does not depend on sample
    # order, and a fixed order makes the first batch reproducible.
    return (
        data.DataLoader(mnist_train, batch_size, shuffle=True,
                        num_workers=num_workers),
        data.DataLoader(mnist_test, batch_size, shuffle=False,
                        num_workers=num_workers),
    )
# Mini-batch size shared by the training and test loaders.
batch_size = 256
train_iter, test_iter = load_Data(batch_size)
step3.定义Softmax函数和测试已经读取的数据
# 定义softmax操作
def softmax(X):
    """Apply a numerically stable softmax along ``dim=1`` of ``X``.

    The per-row maximum is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps ``torch.exp`` from overflowing
    to ``inf`` (and the division from producing ``nan``) on large inputs.

    Args:
        X: Tensor with at least two dimensions; normalisation is over dim 1.

    Returns:
        Tensor of the same shape as ``X`` whose entries along dim 1 sum to 1.
    """
    # Detached: the shift is a constant offset and needs no gradient path.
    row_max = X.max(dim=1, keepdim=True).values.detach()
    X_exp = torch.exp(X - row_max)
    # keepdim=True keeps the reduced axis so the division broadcasts per row.
    denominator = X_exp.sum(dim=1, keepdim=True)
    return X_exp / denominator
# Quick check of softmax: every row of the result should sum to 1.
X = torch.normal(mean=0, std=0.01, size=(2, 5))
prob_X = softmax(X)
prob_X, prob_X.sum(dim=1)
Output[1]:(tensor([[0.2001, 0.2007, 0.1989, 0.1999, 0.2004],
[0.2027, 0.1985, 0.1980, 0.2013, 0.1995]]),
tensor([1., 1.]))
step4.定义模型和设置偏置和权重
# Input and output sizes of the linear layer.
num_inputs = 784
num_outputs = 10

# Weights: small Gaussian noise, shape (num_inputs, num_outputs).
W = torch.normal(
    mean=0,
    std=0.01,
    size=(num_inputs, num_outputs),
    requires_grad=True,
)
# Bias: one zero-initialised entry per output.
b = torch.zeros(num_outputs, requires_grad=True)
# 定义模型:
def net(X):
    """Softmax-regression forward pass.

    Reshapes ``X`` to ``(-1, W.shape[0])``, applies the affine map
    ``X @ W + b`` and passes the result through ``softmax``.
    """
    flat = X.reshape((-1, W.shape[0]))
    logits = torch.matmul(flat, W) + b
    return softmax(logits)
# Sanity-check the model itself (the original cell only re-ran the softmax
# demo): feed two random inputs through `net`; each output row should sum to 1.
X = torch.normal(mean=0, std=0.01, size=(2, num_inputs))
prob_X = net(X)
prob_X, prob_X.sum(dim=1)
step5.定义损失函数
#定义损失函数
def cross_entropy(y_hat, y):
    """Return the per-sample cross-entropy loss.

    For each row ``i`` of ``y_hat`` the entry at column ``y[i]`` is selected
    and its negative log is returned. With a one-hot true distribution the
    general form ``-sum(p * log q)`` reduces to exactly this single term.

    Args:
        y_hat: Tensor of predicted values indexed as ``[sample, class]``.
        y: Class index of the true label for each sample.

    Returns:
        Tensor with one loss value per sample.
    """
    rows = range(len(y_hat))
    picked = y_hat[rows, y]
    return -torch.log(picked)
# True class indices for two samples.
y = torch.tensor([1, 2])
# One row of three per-class values for each of the two samples.
y_hat = torch.tensor([[0.1, 0.3, 0.6], [0.3, 0.2, 0.5]])
y_hat[[0, 1], y] # value at the true class of each sample: [0.3, 0.5]
cross_entropy(y_hat, y)
Output[2]:tensor([1.2040, 0.6931])
step6.定义分类精度
def Accuracy(y_hat, y):  #@save
    """Count how many predictions match the labels ``y``.

    Args:
        y_hat: Either per-class scores (more than one column, in which case
            the argmax of each row is the prediction) or predicted classes.
        y: Tensor of true labels.

    Returns:
        The number of correct predictions, as a float.
    """
    has_class_scores = len(y_hat.shape) > 1 and y_hat.shape[1] > 1
    if has_class_scores:
        # Reduce each row of scores to the index of its largest entry.
        y_hat = y_hat.argmax(axis=1)
    # Cast to the label dtype before comparing; the result is a boolean mask.
    matches = y_hat.type(y.dtype) == y
    return float(matches.type(y.dtype).sum())
# Row-wise argmax of y_hat is [2, 2]; compared with y = [1, 2] that gives
# [False, True], i.e. one of the two predictions is correct -> 0.5.
Accuracy(y_hat, y) / len(y)
step7.计算在指定数据集上模型的精度
class Accumulator:  #@save
    """Keep running sums of ``n`` quantities."""

    def __init__(self, n):
        """Start with ``n`` totals, all equal to 0.0."""
        self.data = [0.0 for _ in range(n)]

    def add(self, *args):
        """Add each argument (converted to float) to the matching total."""
        updated = []
        for total, value in zip(self.data, args):
            updated.append(total + float(value))
        self.data = updated

    def reset(self):
        """Set every total back to 0.0."""
        self.data = [0.0 for _ in self.data]

    def __getitem__(self, idx):
        """Return the total stored at position ``idx``."""
        return self.data[idx]
def evaluate_accuracy(net, data_iter):  #@save
    """Compute the accuracy of ``net`` on the dataset yielded by ``data_iter``.

    Args:
        net: Callable mapping a batch ``X`` to predictions; if it is a
            ``torch.nn.Module`` it is switched to evaluation mode first.
        data_iter: Iterable of ``(X, y)`` batches.

    Returns:
        Number of correct predictions divided by the number of labels seen.
    """
    if isinstance(net, torch.nn.Module):
        net.eval()
    # Slot 0: correct predictions, slot 1: total number of labels.
    metric = Accumulator(2)
    with torch.no_grad():
        for X, y in data_iter:
            correct = Accuracy(net(X), y)
            metric.add(correct, y.numel())
    num_correct, num_seen = metric[0], metric[1]
    return num_correct / num_seen
# Accuracy of the still-untrained model on the test set
# (the author's run printed 0.1594).
evaluate_accuracy(net, test_iter)
step8.定义数据可视化类
# 在展示训练函数的实现之前,我们定义一个在动画中绘制数据的实用程序类Animator, 它能够简化本书其余部分的代码。
class Animator:  #@save
    """Incrementally plot one or more curves, redrawing on every update."""

    def __init__(self, xlabel=None, ylabel=None, legend=None, xlim=None,
                 ylim=None, xscale='linear', yscale='linear',
                 fmts=('-', 'm--', 'g-.', 'r:'), nrows=1, ncols=1,
                 figsize=(3.5, 2.5)):
        """Create the figure and remember how its first axes are configured.

        Args:
            xlabel, ylabel, xlim, ylim, xscale, yscale, legend: Forwarded to
                ``d2l.set_axes`` for the first axes on every redraw.
            fmts: Line format strings, one per curve.
            nrows, ncols, figsize: Forwarded to ``d2l.plt.subplots``.
        """
        if legend is None:
            legend = []
        d2l.use_svg_display()
        self.fig, self.axes = d2l.plt.subplots(nrows, ncols, figsize=figsize)
        if nrows * ncols == 1:
            # Wrap the single axes object so self.axes[0] is always valid.
            self.axes = [self.axes, ]
        # The lambda captures the axis settings so they can be re-applied
        # after the axes are cleared in add().
        self.config_axes = lambda: d2l.set_axes(
            self.axes[0], xlabel, ylabel, xlim, ylim, xscale, yscale, legend)
        self.X, self.Y, self.fmts = None, None, fmts

    def add(self, x, y):
        """Append data points to the curves and redraw the figure.

        Args:
            x: A single x value (reused for every curve) or one per curve.
            y: A single y value or a sequence with one value per curve.
        """
        # `display` was used below without ever being imported, which raised
        # NameError on the first call; import it where it is needed.
        from IPython import display

        if not hasattr(y, "__len__"):
            y = [y]
        n = len(y)
        if not hasattr(x, "__len__"):
            x = [x] * n
        # Lazily create one x-list and one y-list per curve.
        if not self.X:
            self.X = [[] for _ in range(n)]
        if not self.Y:
            self.Y = [[] for _ in range(n)]
        for i, (a, b) in enumerate(zip(x, y)):
            if a is not None and b is not None:
                self.X[i].append(a)
                self.Y[i].append(b)
        # Clear and replot everything, then restore the axis configuration.
        self.axes[0].cla()
        for x, y, fmt in zip(self.X, self.Y, self.fmts):
            self.axes[0].plot(x, y, fmt)
        self.config_axes()
        display.display(self.fig)
        display.clear_output(wait=True)
step9.定义训练周期
def train_epoch_ch3(net, train_iter, loss, updater):  #@save
    """Train ``net`` for one epoch.

    Args:
        net: Callable mapping a batch ``X`` to predictions; if it is a
            ``torch.nn.Module`` it is switched to training mode first.
        train_iter: Iterable of ``(X, y)`` training batches.
        loss: Callable ``loss(y_hat, y)`` returning a loss tensor.
        updater: Either a ``torch.optim.Optimizer`` or a callable taking the
            batch size and updating the parameters.

    Returns:
        A tuple ``(training loss per sample, training accuracy)``.
    """
    if isinstance(net, torch.nn.Module):
        net.train()
    # Slots: summed training loss, correct predictions, number of samples.
    metric = Accumulator(3)
    for X, y in train_iter:
        y_hat = net(X)
        l = loss(y_hat, y)
        if isinstance(updater, torch.optim.Optimizer):
            # Built-in PyTorch optimizer.
            updater.zero_grad()
            l.sum().backward()
            updater.step()
        else:
            # Custom updater. NOTE(review): gradients are not zeroed here, so
            # the updater is presumably expected to do that itself - confirm.
            l.sum().backward()
            updater(X.shape[0])
        # The file defines `Accuracy`; the previous lowercase `accuracy`
        # was undefined and raised NameError.
        metric.add(float(l.sum()), Accuracy(y_hat, y), y.numel())
    return metric[0] / metric[2], metric[1] / metric[2]
step10.定义训练模型
# 正式开始训练模型
def train_ch3(net, train_iter, test_iter, loss, num_epochs, updater):  #@save
    """Train ``net`` for ``num_epochs`` epochs and plot the progress.

    After every epoch the training loss, training accuracy and test accuracy
    are added to an ``Animator`` plot. After the last epoch the final metrics
    are checked with assertions.
    """
    animator = Animator(
        xlabel='epoch',
        xlim=[1, num_epochs],
        ylim=[0.3, 0.9],
        legend=['train loss', 'train acc', 'test acc'],
    )
    for epoch in range(num_epochs):
        train_metrics = train_epoch_ch3(net, train_iter, loss, updater)
        test_acc = evaluate_accuracy(net, test_iter)
        # One point per curve: (train loss, train acc, test acc).
        animator.add(epoch + 1, train_metrics + (test_acc,))
    train_loss, train_acc = train_metrics
    # Sanity checks on the metrics of the final epoch.
    assert train_loss < 0.5, train_loss
    assert 0.7 < train_acc <= 1, train_acc
    assert 0.7 < test_acc <= 1, test_acc
step11.定义更新参数的函数
# Parameter update: mini-batch stochastic gradient descent with a fixed
# learning rate.
lr = 0.1


def updater(batch_size):
    """Apply one ``d2l.sgd`` step to the model parameters ``W`` and ``b``."""
    params = [W, b]
    return d2l.sgd(params, lr, batch_size)


num_epochs = 10
train_ch3(net, train_iter, test_iter, cross_entropy, num_epochs, updater)
step12.查看预测的结果
#预测
def predict_ch3(net, text_iter, n=6):
    """Show the first ``n`` images of one batch with true and predicted labels.

    Args:
        net: Callable mapping a batch of images to per-class scores.
        text_iter: Iterable yielding ``(X, y)`` batches. The parameter name is
            kept unchanged for backward compatibility; the function now reads
            from it instead of the global ``test_iter``.
        n: Number of images to display.
    """
    # Only the first batch is needed.
    X, y = next(iter(text_iter))
    trues = d2l.get_fashion_mnist_labels(y)
    preds = d2l.get_fashion_mnist_labels(net(X).argmax(axis=1))
    # Each title shows the true label on the first line, the prediction below.
    titles = [true + '\n' + pred for true, pred in zip(trues, preds)]
    # Pass only the titles that belong to the n images being shown.
    d2l.show_images(X[0:n].reshape((n, 28, 28)), 1, n, titles[0:n])
# show_images(imgs, num_rows, num_cols, titles=None, scale=1.5):
# Visualise the predictions for one batch of the test set.
predict_ch3(net, test_iter)
posted on 2021-12-25 23:43 YangShusen' 阅读(420) 评论(0) 编辑 收藏 举报