
PyTorch 3.4.4 Softmax Fundamentals

Softmax Basics

1. Computing the derivative of Softmax

Define the Softmax function: \(S_j = \frac{e^{a_j}}{\sum_{k=1}^{N}e^{a_k}} \quad \forall j \in \{1,2,\dots,N\}\)

Given an input \(a \in \mathbb{R}^{N\times 1}\), Softmax maps \(S(a): \begin{bmatrix} a_1\\ a_2\\ \vdots \\ a_N \end{bmatrix} \to \begin{bmatrix} S_1\\ S_2\\ \vdots \\ S_N \end{bmatrix}, \; S(a)\in \mathbb{R}^{N\times 1}\), where each element is \(S_j = \frac{e^{a_j}}{\sum_{k=1}^{N}e^{a_k}}\).
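For example, with \(a = (1, 2, 3)^\top\):

\[ S(a) = \frac{1}{e^1 + e^2 + e^3}\begin{bmatrix} e^1 \\ e^2 \\ e^3 \end{bmatrix} \approx \begin{bmatrix} 0.0900 \\ 0.2447 \\ 0.6652 \end{bmatrix} \]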

Compute the derivative (in denominator layout):

\(\frac{\partial S(a)}{\partial a} = \begin{bmatrix} \frac{\partial S(a_1)}{\partial a_1}& \frac{\partial S(a_2)}{\partial a_1}& \cdots & \frac{\partial S(a_N)}{\partial a_1} \\ \frac{\partial S(a_1)}{\partial a_2}& \frac{\partial S(a_2)}{\partial a_2}& \cdots & \frac{\partial S(a_N)}{\partial a_2} \\ \vdots & \vdots & \ddots & \vdots \\ \frac{\partial S(a_1)}{\partial a_N}& \frac{\partial S(a_2)}{\partial a_N}& \cdots & \frac{\partial S(a_N)}{\partial a_N} \end{bmatrix} = DS ;\) \(\qquad D_jS_i = \frac{\partial S(a_i)}{\partial a_j}\) , \(\qquad DS = \begin{bmatrix} D_1S_1& D_1S_2& \cdots & D_1S_N \\ D_2S_1& D_2S_2& \cdots & D_2S_N \\ \vdots & \vdots & \ddots & \vdots \\ D_NS_1& D_NS_2& \cdots & D_NS_N \end{bmatrix}_{N\times N}\)

We compute the derivative with the quotient rule: for \(f(x)=\frac{g(x)}{h(x)}\), \({f}'(x)=\frac{{g}'(x)h(x)-{h}'(x)g(x)}{[h(x)]^2}\), where here \(g_i=e^{a_i}\) and \(h_i=\sum_{k=1}^{N}e^{a_k}\).

The derivative of \(g_i\) with respect to \(a_j\) is nonzero only when \(i=j\). The derivative of \(h_i\) with respect to \(a_j\) is always \(e^{a_j}\), since \(e^{a_j}\) appears exactly once in the sum.

When \(i=j\), write \(\sum\) as shorthand for \(\sum_{k=1}^{N}e^{a_k}\) to keep the algebra compact:

\[\begin{equation*} \begin{split} \frac{\partial}{\partial a_j}\frac{e^{a_i}}{\sum_{k=1}^{N}e^{a_k}} &= \frac{e^{a_i}\sum - e^{a_j}e^{a_i}}{\sum^2} \\ &= \frac{e^{a_i}}{\sum}\cdot\frac{\sum - e^{a_j}}{\sum} \\ &= S_i(1-S_j) \end{split} \end{equation*}\]

The case \(i\neq j\):

\[\begin{equation*} \begin{split} \frac{\partial}{\partial a_j}\frac{e^{a_i}}{\sum_{k=1}^{N}e^{a_k}} &= \frac{0 - e^{a_j}e^{a_i}}{\sum^2} \\ &= -\frac{e^{a_j}}{\sum}\cdot\frac{e^{a_i}}{\sum} \\ &= -S_jS_i \end{split} \end{equation*}\]

Summary: \(\begin{equation*} D_jS_i=\begin{cases} S_i(1-S_j) & i=j\\ -S_jS_i & i\neq j \end{cases} \end{equation*}\)
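To sanity-check this result, here is a minimal sketch (my own addition, not from the original post) comparing the closed-form Jacobian \(\mathrm{diag}(S) - SS^{T}\), which packs both cases above into one matrix, against PyTorch's autograd:

import torch
from torch.autograd.functional import jacobian

a = torch.randn(5)                      # arbitrary input vector
S = torch.softmax(a, dim=0)

# Closed form: D_jS_i = S_i(1 - S_j) if i == j, else -S_jS_i,
# i.e. diag(S) - S S^T (symmetric, so the layout convention does not matter here)
analytic = torch.diag(S) - torch.outer(S, S)

# Jacobian computed by autograd for comparison
auto = jacobian(lambda t: torch.softmax(t, dim=0), a)

print(torch.allclose(analytic, auto, atol=1e-6))  # True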

2. Numerical issues when computing Softmax

We already know \(softmax(x) = \frac{e^{x}}{\sum_{i=1}^{n}e^{x_i}}\) for \(x = \begin{pmatrix} x_{1} \\ x_{2} \\ \vdots \\ x_{n} \end{pmatrix}\). If some component of \(x\) is very large, the exponential can overflow: for \(x = \begin{pmatrix} x_{1} = 200000 \\ x_{2}=1 \\ x_{3}=10 \end{pmatrix}\), \(e^{200000}\) exceeds the largest representable floating-point number (float64 overflows once the argument passes roughly 709). Before computing softmax we therefore subtract the maximum of \(x\) from every element. For a small example with \(x = (2, 1, 5)^\top\):

\[x = \begin{pmatrix} x_{1} = 2 \\ x_{2}=1 \\ x_{3}=5 \end{pmatrix} \qquad e^x = \begin{pmatrix} e^2 \\ e^1 \\ e^5 \end{pmatrix} \qquad softmax(x) = \begin{pmatrix} 0.0466 \\ 0.0171 \\ 0.9362 \end{pmatrix} \]

\[x' = x - 5 = \begin{pmatrix} x'_{1} = -3 \\ x'_{2}=-4 \\ x'_{3}=0 \end{pmatrix} \qquad e^{x'} = \begin{pmatrix} e^{-3} \\ e^{-4} \\ e^0 \end{pmatrix} \qquad softmax(x') = \begin{pmatrix} 0.0466 \\ 0.0171 \\ 0.9362 \end{pmatrix} \]

Subtracting the maximum from every element does not change the result, since \(\frac{e^{x_i-c}}{\sum_{k}e^{x_k-c}} = \frac{e^{-c}\,e^{x_i}}{e^{-c}\sum_{k}e^{x_k}} = \frac{e^{x_i}}{\sum_{k}e^{x_k}}\) for any constant \(c\).
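A minimal sketch of the trick (the function names here are my own):

import torch

def naive_softmax(x):
    return torch.exp(x) / torch.exp(x).sum()

def stable_softmax(x):
    shifted = x - x.max()               # subtract the max before exponentiating
    return torch.exp(shifted) / torch.exp(shifted).sum()

x = torch.tensor([200000.0, 1.0, 10.0])
print(naive_softmax(x))   # tensor([nan, 0., 0.]) -- exp(200000) overflows to inf
print(stable_softmax(x))  # tensor([1., 0., 0.])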

References:
Softmax函数及其导数 (The Softmax Function and Its Derivative). An in-depth article that works well as a template for review.

3. Implementing Fashion-MNIST classification with Softmax

Step 1. Import the necessary libraries
import torch
from d2l import torch as d2l
from torchvision import transforms
import torchvision
from torch.utils import data
from IPython import display  # used by the Animator class in Step 8
Step 2. Load the data
def get_dataloader_workers():  #@save
    """Use 4 processes to read the data"""
    return 4

def load_Data(batch_size, resize=None):
    '''Download the Fashion-MNIST dataset and load it into memory'''
    trans = [transforms.ToTensor()]
    if resize:
        trans.insert(0, transforms.Resize(resize))  # scale the image's shorter edge to `resize`, keeping the aspect ratio
    trans = transforms.Compose(trans)  # chain multiple transforms into one, analogous to the torch.nn.Sequential container
    mnist_train = torchvision.datasets.FashionMNIST(root='../data',
                                                    train=True, transform=trans, download=True)
    mnist_test = torchvision.datasets.FashionMNIST(root='../data',
                                                   train=False, transform=trans, download=True)
    # torchvision.datasets.FashionMNIST(root, train=True, transform=None, target_transform=None, download=False)
    return (data.DataLoader(mnist_train, batch_size, shuffle=True, num_workers=get_dataloader_workers()),
            data.DataLoader(mnist_test, batch_size, shuffle=False, num_workers=get_dataloader_workers()))
    
batch_size = 256 
train_iter ,test_iter = load_Data(batch_size) 
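As a quick check (my own addition), inspect the shapes of one batch:

X, y = next(iter(train_iter))
X.shape, y.shape  # (torch.Size([256, 1, 28, 28]), torch.Size([256]))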
Step 3. Define the softmax function and test it on the loaded data
# Define the softmax operation
def softmax(X):
    X_exp = torch.exp(X)  # note: exponentiating directly can overflow for large inputs (see Section 2)
    denominator = X_exp.sum(dim=1, keepdim=True)  # keepdim=True preserves the dim so broadcasting works in the division
    return X_exp / denominator

# Test the softmax function
X = torch.normal(mean=0,std=0.01,size=(2,5)) 
prob_X = softmax(X)
prob_X,prob_X.sum(dim=1)
Output[1]:(tensor([[0.2001, 0.2007, 0.1989, 0.1999, 0.2004],
         [0.2027, 0.1985, 0.1980, 0.2013, 0.1995]]),
 tensor([1., 1.]))
Step 4. Define the model and initialize the weights and bias
num_inputs = 784
num_outputs = 10

W = torch.normal(0, 0.01, size=(num_inputs, num_outputs), requires_grad=True) # w[784,10] x[n,784]  
b = torch.zeros(num_outputs, requires_grad=True)
# Define the model:
def net(X):
    # flatten each 28x28 image into a length-784 row vector before the linear layer
    return softmax(torch.matmul(X.reshape((-1, W.shape[0])), W) + b)

# Test the model
X = torch.normal(mean=0, std=0.01, size=(2, 784))
prob_X = net(X)
prob_X, prob_X.sum(dim=1)  # each row of predicted probabilities sums to 1
Step 5. Define the loss function
# Define the loss function
def cross_entropy(y_hat, y):
    # Cross-entropy loss: H(p, q) = -sum p log q; since the true label's probability is p = 1,
    # only -log q of the true class remains, where q is the predicted probability
    return - torch.log(y_hat[range(len(y_hat)), y])

y = torch.tensor([1, 2])
y_hat = torch.tensor([[0.1, 0.3, 0.6], [0.3, 0.2, 0.5]])
y_hat[[0, 1], y] # picks the predicted probabilities of the true classes: [0.3, 0.5]

cross_entropy(y_hat, y)
Output[2]:tensor([1.2040, 0.6931])
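As a sanity check (my own addition), the same per-example losses come out of PyTorch's built-in negative log-likelihood loss applied to the log-probabilities:

import torch.nn.functional as F

# F.nll_loss expects log-probabilities; reduction='none' keeps per-example losses
F.nll_loss(torch.log(y_hat), y, reduction='none')  # tensor([1.2040, 0.6931])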
Step 6. Define classification accuracy
def Accuracy(y_hat, y):  #@save
    """Count the number of correct predictions"""
    if len(y_hat.shape) > 1 and y_hat.shape[1] > 1:
        y_hat = y_hat.argmax(axis=1)  # take the index of the largest value in each row as the predicted class
    cmp = y_hat.type(y.dtype) == y    # element-wise comparison gives a tensor of 0s and 1s
    return float(cmp.type(y.dtype).sum())

#y_hat = y_hat.argmax(axis=1)
#y_hat.type(y.dtype) == y  # tensor([False,  True]) 

Accuracy(y_hat, y) / len(y) #0.5
Step 7. Compute the model's accuracy on a given dataset
class Accumulator:  #@save
    """Accumulate sums over n variables"""
    def __init__(self, n):
        self.data = [0.0] * n

    def add(self, *args):
        self.data = [a + float(b) for a, b in zip(self.data, args)]

    def reset(self):
        self.data = [0.0] * len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
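For example (my own illustration), accumulating correct counts and totals:

metric = Accumulator(2)
metric.add(3, 10)        # 3 correct out of 10 examples
metric.add(5, 10)        # 5 correct out of another 10
metric[0] / metric[1]    # 0.4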
    
def evaluate_accuracy(net, data_iter):  #@save
    """Compute the accuracy of a model on the given dataset"""
    if isinstance(net, torch.nn.Module):  # net returns the softmax-transformed output as a torch.Tensor
        net.eval()  # set the model to evaluation mode
    metric = Accumulator(2)  # number of correct predictions, total number of predictions
    with torch.no_grad():
        for X, y in data_iter:
            metric.add(Accuracy(net(X), y), y.numel())  # y.numel() is the number of elements in y
    return metric[0] / metric[1]

evaluate_accuracy(net, test_iter) # 0.1594, essentially random guessing (0.1) since the model is untrained
Step 8. Define a utility class for visualizing data
# Before implementing the training function, we define a utility class Animator that plots
# data in an animation; it simplifies the code in the rest of the book.
class Animator:  #@save
    """Plot data in an animation"""
    def __init__(self, xlabel=None, ylabel=None, legend=None, xlim=None,
                 ylim=None, xscale='linear', yscale='linear',
                 fmts=('-', 'm--', 'g-.', 'r:'), nrows=1, ncols=1,
                 figsize=(3.5, 2.5)):
        # Incrementally plot multiple lines
        if legend is None:
            legend = []
        d2l.use_svg_display()
        self.fig, self.axes = d2l.plt.subplots(nrows, ncols, figsize=figsize)
        if nrows * ncols == 1:
            self.axes = [self.axes, ]
        # Use a lambda to capture the axis-configuration arguments
        self.config_axes = lambda: d2l.set_axes(
            self.axes[0], xlabel, ylabel, xlim, ylim, xscale, yscale, legend)
        self.X, self.Y, self.fmts = None, None, fmts

    def add(self, x, y):
        # Add multiple data points to the chart
        if not hasattr(y, "__len__"):
            y = [y]
        n = len(y)
        if not hasattr(x, "__len__"):
            x = [x] * n
        if not self.X:
            self.X = [[] for _ in range(n)]
        if not self.Y:
            self.Y = [[] for _ in range(n)]
        for i, (a, b) in enumerate(zip(x, y)):
            if a is not None and b is not None:
                self.X[i].append(a)
                self.Y[i].append(b)
        self.axes[0].cla()
        for x, y, fmt in zip(self.X, self.Y, self.fmts):
            self.axes[0].plot(x, y, fmt)
        self.config_axes()
        display.display(self.fig)
        display.clear_output(wait=True)
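A quick smoke test of Animator (my own example; it is meant to run in a Jupyter notebook, since the class draws through IPython's display):

animator = Animator(xlabel='step', legend=['y = x^2'])
for i in range(1, 6):
    animator.add(i, i ** 2)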
Step 9. Define one training epoch
def train_epoch_ch3(net, train_iter, loss, updater):  #@save
    """Train the model for one epoch (defined in Chapter 3)"""
    # Set the model to training mode
    if isinstance(net, torch.nn.Module):
        net.train()
    # Sum of training loss, sum of training accuracy, number of examples
    metric = Accumulator(3)
    for X, y in train_iter:
        # Compute gradients and update the parameters
        y_hat = net(X)
        l = loss(y_hat, y)
        if isinstance(updater, torch.optim.Optimizer):
            # Use PyTorch's built-in optimizer and loss function
            updater.zero_grad()
            l.sum().backward()
            updater.step()
        else:
            # Use the custom optimizer and loss function
            l.sum().backward()
            updater(X.shape[0])
        metric.add(float(l.sum()), Accuracy(y_hat, y), y.numel())
    # Return the training loss and training accuracy
    return metric[0] / metric[2], metric[1] / metric[2]
Step 10. Define the full training function
# Now train the model
def train_ch3(net, train_iter, test_iter, loss, num_epochs, updater):  #@save
    """Train a model (defined in Chapter 3)"""
    animator = Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0.3, 0.9],
                        legend=['train loss', 'train acc', 'test acc'])
    for epoch in range(num_epochs):
        train_metrics = train_epoch_ch3(net, train_iter, loss, updater)
        test_acc = evaluate_accuracy(net, test_iter)
        animator.add(epoch + 1, train_metrics + (test_acc,))
    train_loss, train_acc = train_metrics
    assert train_loss < 0.5, train_loss
    assert train_acc <= 1 and train_acc > 0.7, train_acc
    assert test_acc <= 1 and test_acc > 0.7, test_acc
Step 11. Define the parameter update function
# Define the updater, using the mini-batch stochastic gradient descent we learned earlier
lr = 0.1
def updater(batch_size):
    return d2l.sgd([W, b], lr, batch_size)
num_epochs = 10
train_ch3(net, train_iter, test_iter, cross_entropy, num_epochs, updater)
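For reference, d2l.sgd applies a minibatch SGD step to each parameter. A minimal sketch of what it does (paraphrased from d2l; treat it as an approximation of the library code, not the library itself):

def sgd(params, lr, batch_size):
    """Minibatch stochastic gradient descent (sketch of d2l.sgd)."""
    with torch.no_grad():
        for param in params:
            # divide by batch_size because the loss was summed over the batch, not averaged
            param -= lr * param.grad / batch_size
            param.grad.zero_()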

Step 12. Inspect the predictions
# Prediction
def predict_ch3(net, test_iter, n=6):
    '''Predict labels'''
    for X, y in test_iter:
        break
    trues = d2l.get_fashion_mnist_labels(y)
    preds = d2l.get_fashion_mnist_labels(net(X).argmax(axis=1))
    titles = [true + '\n' + pred for true, pred in zip(trues, preds)]
    d2l.show_images(X[0:n].reshape((n, 28, 28)), 1, n, titles)
    # d2l.show_images(imgs, num_rows, num_cols, titles=None, scale=1.5)
predict_ch3(net, test_iter)

posted on 2021-12-25 23:43 by YangShusen