Week 2 Assignment: Multilayer Perceptron
I. Theory Notes
(I) Linear Regression
1. Basic Concepts
(1) Linear regression is a weighted sum of an n-dimensional input plus a bias (formulas sketched after this list).
(2) The squared loss can be used to measure the gap between the prediction and the true value.
(3) Linear regression has a closed-form (analytic) solution.
(4) Linear regression can be viewed as a single-layer neural network.
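The formula images from the original notes were lost in extraction; as a sketch of points (1)-(3) above (with the bias folded into w by appending a constant-1 feature for the closed-form solution), the model, the squared loss, and the analytic solution can be written as:

\hat{y} = \langle \mathbf{w}, \mathbf{x} \rangle + b, \qquad
\ell(y, \hat{y}) = \tfrac{1}{2}\,(y - \hat{y})^2, \qquad
\mathbf{w}^{*} = (\mathbf{X}^\top \mathbf{X})^{-1} \mathbf{X}^\top \mathbf{y}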
2. Basic Optimization Algorithms
Gradient descent: w_t = w_{t-1} - \eta \, \nabla_{w}\ell(w_{t-1}), where the learning rate \eta (the step size) is a hyperparameter.
Minibatch stochastic gradient descent: randomly sample b examples to approximate the loss, where the batch size b is another hyperparameter.
Gradient descent solves for the parameters by repeatedly updating them in the direction of the negative gradient, as in the sketch below.
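A minimal sketch of one minibatch SGD step (written out for reference; the d2l.sgd helper used later in this note behaves essentially like this):

import torch

def sgd(params, lr, batch_size):
    # Minibatch SGD update: w <- w - lr * grad / batch_size
    with torch.no_grad():              # the update itself must not be tracked by autograd
        for param in params:
            param -= lr * param.grad / batch_size
            param.grad.zero_()         # clear gradients before the next minibatch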
(II) Softmax Regression: a Multiclass Classification Model
1. From Regression to Multiclass Classification
(1) Squared loss
First, one-hot encoding guarantees that exactly one position is active (a 1 at the true class, 0 elsewhere).
The class with the largest output value is taken as the prediction.
(2) Uncalibrated scores: we want the model to identify the correct class with higher confidence (its score should exceed the others by a clear margin).
(3) Calibrated probabilities: softmax turns the raw outputs into non-negative values that sum to 1.
(4) Cross-entropy loss: measures the difference between two probability distributions.
(5) Gradient: see the formulas sketched below.
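The formula images are missing from the extracted notes; as a sketch of points (3)-(5), softmax, the cross-entropy loss, and its gradient with respect to the raw outputs o are:

\hat{y}_i = \mathrm{softmax}(\mathbf{o})_i = \frac{\exp(o_i)}{\sum_k \exp(o_k)}, \qquad
\ell(\mathbf{y}, \hat{\mathbf{y}}) = -\sum_i y_i \log \hat{y}_i, \qquad
\frac{\partial \ell}{\partial o_i} = \mathrm{softmax}(\mathbf{o})_i - y_i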
2. Loss Functions
(1) Squared loss (L2 loss):
(2) Absolute-value loss (L1 loss):
(3) Huber's robust loss:
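The definition images were lost; as a reminder, the three losses as presented in the lecture are:

L2: \ell(y, y') = \tfrac{1}{2}(y - y')^2
L1: \ell(y, y') = |y - y'|
Huber: \ell(y, y') = |y - y'| - \tfrac{1}{2} \ \text{ if } |y - y'| > 1, \quad \tfrac{1}{2}(y - y')^2 \ \text{ otherwise}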
3. Implementation (from-scratch version)
import torch
from IPython import display
# pip install d2l
from d2l import torch as d2l

batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

num_inputs = 784   # each 28*28 image is flattened into a 784-dim vector
num_outputs = 10   # 10 classes, so the output dimension is 10

W = torch.normal(0, 0.01, size=(num_inputs, num_outputs), requires_grad=True)
b = torch.zeros(num_outputs, requires_grad=True)

# define softmax; first a small demo of summing along each axis
X = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
X.sum(0, keepdim=True), X.sum(1, keepdim=True)
# (tensor([[5., 7., 9.]]), tensor([[ 6.], [15.]]))

def softmax(X):
    X_exp = torch.exp(X)
    partition = X_exp.sum(1, keepdim=True)  # sum over each row
    return X_exp / partition  # broadcasting: row i is divided by the i-th element of partition

X = torch.normal(0, 1, (2, 5))
X_prob = softmax(X)
X_prob, X_prob.sum(1)
# (tensor([[0.1276, 0.1159, 0.4678, 0.0687, 0.2200],
#          [0.0985, 0.2689, 0.1220, 0.2625, 0.2481]]), tensor([1.0000, 1.0000]))

def net(X):
    # W.shape[0] = 784 and batch_size = 256, so the reshaped X has size 256*784;
    # note that reshape takes a single tuple as its argument
    return softmax(torch.matmul(X.reshape((-1, W.shape[0])), W) + b)

# toy data
y = torch.tensor([0, 2])  # the two true labels
y_hat = torch.tensor([[0.1, 0.3, 0.6], [0.3, 0.2, 0.5]])  # predicted probabilities
# fancy indexing: [0, 1] picks the rows (axis 0) and y picks the column within each row,
# i.e. y_hat[0, 0] and y_hat[1, 2]: the predicted probability of the true class of each sample
y_hat[[0, 1], y]

# cross-entropy loss
def cross_entropy(y_hat, y):
    # range(len(y_hat)) == range(y_hat.shape[0]), i.e. the row indices
    return -torch.log(y_hat[range(len(y_hat)), y])
cross_entropy(y_hat, y)  # y_hat is 2*3, y has length 2
# tensor([2.3026, 0.6931]): losses of sample 0 and sample 1

def accuracy(y_hat, y):  # count the number of correct predictions
    if len(y_hat.shape) > 1 and y_hat.shape[1] > 1:  # more than one dimension and more than one column
        y_hat = y_hat.argmax(axis=1)  # keep the index of the largest score in each row
    cmp = y_hat.type(y.dtype) == y  # cast y_hat to y's dtype; cmp is a tensor of True/False
    return float(cmp.type(y.dtype).sum())  # convert True/False to 1/0 and sum
accuracy(y_hat, y) / len(y)  # correct predictions divided by the number of samples: 0.5

# evaluate accuracy over a dataset
def evaluate_accuracy(net, data_iter):
    if isinstance(net, torch.nn.Module):  # if it is a torch.nn model, switch to evaluation mode
        net.eval()
    metric = Accumulator(2)  # number of correct predictions, number of predictions
    for X, y in data_iter:
        metric.add(accuracy(net(X), y), y.numel())  # y.numel() is the number of samples
    return metric[0] / metric[1]  # correctly classified samples / total samples

# accumulate sums over n variables (here two are created)
class Accumulator:
    def __init__(self, n):
        self.data = [0.0] * n

    def add(self, *args):
        self.data = [a + float(b) for a, b in zip(self.data, args)]

    def reset(self):
        self.data = [0.0] * len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

evaluate_accuracy(net, test_iter)  # about 0.118; the exact value is random since the weights are untrained

def train_epoch_ch3(net, train_iter, loss, updater):
    if isinstance(net, torch.nn.Module):
        net.train()
    metric = Accumulator(3)  # accumulator of length 3: total loss, number correct, number of samples
    for X, y in train_iter:
        y_hat = net(X)
        l = loss(y_hat, y)
        if isinstance(updater, torch.optim.Optimizer):
            updater.zero_grad()
            l.backward()
            updater.step()
            metric.add(float(l) * len(y), accuracy(y_hat, y), y.size().numel())
        else:
            l.sum().backward()  # a custom loss returns a vector, so sum before backward
            updater(X.shape[0])  # update using the batch size
            metric.add(float(l.sum()), accuracy(y_hat, y), y.numel())
    return metric[0] / metric[2], metric[1] / metric[2]  # average loss, accuracy

class Animator:
    """Plot data in animation."""
    def __init__(self, xlabel=None, ylabel=None, legend=None, xlim=None,
                 ylim=None, xscale='linear', yscale='linear',
                 fmts=('-', 'm--', 'g-.', 'r:'), nrows=1, ncols=1,
                 figsize=(3.5, 2.5)):
        if legend is None:
            legend = []
        d2l.use_svg_display()
        self.fig, self.axes = d2l.plt.subplots(nrows, ncols, figsize=figsize)
        if nrows * ncols == 1:
            self.axes = [self.axes, ]
        self.config_axes = lambda: d2l.set_axes(self.axes[0], xlabel, ylabel,
                                                xlim, ylim, xscale, yscale, legend)
        self.X, self.Y, self.fmts = None, None, fmts

    def add(self, x, y):
        if not hasattr(y, "__len__"):
            y = [y]
        n = len(y)
        if not hasattr(x, "__len__"):
            x = [x] * n
        if not self.X:
            self.X = [[] for _ in range(n)]
        if not self.Y:
            self.Y = [[] for _ in range(n)]
        for i, (a, b) in enumerate(zip(x, y)):
            if a is not None and b is not None:
                self.X[i].append(a)
                self.Y[i].append(b)
        self.axes[0].cla()
        for x, y, fmt in zip(self.X, self.Y, self.fmts):
            self.axes[0].plot(x, y, fmt)
        self.config_axes()
        display.display(self.fig)
        display.clear_output(wait=True)

def train_ch3(net, train_iter, test_iter, loss, num_epochs, updater):
    """Train a model (defined in Chapter 3)."""
    animator = Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0.3, 0.9],
                        legend=['train loss', 'train acc', 'test acc'])
    for epoch in range(num_epochs):
        train_metrics = train_epoch_ch3(net, train_iter, loss, updater)
        test_acc = evaluate_accuracy(net, test_iter)
        animator.add(epoch + 1, train_metrics + (test_acc,))
    train_loss, train_acc = train_metrics
    assert train_loss < 0.5, train_loss
    assert train_acc <= 1 and train_acc > 0.7, train_acc
    assert test_acc <= 1 and test_acc > 0.7, test_acc

lr = 0.1

def updater(batch_size):
    return d2l.sgd([W, b], lr, batch_size)  # minibatch SGD

num_epochs = 10
train_ch3(net, train_iter, test_iter, cross_entropy, num_epochs, updater)
The loss and accuracy plots are shown below:
def predict_ch3(net, test_iter, n=6):
    """Predict labels (defined in Chapter 3)."""
    for X, y in test_iter:
        break
    trues = d2l.get_fashion_mnist_labels(y)
    preds = d2l.get_fashion_mnist_labels(net(X).argmax(axis=1))
    titles = [true + '\n' + pred for true, pred in zip(trues, preds)]
    d2l.show_images(X[0:n].reshape((n, 28, 28)), 1, n, titles=titles[0:n])

predict_ch3(net, test_iter)
Classification predictions on sample images:
4. Implementation (concise version)
# Concise implementation
import torch
from torch import nn
from d2l import torch as d2l

batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

# Flatten keeps dimension 0 (the batch) and flattens everything else into a vector
net = nn.Sequential(nn.Flatten(), nn.Linear(784, 10))

def init_weights(m):  # m is the current layer
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, std=0.01)  # mean 0 (the default), standard deviation 0.01

net.apply(init_weights)  # apply this function to every layer of net

# CrossEntropyLoss takes the unnormalized predictions and computes softmax and its log internally
loss = nn.CrossEntropyLoss()
# minibatch stochastic gradient descent with learning rate 0.1
trainer = torch.optim.SGD(net.parameters(), lr=0.1)

num_epochs = 10
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, trainer)
The loss and accuracy results are shown below:
(III) Perceptron
1. The Perceptron
Given an input x, weights w, and a bias b, the perceptron outputs (see the sketch after this list):
The perceptron solves a binary classification problem (labels 1/0 or 1/-1); a prediction is wrong when the output and the label have opposite signs.
Loss function: when a sample is misclassified, the second term is positive, so the loss is non-zero and a gradient-descent update is triggered.
A perceptron cannot fit the XOR function; it can only produce a linear separating surface.
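As a sketch of the output and loss referred to above (following the lecture's formulation), the perceptron and its loss can be written as:

o = \sigma(\langle \mathbf{w}, \mathbf{x} \rangle + b), \quad
\sigma(x) = \begin{cases} 1 & x > 0 \\ -1 & \text{otherwise} \end{cases}, \qquad
\ell(y, \mathbf{x}, \mathbf{w}) = \max\left(0,\; -y\,\langle \mathbf{w}, \mathbf{x} \rangle\right)

The loss is zero for correctly classified samples, and the second term is positive exactly when prediction and label disagree in sign.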
2. Multilayer Perceptron
Taking the XOR function as an example, combining two perceptrons produces a multilayer perceptron. The hyperparameters are the number of hidden layers and the size of each hidden layer.
Commonly used activation functions:
(1) Sigmoid
(2) Tanh
(3) ReLU
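The formula images were lost; for reference, the standard definitions of the three activations are:

\mathrm{sigmoid}(x) = \frac{1}{1 + \exp(-x)}, \qquad
\tanh(x) = \frac{1 - \exp(-2x)}{1 + \exp(-2x)}, \qquad
\mathrm{ReLU}(x) = \max(x, 0)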
3. Implementation
# Multilayer perceptron
import torch
from torch import nn
from d2l import torch as d2l

batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

num_inputs, num_outputs, num_hiddens = 784, 10, 256  # 256 hidden units
# an MLP with a single hidden layer
W1 = nn.Parameter(torch.randn(num_inputs, num_hiddens, requires_grad=True) * 0.01)
b1 = nn.Parameter(torch.zeros(num_hiddens, requires_grad=True))  # bias initialized to 0
W2 = nn.Parameter(torch.randn(num_hiddens, num_outputs, requires_grad=True) * 0.01)
b2 = nn.Parameter(torch.zeros(num_outputs, requires_grad=True))

params = [W1, b1, W2, b2]  # parameters of the first and second layers

# ReLU activation
def relu(X):
    a = torch.zeros_like(X)  # zeros with the same dtype and shape as X
    return torch.max(X, a)

# the network
def net(X):
    X = X.reshape((-1, num_inputs))  # flatten each 28*28 image into a 784-dim vector
    H = relu(X @ W1 + b1)  # @ is matrix multiplication
    return (H @ W2 + b2)

# cross-entropy loss
loss = nn.CrossEntropyLoss()

# training
num_epochs, lr = 10, 0.1
updater = torch.optim.SGD(params, lr=lr)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, updater)
The results are shown below:
# Concise implementation
import torch
from torch import nn
from d2l import torch as d2l

# same as softmax regression, plus a hidden layer with a ReLU activation
net = nn.Sequential(nn.Flatten(), nn.Linear(784, 256), nn.ReLU(), nn.Linear(256, 10))

def init_weights(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, std=0.01)

net.apply(init_weights)

batch_size, lr, num_epochs = 256, 0.1, 10
loss = nn.CrossEntropyLoss()
trainer = torch.optim.SGD(net.parameters(), lr=lr)

train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, trainer)
The results are shown below:
(IV) Model Selection
1. Training error: the model's error on the training data.
Generalization error: the model's error on new data.
Validation set: a dataset used to evaluate how good a model is.
Test set: a dataset that is used only once.
K-fold cross-validation (used when data is scarce): split the training data into K folds, take fold i as the validation set in turn with the rest as training data, and report the average of the K validation errors (see the sketch below).
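A minimal sketch of the K-fold split on plain tensors X, y; the helper names here (get_k_fold_data, train_and_evaluate) are illustrative and not taken from the notes:

import torch

def get_k_fold_data(k, i, X, y):
    # Return (X_train, y_train, X_valid, y_valid) for fold i of a K-fold split.
    assert k > 1
    fold_size = X.shape[0] // k
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:                                  # fold i serves as the validation set
            X_valid, y_valid = X_part, y_part
        elif X_train is None:                       # first training fold
            X_train, y_train = X_part, y_part
        else:                                       # concatenate the remaining training folds
            X_train = torch.cat([X_train, X_part], 0)
            y_train = torch.cat([y_train, y_part], 0)
    return X_train, y_train, X_valid, y_valid

# usage sketch: average the K validation errors of some train_and_evaluate(...) routine
# errors = [train_and_evaluate(*get_k_fold_data(5, i, X, y)) for i in range(5)]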
2. Overfitting and Underfitting
Model capacity: the ability to fit a wide variety of functions.
A low-capacity model struggles to fit the training data; a high-capacity model can memorize all of the training data.
The VC dimension characterizes the gap between the training error and the generalization error.
3. Simulation in Code
import math
import numpy as np
import torch
from torch import nn
# !pip install d2l
from d2l import torch as d2l

max_degree = 20  # 20 polynomial features
n_train, n_test = 100, 100
true_w = np.zeros(max_degree)
true_w[0:4] = np.array([5, 1.2, -3.4, 5.6])  # the remaining coefficients stay 0, i.e. they are noise terms

features = np.random.normal(size=(n_train + n_test, 1))
np.random.shuffle(features)  # shuffle the samples
# raise the features to the powers 0 .. max_degree-1
poly_features = np.power(features, np.arange(max_degree).reshape(1, -1))
for i in range(max_degree):
    poly_features[:, i] /= math.gamma(i + 1)  # gamma(i + 1) = i!
labels = np.dot(poly_features, true_w)  # matrix-vector product
labels += np.random.normal(scale=0.1, size=labels.shape)

true_w, features, poly_features, labels = [
    torch.tensor(x, dtype=torch.float32)
    for x in [true_w, features, poly_features, labels]]
features[:2], poly_features[:2, :], labels[:2]

def evaluate_loss(net, data_iter, loss):
    """Evaluate the loss of a model on the given dataset."""
    metric = d2l.Accumulator(2)
    for X, y in data_iter:
        out = net(X)
        y = y.reshape(out.shape)
        l = loss(out, y)
        metric.add(l.sum(), l.numel())
    return metric[0] / metric[1]

def train(train_features, test_features, train_labels, test_labels, num_epochs=400):
    loss = nn.MSELoss()
    input_shape = train_features.shape[-1]
    net = nn.Sequential(nn.Linear(input_shape, 1, bias=False))
    batch_size = min(10, train_labels.shape[0])
    train_iter = d2l.load_array((train_features, train_labels.reshape(-1, 1)), batch_size)
    test_iter = d2l.load_array((test_features, test_labels.reshape(-1, 1)), batch_size,
                               is_train=False)
    trainer = torch.optim.SGD(net.parameters(), lr=0.01)
    animator = d2l.Animator(xlabel='epoch', ylabel='loss', yscale='log',
                            xlim=[1, num_epochs], ylim=[1e-3, 1e2],
                            legend=['train', 'test'])
    for epoch in range(num_epochs):
        d2l.train_epoch_ch3(net, train_iter, loss, trainer)
        if epoch == 0 or (epoch + 1) % 20 == 0:
            animator.add(epoch + 1, (evaluate_loss(net, train_iter, loss),
                                     evaluate_loss(net, test_iter, loss)))
    print('weight:', net[0].weight.data.numpy())

# normal fit: use the first 4 polynomial features, matching the true model
train(poly_features[:n_train, :4], poly_features[n_train:, :4],
      labels[:n_train], labels[n_train:])
# underfitting: use only the first 2 features
train(poly_features[:n_train, :2], poly_features[n_train:, :2],
      labels[:n_train], labels[n_train:])
# overfitting: use all features, including the noise terms
train(poly_features[:n_train, :], poly_features[n_train:, :],
      labels[:n_train], labels[n_train:], num_epochs=1500)
Training and test results:
Underfitting results:
Overfitting results:
(V) Weight Decay: a Common Technique for Handling Overfitting
1. Regularization Penalty
Squared-norm penalty: control model capacity by restricting the range of values the parameters may take; the bias b is usually left unrestricted.
Understanding weight decay (sketched below):
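The derivation image is missing; as a sketch of standard L2 regularization as presented in the lecture, the penalized objective and the resulting SGD update are:

\min_{\mathbf{w}, b}\ \ell(\mathbf{w}, b) + \frac{\lambda}{2}\,\|\mathbf{w}\|^2, \qquad
\mathbf{w}_{t+1} = (1 - \eta\lambda)\,\mathbf{w}_t - \eta\,\frac{\partial \ell}{\partial \mathbf{w}_t}

Because each weight is first multiplied by (1 - \eta\lambda) < 1 before the gradient step, the method is called weight decay; \lambda is the hyperparameter controlling how strongly the model is regularized.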
2. Implementation
%matplotlib inline
import torch
from torch import nn
from d2l import torch as d2l

n_train, n_test, num_inputs, batch_size = 20, 100, 200, 5
true_w, true_b = torch.ones((num_inputs, 1)) * 0.01, 0.05
train_data = d2l.synthetic_data(true_w, true_b, n_train)
train_iter = d2l.load_array(train_data, batch_size)
test_data = d2l.synthetic_data(true_w, true_b, n_test)
test_iter = d2l.load_array(test_data, batch_size, is_train=False)

# initialize the model parameters
def init_params():
    w = torch.normal(0, 1, size=(num_inputs, 1), requires_grad=True)
    b = torch.zeros(1, requires_grad=True)
    return [w, b]

# L2 penalty
def l2_penalty(w):
    return torch.sum(w.pow(2)) / 2

def train(lambd):
    w, b = init_params()
    net, loss = lambda X: d2l.linreg(X, w, b), d2l.squared_loss  # linear regression, squared loss
    num_epochs, lr = 100, 0.003
    animator = d2l.Animator(xlabel='epochs', ylabel='loss', yscale='log',
                            xlim=[5, num_epochs], legend=['train', 'test'])
    for epoch in range(num_epochs):
        for X, y in train_iter:
            # with torch.enable_grad():
            l = loss(net(X), y) + lambd * l2_penalty(w)
            l.sum().backward()
            d2l.sgd([w, b], lr, batch_size)
        if (epoch + 1) % 5 == 0:
            animator.add(epoch + 1, (d2l.evaluate_loss(net, train_iter, loss),
                                     d2l.evaluate_loss(net, test_iter, loss)))
    print('L2 norm of w:', torch.norm(w).item())

train(lambd=0)
train(lambd=3)
When lambd is 0, the regularization term is ignored and the model overfits:
3. Dropout: inject noise between layers by randomly zeroing activations to avoid overfitting. It is usually applied to the outputs of hidden fully connected layers, and the dropout probability is a hyperparameter that controls model complexity.
Dropout acts as a regularization term, and regularization is only applied during training.
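Concretely, in the standard (inverted) dropout used by the dropout_layer implementation below, each activation h is transformed as:

h' = \begin{cases} 0 & \text{with probability } p \\ \dfrac{h}{1 - p} & \text{otherwise} \end{cases}
\qquad \Rightarrow \qquad \mathbb{E}[h'] = h

so the expected value of each activation is unchanged, which is why dropout can simply be switched off at test time without rescaling.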
4. Implementation
import torch
from torch import nn
from d2l import torch as d2l

def dropout_layer(X, dropout):
    assert 0 <= dropout <= 1  # dropout must be a probability
    if dropout == 1:
        return torch.zeros_like(X)  # if dropout == 1, everything is dropped
    if dropout == 0:
        return X  # if dropout == 0, X is returned unchanged
    # mask randomly selects which entries of X to keep; torch.rand draws from U[0, 1)
    # (the original note used torch.randn here, but a uniform draw is needed so that
    #  each entry is dropped with probability `dropout`)
    mask = (torch.rand(X.shape) > dropout).float()
    return mask * X / (1.0 - dropout)

X = torch.arange(16, dtype=torch.float32).reshape((2, 8))
print(X)
print(dropout_layer(X, 0.))
print(dropout_layer(X, 0.5))
print(dropout_layer(X, 1.))
# tensor([[ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.],
#         [ 8.,  9., 10., 11., 12., 13., 14., 15.]])
# tensor([[ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.],
#         [ 8.,  9., 10., 11., 12., 13., 14., 15.]])
# tensor([[ 0.,  0.,  0.,  6.,  0.,  0.,  0.,  0.],
#         [16., 18.,  0.,  0., 24.,  0.,  0.,  0.]])
# tensor([[0., 0., 0., 0., 0., 0., 0., 0.],
#         [0., 0., 0., 0., 0., 0., 0., 0.]])

# an MLP with two hidden layers of 256 units each
num_inputs, num_outputs, num_hiddens1, num_hiddens2 = 784, 10, 256, 256
dropout1, dropout2 = 0.2, 0.5

class Net(nn.Module):
    def __init__(self, num_inputs, num_outputs, num_hiddens1, num_hiddens2,
                 is_training=True):
        super(Net, self).__init__()
        self.num_inputs = num_inputs
        self.training = is_training
        self.lin1 = nn.Linear(num_inputs, num_hiddens1)
        self.lin2 = nn.Linear(num_hiddens1, num_hiddens2)
        self.lin3 = nn.Linear(num_hiddens2, num_outputs)
        self.relu = nn.ReLU()

    def forward(self, X):
        H1 = self.relu(self.lin1(X.reshape((-1, self.num_inputs))))  # first hidden layer
        if self.training == True:
            H1 = dropout_layer(H1, dropout1)  # apply dropout only during training
        H2 = self.relu(self.lin2(H1))  # second hidden layer
        if self.training == True:
            H2 = dropout_layer(H2, dropout2)
        out = self.lin3(H2)  # output layer
        return out

net = Net(num_inputs, num_outputs, num_hiddens1, num_hiddens2)

num_epochs, lr, batch_size = 10, 0.5, 256
loss = nn.CrossEntropyLoss()
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
trainer = torch.optim.SGD(net.parameters(), lr=lr)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, trainer)
(VI) Numerical Stability
1. Two common numerical-stability problems: exploding gradients and vanishing gradients.
Exploding gradients (which ReLU makes more likely): values exceed the representable range, and training becomes very sensitive to the learning rate.
Vanishing gradients (which sigmoid makes more likely): gradient values go to 0, training makes no progress, and the bottom (earliest) layers are affected the most.
2. Model Initialization
To keep gradients in a reasonable range, we can turn multiplications into additions (ResNet, LSTM), use normalization, or choose a sensible weight initialization and activation function.
Xavier initialization (sketched below):
The first condition keeps the variance of the forward activations the same across layers; the second keeps the variance of the gradients the same.
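The formula images are missing; as a sketch of the two conditions and the standard Xavier compromise, let layer t have n_{t-1} inputs, n_t outputs, and weight variance \gamma_t:

n_{t-1}\,\gamma_t = 1 \ (\text{forward}), \qquad n_t\,\gamma_t = 1 \ (\text{backward}), \qquad
\gamma_t = \frac{2}{n_{t-1} + n_t}

Since both conditions can rarely hold at the same time, Xavier takes their compromise, e.g. normal initialization N(0, \sqrt{2/(n_{t-1}+n_t)}) or uniform initialization U(-\sqrt{6/(n_{t-1}+n_t)}, \sqrt{6/(n_{t-1}+n_t)}).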
Activation functions: near x = 0, an activation should behave like the identity (its Taylor expansion should be approximately f(x) ≈ x); sigmoid does not satisfy this and needs to be adjusted, e.g. scaled and shifted to 4*sigmoid(x) - 2.
II. Questions and Takeaways
1. Question: why do the loss curves of the from-scratch version and the concise version start from different values at the initial point (epoch 0)?
2. Question: after applying dropout, the loss increases slightly and the test accuracy fluctuates.
3. Takeaways
I learned a lot of new material and am gradually building up hands-on coding skills, and the QA sessions resolved most of my doubts. The numerical stability module was really hard; with my weak math background I found it quite confusing. Today really was a high-quality-coder kind of day.