【动手学深度学习】深度学习基础

深度学习基础

本文为李沐老师《动手学深度学习》一书的学习笔记,原书地址为:Dive into Deep Learning

当模型和损失函数形式较为简单时,上面的误差最小化问题的解可以直接用公式表达出来。这类解叫作解析解(analytical solution)。本节使用的线性回归和平方误差刚好属于这个范畴。然而,大多数深度学习模型并没有解析解,只能通过优化算法有限次迭代模型参数来尽可能降低损失函数的值。这类解叫作数值解(numerical solution)。

1 线性回归

1.1 线性回归从零开始实现

生成数据集
%matplotlib inline
import random

import numpy as np
import torch
from IPython import display
from matplotlib import pyplot as plt
from torch import nn
# Synthetic linear-regression data: labels = features · true_w + true_b + noise
num_inputs, num_examples = 2, 1000
true_w, true_b = [2, -3.4], 4.2
features = torch.randn(num_examples, num_inputs, dtype=torch.float32)
# Noise-free targets, built column by column from the ground-truth parameters
labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
# Add Gaussian noise with mean 0 and standard deviation 0.01
labels += torch.tensor(np.random.normal(0, 0.01, size=labels.size()),
                       dtype=torch.float32)
# Each row of features is a length-2 vector; each entry of labels is a scalar.
print(features[0], labels[0])# tensor([ 0.7575, -0.3951]) tensor(7.0511)
def use_svg_display():
    """Ask IPython to render matplotlib figures as SVG (vector graphics)."""
    # Display figures as vector graphics
    display.set_matplotlib_formats('svg')

def set_figsize(figsize=(3.5, 2.5)):
    """Switch to SVG output and set matplotlib's default figure size.

    Args:
        figsize: value stored in ``plt.rcParams['figure.figsize']``.
    """
    use_svg_display()
    # Set the figure size
    plt.rcParams['figure.figsize'] = figsize

set_figsize()
# Scatter the second feature against the labels (third argument: marker size 1)
plt.scatter(features[:, 1].numpy(), labels.numpy(), 1);

在这里插入图片描述

读取数据
# This function is also saved in the d2lzh package for later reuse
def data_iter(batch_size, features, labels):
    """Yield shuffled mini-batches of ``(features, labels)``.

    The examples are visited once, in a random order drawn with
    ``random.shuffle``. Every batch holds ``batch_size`` examples except
    possibly the last one, which holds whatever is left over.

    Args:
        batch_size: number of examples per batch.
        features: tensor indexed along dimension 0 by example.
        labels: tensor indexed along dimension 0 by example.

    Yields:
        ``(features_batch, labels_batch)`` tensors selected along dimension 0.
    """
    total = len(features)
    order = list(range(total))
    random.shuffle(order)  # read the examples in random order
    for start in range(0, total, batch_size):
        # Slicing clamps at the end of the list, so the final batch may be short
        stop = start + batch_size
        batch_idx = torch.LongTensor(order[start:stop])
        # index_select(dim, index): dim 0 selects rows (1 would select columns);
        # the index argument must itself be a tensor.
        batch_features = features.index_select(0, batch_idx)
        batch_labels = labels.index_select(0, batch_idx)
        yield batch_features, batch_labels
batch_size = 10
# Read and print only the first mini-batch
for X, y in data_iter(batch_size, features, labels):
    print(X, y)
    break
tensor([[-0.8556, -2.1711],
        [-0.6850,  0.2088],
        [-1.1801,  0.5113],
        [ 0.5896, -0.8895],
        [-0.8439,  1.4162],
        [-0.9828, -1.4133],
        [ 0.5438, -0.0274],
        [ 0.7474, -0.0838],
        [ 1.5627, -1.7261],
        [ 0.5781, -0.5606]]) tensor([ 9.8860,  2.1143,  0.0969,  8.4261, -2.2944,  7.0516,  5.3821,  5.9722,
        13.1892,  7.2460])
初始化模型参数
# Initialise the weights as normal random numbers with mean 0 and standard
# deviation 0.01; the bias is initialised to 0.
# np.random.normal() takes the mean, the standard deviation and the output shape.
w = torch.tensor(np.random.normal(0, 0.01, (num_inputs, 1)), dtype=torch.float32)
b = torch.zeros(1, dtype=torch.float32)
# Ask autograd to track gradients for both parameters
w.requires_grad_(requires_grad=True)
b.requires_grad_(requires_grad=True)
定义模型
def linreg(X, w, b):
    """Linear regression model: the matrix product of ``X`` and ``w`` plus ``b``.

    ``X`` and ``w`` must both be 2-D (``torch.mm`` semantics); ``b`` is added
    with broadcasting.
    """
    projection = X.mm(w)
    return projection + b
定义损失函数
def squared_loss(y_hat, y):
    """Halved squared error between predictions and targets, per element.

    ``y`` is viewed with the shape of ``y_hat`` before the subtraction. The
    result is not reduced: it has the same shape as ``y_hat``.
    """
    # Note: PyTorch's MSELoss does not divide by 2
    residual = y_hat - y.view(y_hat.size())
    return residual ** 2 / 2

定义优化算法
def sgd(params, lr, batch_size):
    """Apply one mini-batch stochastic gradient descent step in place.

    Every parameter is updated as ``param -= lr * param.grad / batch_size``.
    The division by ``batch_size`` turns the gradient of a batch-summed loss
    into a per-example average. Gradients are not zeroed here.

    Args:
        params: iterable of tensors to update.
        lr: learning rate.
        batch_size: number of examples the gradient was accumulated over.
    """
    # Update under no_grad so the step itself is not recorded by autograd;
    # this replaces writing through the legacy ``.data`` attribute.
    with torch.no_grad():
        for param in params:
            if param.grad is None:
                # This parameter took no part in the backward pass
                continue
            param -= lr * param.grad / batch_size
训练模型
lr = 0.03
num_epochs = 3
net = linreg
loss = squared_loss

for epoch in range(num_epochs):  # training runs for num_epochs epochs in total
    # In every epoch each training example is used once (assuming the number
    # of examples is divisible by the batch size). X and y are the features
    # and labels of one mini-batch.
    for X, y in data_iter(batch_size, features, labels):
        l = loss(net(X, w, b), y).sum()  # l is the loss summed over the mini-batch X, y
        l.backward()  # gradients of the mini-batch loss w.r.t. the model parameters
        sgd([w, b], lr, batch_size)  # update the parameters with mini-batch SGD

        # Gradients accumulate across backward() calls, so reset them
        w.grad.data.zero_()
        b.grad.data.zero_()
    # Mean per-example loss over the whole training set after this epoch
    train_l = loss(net(features, w, b), labels)
    print('epoch %d, loss %f' % (epoch + 1, train_l.mean().item()))
# Compare the learned parameters with the ground truth
print(true_w, '\n', w)
print(true_b, '\n', b)
epoch 1, loss 0.000055
epoch 2, loss 0.000055
epoch 3, loss 0.000055
[2, -3.4] 
 tensor([[ 2.0004],
        [-3.3997]], requires_grad=True)
4.2 
 tensor([4.2002], requires_grad=True)

1.2 线性回归的简洁实现

生成数据集
import torch
import numpy as np

num_inputs, num_examples = 2, 1000
true_w, true_b = [2, -3.4], 4.2

# Standard-normal features drawn with NumPy and converted to a float tensor
features = torch.tensor(
    np.random.normal(0, 1, (num_examples, num_inputs)),
    dtype=torch.float,
)
labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
# Gaussian noise with mean 0 and standard deviation 0.01
labels += torch.tensor(
    np.random.normal(0, 0.01, size=labels.size()),
    dtype=torch.float,
)
读取数据
import torch.utils.data as Data

batch_size = 10
# Combine the training features and labels into one dataset
dataset = Data.TensorDataset(features, labels)
# Read random mini-batches.
# NOTE: this rebinds the name data_iter, shadowing the generator function
# defined in the from-scratch section.
data_iter = Data.DataLoader(dataset, batch_size, shuffle=True)

# Print only the first mini-batch
for X, y in data_iter:
    print(X, y)
    break
tensor([[-1.1333,  1.1416],
        [ 0.7153, -0.6338],
        [ 0.6727,  0.9187],
        [-1.2335,  0.0385],
        [-0.4724, -1.2428],
        [-0.8065, -0.1312],
        [-0.2810,  0.2928],
        [ 1.3142,  1.6902],
        [ 0.0569,  0.6979],
        [ 2.2793, -0.8127]]) tensor([-1.9485,  7.7879,  2.4382,  1.5958,  7.4905,  3.0275,  2.6336,  1.0977,
         1.9372, 11.5272])
定义模型
class LinearNet(nn.Module):
    """Linear regression as a module: one fully connected layer with a single output."""

    def __init__(self, n_feature):
        super().__init__()
        self.linear = nn.Linear(n_feature, 1)

    def forward(self, x):
        """Forward pass: apply the linear layer to ``x``."""
        return self.linear(x)

net = LinearNet(num_inputs)
print(net,"\n") # print() on a module shows the network structure

# List the learnable parameters
for param in net.parameters():
    print(param)
LinearNet(
  (linear): Linear(in_features=2, out_features=1, bias=True)
) 

Parameter containing:
tensor([[-0.2423, -0.3167]], requires_grad=True)
Parameter containing:
tensor([0.0139], requires_grad=True)

模型的其他定义方法:
用nn.Sequential来更加方便地搭建网络,Sequential是一个有序的容器,网络层将按照在传入Sequential的顺序依次被添加到计算图中。

# Option 1: pass the layers directly
net = nn.Sequential(
    nn.Linear(num_inputs, 1)
    # further layers can be passed here
    )

# Option 2: start from an empty container and register named layers
net = nn.Sequential()
net.add_module('linear', nn.Linear(num_inputs, 1))
# net.add_module ......

# Option 3: pass an OrderedDict of named layers
from collections import OrderedDict
net = nn.Sequential(OrderedDict([
          ('linear', nn.Linear(num_inputs, 1))
          # ......
        ]))

# Show the whole container, then its first layer by index
print(net)
print(net[0])

初始化模型参数
from torch.nn import init

# Weights: normal with mean 0 and standard deviation 0.01; bias: constant 0
init.normal_(net.linear.weight, mean=0, std=0.01)
init.constant_(net.linear.bias, val=0)  # the bias data can also be edited directly: net[0].bias.data.fill_(0)
Parameter containing:
tensor([0.], requires_grad=True)
定义损失函数
# Mean squared error; with its default settings it averages over the elements
# and, unlike squared_loss in the from-scratch section, does not divide by 2
loss = nn.MSELoss()
定义优化算法
import torch.optim as optim # torch.optim模块提供了很多常用的优化算法比如SGD、Adam和RMSProp等。
# Create an optimizer instance over all parameters of net, using mini-batch
# stochastic gradient descent (SGD) with a learning rate of 0.03.
optimizer = optim.SGD(net.parameters(), lr=0.03)
print(optimizer)
SGD (
Parameter Group 0
    dampening: 0
    lr: 0.03
    momentum: 0
    nesterov: False
    weight_decay: 0
)
训练模型
num_epochs = 3
for epoch in range(1, num_epochs + 1):
    for X, y in data_iter:
        output = net(X)
        # y is viewed as a column so that it lines up with the model output
        l = loss(output, y.view(-1, 1))
        optimizer.zero_grad() # reset gradients; here equivalent to net.zero_grad()
        l.backward()
        optimizer.step()
    # l is the loss of the last mini-batch of this epoch
    print('epoch %d, loss: %f' % (epoch, l.item()))
# Compare the learned parameters with the ground truth
dense = net.linear
print(true_w, dense.weight, "\n")
print(true_b, dense.bias, "\n")
epoch 1, loss: 0.000574
epoch 2, loss: 0.000204
epoch 3, loss: 0.000053
[2, -3.4] Parameter containing:
tensor([[ 2.0003, -3.3994]], requires_grad=True) 

4.2 Parameter containing:
tensor([4.1993], requires_grad=True) 

2 softmax回归

2.1 softmax回归的从零开始实现

获取数据集
import torch
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import time
import sys
sys.path.append("..") # so that d2lzh_pytorch can be imported from the parent directory
# Fashion-MNIST training and test splits, downloaded on first use;
# transforms.ToTensor() is applied to every image
mnist_train = torchvision.datasets.FashionMNIST(root='~/Datasets/FashionMNIST', train=True, download=True, transform=transforms.ToTensor())
mnist_test = torchvision.datasets.FashionMNIST(root='~/Datasets/FashionMNIST', train=False, download=True, transform=transforms.ToTensor())
print(type(mnist_train))# <class 'torchvision.datasets.mnist.FashionMNIST'>
print(len(mnist_train), len(mnist_test))# 60000 10000
# Each sample is an (image, label) pair
feature, label = mnist_train[0]
print(feature.shape, label) # torch.Size([1, 28, 28]) 9
def get_fashion_mnist_labels(labels):
    """Convert numeric Fashion-MNIST labels to their text labels.

    Args:
        labels: iterable of class ids; each is passed through ``int()``.

    Returns:
        List of label names, in the same order as ``labels``.
    """
    text_labels = [
        't-shirt', 'trouser', 'pullover', 'dress', 'coat',
        'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot',
    ]
    names = []
    for class_id in labels:
        names.append(text_labels[int(class_id)])
    return names

def use_svg_display():    # display figures as vector graphics
    """Ask IPython to render matplotlib figures as SVG."""
    display.set_matplotlib_formats('svg')

def show_fashion_mnist(images, labels):
    """Draw several images side by side in one row, each titled with its label.

    Args:
        images: sequence of image tensors; each is viewed as 28x28 for drawing.
        labels: sequence of titles, paired with ``images`` by position.
    """
    use_svg_display()
    # squeeze=False keeps the axes in a 2-D array even when there is only one
    # image, so the single row can always be iterated. `_` is the figure
    # object, which is not used.
    _, figs = plt.subplots(1, len(images), squeeze=False, figsize=(12, 12))
    for f, img, lbl in zip(figs[0], images, labels):
        f.imshow(img.view((28, 28)).numpy())
        f.set_title(lbl)
        # Hide both axes
        f.axes.get_xaxis().set_visible(False)
        f.axes.get_yaxis().set_visible(False)
    plt.show()

# Collect the first ten training images (X) and their numeric labels (y)
X, y = [], []
for i in range(10):
    X.append(mnist_train[i][0])
    y.append(mnist_train[i][1])
# Plot them with their text labels as titles
show_fashion_mnist(X, get_fashion_mnist_labels(y))

在这里插入图片描述

def load_data_fashion_mnist(batch_size):
    """Build the training and test DataLoaders for Fashion-MNIST.

    Reads the module-level ``mnist_train`` and ``mnist_test`` datasets. The
    training loader shuffles; the test loader does not.

    Args:
        batch_size: number of samples per batch for both loaders.

    Returns:
        ``(train_iter, test_iter)``.
    """
    # 0 means no extra processes are used to speed up data loading (Windows)
    num_workers = 0 if sys.platform.startswith('win') else 4
    shared = dict(batch_size=batch_size, num_workers=num_workers)
    train_iter = torch.utils.data.DataLoader(mnist_train, shuffle=True, **shared)
    test_iter = torch.utils.data.DataLoader(mnist_test, shuffle=False, **shared)
    return train_iter, test_iter

batch_size = 256
# Create the loaders first: the timing loop below iterates over train_iter
train_iter, test_iter = load_data_fashion_mnist(batch_size)

# Time one full pass over the training set
start = time.time()
for X, y in train_iter:
    continue
print('%.2f sec' % (time.time() - start))# 10.30 sec
初始化模型参数
# Inputs are 28*28 values per image; there are 10 outputs
num_inputs, num_outputs = 28 * 28, 10

# Weights: normal with mean 0 and standard deviation 0.01; biases: zeros
W = torch.tensor(
    np.random.normal(0, 0.01, (num_inputs, num_outputs)),
    dtype=torch.float,
)
b = torch.zeros(num_outputs, dtype=torch.float)

# Both are learnable, so autograd must track their gradients
W.requires_grad_(True)
b.requires_grad_(True)
实现softmax运算
def softmax(X):
    """Row-wise softmax: turn each row into a positive distribution summing to 1.

    Args:
        X: 2-D tensor of scores, one row per example.

    Returns:
        Tensor of the same shape as ``X``.
    """
    # Subtracting the row maximum leaves the result unchanged mathematically
    # but keeps exp() from overflowing to inf (and the ratio from becoming
    # nan) when the scores are large.
    shifted = X - X.max(dim=1, keepdim=True)[0]
    X_exp = shifted.exp()
    partition = X_exp.sum(dim=1, keepdim=True)
    return X_exp / partition  # broadcasting divides every row by its own sum

# Sanity check on random input: print the probabilities and the row sums
X = torch.rand((2, 5))
X_prob = softmax(X)
print(X_prob, X_prob.sum(dim=1))# tensor([[0.3089, 0.1828, 0.1810, 0.1556, 0.1716], [0.1133, 0.2303, 0.1293, 0.2722, 0.2549]]) tensor([1., 1.])
定义模型
def net(X):
    """Softmax regression model built on the module-level ``W`` and ``b``.

    ``X`` is viewed as rows of ``num_inputs`` values, multiplied by ``W``,
    shifted by ``b`` and passed through ``softmax``.
    """
    flat = X.view(-1, num_inputs)
    scores = torch.mm(flat, W) + b
    return softmax(scores)
定义损失函数
def cross_entropy(y_hat, y):
    """Negative log of the predicted probability of the true class.

    Args:
        y_hat: 2-D tensor of predicted probabilities, one row per example.
        y: integer class indices, one per row of ``y_hat``.

    Returns:
        Column tensor with one loss value per example (not reduced).
    """
    # gather picks, for every row, the entry in the column given by y
    true_class_prob = y_hat.gather(1, y.view(-1, 1))
    return -true_class_prob.log()
定义优化函数
def sgd(params, lr, batch_size):
    """Apply one mini-batch stochastic gradient descent step in place.

    Every parameter is updated as ``param -= lr * param.grad / batch_size``.
    The division by ``batch_size`` turns the gradient of a batch-summed loss
    into a per-example average. Gradients are not zeroed here.

    Args:
        params: iterable of tensors to update.
        lr: learning rate.
        batch_size: number of examples the gradient was accumulated over.
    """
    # Update under no_grad so the step itself is not recorded by autograd;
    # this replaces writing through the legacy ``.data`` attribute.
    with torch.no_grad():
        for param in params:
            if param.grad is None:
                # This parameter took no part in the backward pass
                continue
            param -= lr * param.grad / batch_size
计算分类准确率
def accuracy(y_hat, y):
    """Fraction of rows of ``y_hat`` whose arg-max column equals the label in ``y``.

    Args:
        y_hat: 2-D tensor of per-class scores, one row per example.
        y: integer class indices, one per row of ``y_hat``.

    Returns:
        The accuracy as a Python float.
    """
    return (y_hat.argmax(dim=1) == y).float().mean().item()

# Toy example: two samples, three classes, true classes 0 and 2
y_hat = torch.tensor([[0.1, 0.3, 0.6], [0.3, 0.2, 0.5]])
y = torch.LongTensor([0, 2])
print(y_hat.gather(1, y.view(-1, 1)))# tensor([[0.1000], [0.5000]])
print(cross_entropy(y_hat, y))# tensor([[2.3026], [0.6931]])
print(accuracy(y_hat, y))# 0.5
def evaluate_accuracy(data_iter, net):
    """Classification accuracy of ``net`` over every batch of ``data_iter``.

    Evaluation runs without autograd. If ``net`` is an ``nn.Module`` that is
    in training mode, it is switched to evaluation mode for the duration of
    the call (so layers such as dropout are inactive) and switched back
    afterwards.

    Args:
        data_iter: iterable of ``(X, y)`` batches; ``y`` holds class indices.
        net: callable mapping ``X`` to per-class scores, one row per example.

    Returns:
        Number of correct predictions divided by the number of examples.

    Raises:
        ZeroDivisionError: if ``data_iter`` yields no examples.
    """
    is_module = isinstance(net, torch.nn.Module)
    was_training = is_module and net.training
    if was_training:
        net.eval()
    acc_sum, n = 0.0, 0
    try:
        with torch.no_grad():  # no computation graph is needed to score predictions
            for X, y in data_iter:
                acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()
                n += y.shape[0]
    finally:
        if was_training:
            net.train()  # restore the caller's mode
    return acc_sum / n
# Test accuracy of the model before any training (sample output on the right)
print(evaluate_accuracy(test_iter, net)) #0.1302
模型训练
num_epochs, lr = 5, 0.1

def train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              params=None, lr=None, optimizer=None):
    """Train ``net`` and print loss and accuracy after every epoch.

    With an ``optimizer``, it zeroes the gradients and applies the step.
    Without one, the gradients of ``params`` are zeroed by hand and the
    parameters are updated by ``sgd(params, lr, batch_size)``.

    Args:
        net: callable mapping a batch ``X`` to per-class scores.
        train_iter: iterable of ``(X, y)`` training batches.
        test_iter: iterable passed to ``evaluate_accuracy`` after each epoch.
        loss: callable ``loss(y_hat, y)``; its result is summed with ``.sum()``.
        num_epochs: number of passes over ``train_iter``.
        batch_size: forwarded to ``sgd`` when no optimizer is given.
        params: parameters to update when no optimizer is given.
        lr: learning rate for ``sgd`` when no optimizer is given.
        optimizer: optional optimizer object.

    NOTE: the printed loss is the accumulated ``loss(...).sum()`` divided by
    the number of examples, so a ``loss`` that already averages over the
    batch is reported divided by the batch size once more.
    """
    use_optimizer = optimizer is not None
    for epoch_no in range(1, num_epochs + 1):
        loss_total, correct_total, seen = 0.0, 0.0, 0
        for X, y in train_iter:
            y_hat = net(X)
            batch_loss = loss(y_hat, y).sum()

            # Zero the gradients before the backward pass
            if use_optimizer:
                optimizer.zero_grad()
            elif params is not None and params[0].grad is not None:
                for p in params:
                    p.grad.data.zero_()

            batch_loss.backward()
            if use_optimizer:
                optimizer.step()  # used by the concise softmax-regression section
            else:
                sgd(params, lr, batch_size)

            loss_total += batch_loss.item()
            correct_total += (y_hat.argmax(dim=1) == y).sum().item()
            seen += y.shape[0]
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
              % (epoch_no, loss_total / seen, correct_total / seen, test_acc))

# Train the from-scratch model: no optimizer is passed, so train_ch3 updates
# [W, b] with the hand-written sgd. Sample output follows.
train_ch3(net, train_iter, test_iter, cross_entropy, num_epochs, batch_size, [W, b], lr)
# epoch 1, loss 0.7858, train acc 0.749, test acc 0.793
# epoch 2, loss 0.5722, train acc 0.813, test acc 0.811
# epoch 3, loss 0.5264, train acc 0.824, test acc 0.818
# epoch 4, loss 0.5019, train acc 0.832, test acc 0.826
# epoch 5, loss 0.4856, train acc 0.836, test acc 0.823
# Take the first test batch. The built-in next() works on every iterator,
# whereas a .next() method is not part of the iterator protocol.
X, y = next(iter(test_iter))

true_labels = get_fashion_mnist_labels(y.numpy())
pred_labels = get_fashion_mnist_labels(net(X).argmax(dim=1).numpy())
# Title each image with the true label on the first line and the prediction below it
titles = [true + '\n' + pred for true, pred in zip(true_labels, pred_labels)]

show_fashion_mnist(X[0:9], titles[0:9])

在这里插入图片描述

2.2 softmax回归的简洁实现

获取和读取数据
batch_size = 256
# Number of loader worker processes: 0 (load in the main process) on Windows,
# 4 elsewhere. This name must exist at module level for the loaders below.
num_workers = 0 if sys.platform.startswith('win') else 4

mnist_train = torchvision.datasets.FashionMNIST(root='~/Datasets/FashionMNIST', train=True, download=True, transform=transforms.ToTensor())
mnist_test = torchvision.datasets.FashionMNIST(root='~/Datasets/FashionMNIST', train=False, download=True, transform=transforms.ToTensor())

train_iter = torch.utils.data.DataLoader(mnist_train, batch_size=batch_size, shuffle=True, num_workers=num_workers)
test_iter = torch.utils.data.DataLoader(mnist_test, batch_size=batch_size, shuffle=False, num_workers=num_workers)
定义和初始化模型
num_inputs, num_outputs = 28 * 28, 10


class LinearNet(nn.Module):
    """Softmax-regression model: flatten each input, then apply one linear layer."""

    def __init__(self, num_inputs, num_outputs):
        super().__init__()
        self.linear = nn.Linear(num_inputs, num_outputs)

    def forward(self, x):
        # x shape: (batch, 1, 28, 28); keep the batch axis and flatten the rest
        flat = x.view(x.shape[0], -1)
        return self.linear(flat)


net = LinearNet(num_inputs, num_outputs)
class FlattenLayer(nn.Module):
    """Shape-conversion layer: keep the first dimension and flatten all the others."""

    def forward(self, x):
        # x shape: (batch, *, *, ...) -> (batch, product of the remaining dims)
        batch = x.shape[0]
        return x.view(batch, -1)
from collections import OrderedDict

# Named layers via an OrderedDict, so the linear layer is reachable as net.linear.
# The commented-out lines show the equivalent unnamed form.
net = nn.Sequential(
    # FlattenLayer(),
    # nn.Linear(num_inputs, num_outputs)
    OrderedDict([
        ('flatten', FlattenLayer()),
        ('linear', nn.Linear(num_inputs, num_outputs))
    ])
)
# Weights: normal with mean 0 and standard deviation 0.01; bias: constant 0
init.normal_(net.linear.weight, mean=0, std=0.01)
init.constant_(net.linear.bias, val=0)
softmax和交叉熵损失函数
# CrossEntropyLoss takes raw scores and applies the softmax itself, which is
# why the network above ends with a plain linear layer
loss = nn.CrossEntropyLoss()
定义优化算法
# Mini-batch SGD over all parameters of net, learning rate 0.1
optimizer = torch.optim.SGD(net.parameters(), lr=0.1)
训练模型
num_epochs = 5

def train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              params=None, lr=None, optimizer=None):
    """Train ``net`` and print loss and accuracy after every epoch.

    With an ``optimizer``, it zeroes the gradients and applies the step.
    Without one, the gradients of ``params`` are zeroed by hand and the
    parameters are updated by ``sgd(params, lr, batch_size)``.

    Args:
        net: callable mapping a batch ``X`` to per-class scores.
        train_iter: iterable of ``(X, y)`` training batches.
        test_iter: iterable passed to ``evaluate_accuracy`` after each epoch.
        loss: callable ``loss(y_hat, y)``; its result is summed with ``.sum()``.
        num_epochs: number of passes over ``train_iter``.
        batch_size: forwarded to ``sgd`` when no optimizer is given.
        params: parameters to update when no optimizer is given.
        lr: learning rate for ``sgd`` when no optimizer is given.
        optimizer: optional optimizer object.

    NOTE: the printed loss is the accumulated ``loss(...).sum()`` divided by
    the number of examples, so a ``loss`` that already averages over the
    batch is reported divided by the batch size once more.
    """
    use_optimizer = optimizer is not None
    for epoch_no in range(1, num_epochs + 1):
        loss_total, correct_total, seen = 0.0, 0.0, 0
        for X, y in train_iter:
            y_hat = net(X)
            batch_loss = loss(y_hat, y).sum()

            # Zero the gradients before the backward pass
            if use_optimizer:
                optimizer.zero_grad()
            elif params is not None and params[0].grad is not None:
                for p in params:
                    p.grad.data.zero_()

            batch_loss.backward()
            if use_optimizer:
                optimizer.step()  # the path used by this concise implementation
            else:
                sgd(params, lr, batch_size)

            loss_total += batch_loss.item()
            correct_total += (y_hat.argmax(dim=1) == y).sum().item()
            seen += y.shape[0]
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
              % (epoch_no, loss_total / seen, correct_total / seen, test_acc))

# Train through the optimizer path (params and lr are not needed here).
# NOTE: the loss column below is roughly batch_size times smaller than in the
# from-scratch run. nn.CrossEntropyLoss averages over the batch by default,
# and train_ch3 then divides the accumulated value by the number of examples.
train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, None, None, optimizer)
# epoch 1, loss 0.0031, train acc 0.748, test acc 0.791
# epoch 2, loss 0.0022, train acc 0.813, test acc 0.797
# epoch 3, loss 0.0021, train acc 0.825, test acc 0.807
# epoch 4, loss 0.0020, train acc 0.831, test acc 0.818
# epoch 5, loss 0.0019, train acc 0.837, test acc 0.827

3. 其他补充知识

3.1 权重衰减——应对过拟合

带有 $L_2$ 范数惩罚项的新损失函数为:$l(w_1, w_2, b)+\frac{\lambda}{2n}\|w\|^2$,它的效果是减小 $w$,这也就是权重衰减(weight decay)的由来。

optimizer_w = torch.optim.SGD(params=[net.weight], lr=lr, weight_decay=wd) # decay the weight parameter only; wd is a positive constant (the coefficient λ of the penalty term), not necessarily greater than 1

3.2 丢弃法——应对过拟合

当对该隐藏层使用丢弃法时,该层的隐藏单元将有一定概率被丢弃掉。设丢弃概率为 $p$,那么有 $p$ 的概率 $h_i$ 会被清零,有 $1-p$ 的概率 $h_i$ 会除以 $1-p$ 做拉伸。丢弃概率是丢弃法的超参数。具体来说,设随机变量 $\xi_i$ 为 0 和 1 的概率分别为 $p$ 和 $1-p$,则使用丢弃法后的隐藏单元为 $h_i' = \frac{\xi_i}{1-p} h_i$,其期望仍为 $h_i$。由于在训练中隐藏层神经元的丢弃是随机的,即 $h_1, \ldots, h_5$ 都有可能被清零,输出层的计算无法过度依赖 $h_1, \ldots, h_5$ 中的任一个,从而在训练模型时起到正则化的作用,并可以用来应对过拟合。在测试模型时,我们为了拿到更加确定性的结果,一般不使用丢弃法。

posted @ 2022-01-24 12:34  ccql  阅读(9)  评论(0编辑  收藏  举报  来源