pytorch | A 60 MINUTE BLITZ: code + detailed comments
The code is carried over from the linked original post.
Thanks also to maomao9173 for the detailed study notes.
PART1
from __future__ import print_function
import torch
x = torch.empty(5, 3)
# empty -> uninitialized memory, arbitrary values
x = torch.rand(5, 3)
# rand -> uniform random values in [0, 1)
x = torch.randn(5, 3)
# randn -> standard normal distribution (mean 0, std 1)
x = torch.zeros(5, 3, dtype = torch.long)
# zeros -> all zeros
x = torch.tensor([5.5, 3])
# construct a tensor directly from data
x = x.new_ones(5, 3, dtype = torch.double)
# new_ones: a 5 x 3 tensor of ones; dtype and device come from x unless overridden (here dtype is overridden to double)
x = torch.randn_like(x, dtype = torch.float)
# randn_like: a tensor with the same shape as x, filled with standard normal values (dtype overridden to float)
print(x.size())
# print the shape of the tensor
y = torch.rand(5, 3)
print(x + y)
print(torch.add(x, y))
# element-wise addition; the two statements above give the same result
result = torch.empty(5, 3)
torch.add(x, y, out = result)
# store the result of x + y in result
y.add_(x)
y.copy_(x)
# add x to y in place / copy x into y
# convention: methods with a trailing "_" operate in place, storing the result in the calling tensor
print(x[:, 1])
# prints column index 1 (the second column) across all rows
x = torch.randn(4, 4)
y = x.view(16)
z = x.view(-1, 8)
# view reshapes the tensor (same data, new shape)
# the size of z's first dimension (-1) is inferred automatically
x = torch.randn(1)
print(x)
print(x.item())
# item() extracts the value of a one-element tensor as a Python number
a = torch.ones(5)
b = a.numpy()
# tensor <-> numpy: trivially easy
a.add_(1)
print(a)
print(b)
# A CPU tensor and the NumPy array obtained from it share the same
# underlying memory, so changing one changes the other.
import numpy as np
a = np.ones(5)
b = torch.from_numpy(a)
np.add(a, 1, out = a)
print(a)
print(b)
# All the tensors on the CPU except a CharTensor
# support converting to Numpy and back
if torch.cuda.is_available():
    device = torch.device("cuda")
    y = torch.ones_like(x, device = device)
    # ones_like: a tensor of ones with the same shape and dtype as the input
    print(x)
    print(y)
    # this way, a tensor is created directly on the GPU
    x = x.to(device)
    z = x + y
    print(z)
    print(z.to("cpu", torch.double))
    # .to can also change the dtype at the same time!
PART2
import torch
import numpy as np
'''
torch.Tensor is the core class of the autograd package. If you set
[.requires_grad] to True, it tracks every operation performed on it;
once the computation is done, calling [.backward] runs backpropagation
and computes the gradients automatically. The resulting gradients are
accumulated into the [.grad] attribute.
'''
# Use [.detach()] to stop a tensor from tracking history.
# You can also use [with torch.no_grad():] so that history
# is not tracked (and memory is not used) inside the block.
# .grad_fn: the Function that created this tensor
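# A minimal sketch of the two mechanisms above (the tensors a and b here are
# illustrative, not part of the original post):
a = torch.ones(2, 2, requires_grad = True)
print((a * 2).requires_grad)       # True: the result is tracked
with torch.no_grad():
    print((a * 2).requires_grad)   # False: nothing inside this block is tracked
b = a.detach()
print(b.requires_grad)             # False: detach() returns an untracked tensor
print(a.eq(b).all().item())        # True: same values as a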
x = torch.ones(2, 2, requires_grad = True)
print(x)
y = x + 2
print(y)
# y was created by an operation rather than directly by the user,
# so it has a grad_fn
print(y.grad_fn)
z = y * y * 3
out = z.mean()
print(z, out)
# mean() returns the average of all elements
a = torch.randn(2, 2)
a = ((a * 3) / (a - 1))
print(a.requires_grad)
a.requires_grad_(True)
print(a.requires_grad)
b = (a * a).sum()
print(b.grad_fn)
# .requires_grad_(...) changes the requires_grad flag in place
# requires_grad defaults to False
out.backward() # run backpropagation
print(x.grad)
# x.grad now holds d(out)/dx, which is 4.5 for every element
z = (x * x).sum()
# backward() without arguments needs a scalar output, hence the sum()
z.backward()
print(x.grad)
# gradients accumulate: d(z)/dx = 2*x is added on top of the previous values
x = torch.randn(3, requires_grad = True)
y = x * 2
while y.data.norm() < 1000:
    y = y * 2
# .data.norm() is the L2 norm: square every element, sum them, then take the square root
print(y)
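# y is not a scalar, so backward() needs a gradient argument (the vector of a
# vector-Jacobian product). A minimal sketch following the blitz tutorial; the
# values in v are illustrative:
# (note: y.data.norm() above equals torch.sqrt((y ** 2).sum()))
v = torch.tensor([0.1, 1.0, 0.0001], dtype = torch.float)
y.backward(v)
print(x.grad)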
# ones : all ones
# zeros: all zeros
PART3
import torch
import torch.nn as nn
import torch.nn.functional as F
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        # super(Net, self).__init__() runs nn.Module's own constructor first
        # 1 input image channel, 6 output channels, 3x3 square convolution
        # kernel
        self.conv1 = nn.Conv2d(1, 6, 3)
        self.conv2 = nn.Conv2d(6, 16, 3)
        # in channels / out channels / kernel size
        # PyTorch initializes these layers with Kaiming (He) style init by default
        # (a short sketch of explicit initialization follows after print(net) below)
        self.fc1 = nn.Linear(16 * 6 * 6, 120)
        # input features / output features
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)
        # an affine operation: y = Wx + b

    def forward(self, x):
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # if the size is a square you can only specify a single number
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = x.view(-1, self.num_flat_features(x))
        # num_flat_features counts the elements per sample, so view flattens each sample to 1D
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]
        # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features
net = Net()
print(net)
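# As noted in __init__ above, conv/linear layers already come with a Kaiming/He
# style default init. A minimal sketch of doing it explicitly; the choice of
# kaiming_normal_ and zero biases here is illustrative, not from the original post:
for m in net.modules():
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        nn.init.kaiming_normal_(m.weight, nonlinearity = 'relu')
        if m.bias is not None:
            nn.init.zeros_(m.bias)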
params = list(net.parameters())
# print(params)
# print(params.size())
print(params[0].size())
input = torch.randn(1, 1, 32, 32)
out = net(input)
print(out)
net.zero_grad()
out.backward(torch.randn(1, 10))
output = net(input)
target = torch.randn(10)
# target has shape [10]; reshape it to [1, 10] to match the output
target = target.view(1, -1)
# print(target.size())
criterion = nn.MSELoss()
loss = criterion(output, target)
# print(loss)
# print(loss.grad_fn)
# print(loss.grad_fn.next_functions[0][0])
# print(loss.grad_fn.next_functions[0][0].next_functions[0][0])
net.zero_grad()
# zero the gradients first, otherwise they accumulate across backward calls
print('conv1.bias.grad before backward:')
print(net.conv1.bias.grad)
loss.backward()
print('conv1.bias.grad after backward:')
print(net.conv1.bias.grad)
learning_rate = 0.01
for f in net.parameters():
    f.data.sub_(f.grad.data * learning_rate)
# implements weight = weight - learning_rate * gradient
import torch.optim as optim
# this package implements SGD, Nesterov-SGD, Adam, RMSProp, etc.
optimizer = optim.SGD(net.parameters(), lr = 0.01)
# the network's parameters / learning rate
# in your training loop:
optimizer.zero_grad() # zero the gradient buffers
output = net(input)
loss = criterion(output, target)
loss.backward()
optimizer.step() # Do the update
# note: zero_grad() must be called on every iteration, otherwise gradients from previous batches keep accumulating (see the short sketch below)
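# A minimal sketch of why the zeroing matters; it reuses the net / input /
# criterion / target defined above and is illustrative only:
net.zero_grad()
criterion(net(input), target).backward()
first = net.conv1.bias.grad.clone()
criterion(net(input), target).backward()   # second backward without zero_grad()
print(torch.allclose(net.conv1.bias.grad, 2 * first))  # True: the gradient doubled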
PART4
# What about data?
# Generally, when you have to deal with image, text, audio
# or video data, you can use standard python packages that
# load data into a numpy array. Then you can convert this
# array into a torch.*Tensor.
# For images, packages such as Pillow, OpenCV are useful
# For audio, packages such as scipy and librosa
# For text, either raw Python or Cython based loading, or
# NLTK and SpaCy are useful
# CIFAR-10 and CIFAR-100 are labeled image datasets
import torch
import torchvision
import torchvision.transforms as transforms
# The torchvision datasets give images with values in [0, 1].
# We normalize them to values in [-1, 1].
# Note: if this raises a BrokenPipeError on Windows,
# try setting num_workers of torch.utils.data.DataLoader()
# to 0.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        # convert the PIL image to a tensor
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        # the first (0.5, 0.5, 0.5) are the per-channel (RGB) means
        # the second (0.5, 0.5, 0.5) are the per-channel standard deviations
        # output = (image - mean) / std
        # so [0, 1] maps onto [-1, 1]: (0 - 0.5) / 0.5 = -1 and (1 - 0.5) / 0.5 = 1
    ]
)
trainset = torchvision.datasets.CIFAR10(
    root = './data',
    train = True, # training split
    download = True,
    transform = transform
)
trainloader = torch.utils.data.DataLoader(
    trainset, # the dataset to draw training batches from
    batch_size = 4, # samples per batch: default = 1 (see the quick check after the loaders below)
    shuffle = True, # reshuffle the data every epoch
    # num_workers = 1 # number of subprocesses used for loading
)
testset = torchvision.datasets.CIFAR10(
    root = './data',
    train = False, # test split
    download = True,
    transform = transform
)
testloader = torch.utils.data.DataLoader(
    testset,
    batch_size = 4,
    shuffle = False, # do not reshuffle
    # num_workers = 1
)
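# A quick check of what the loaders yield (illustrative, not from the original
# post): with batch_size = 4 and CIFAR-10's 3x32x32 RGB images, every batch is
# a [4, 3, 32, 32] image tensor plus a [4] label tensor.
images, lables = next(iter(trainloader))
print(images.shape, lables.shape)
print(images.min().item(), images.max().item()) # roughly -1 and 1 after Normalize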
classes = ('plane', 'car', 'bird', 'cat', 'deer',
'dog', 'frog', 'horse', 'ship', 'truck')
import matplotlib.pyplot as plt
import numpy as np
# functions to show an image
def imshow(img):
    img = img / 2 + 0.5 # unnormalize
    npimg = img.numpy()
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    # transpose permutes the axes: (C, H, W) -> (H, W, C), which is what matplotlib expects
    plt.show()
# get some random training images
'''
dataiter = iter(trainloader)
images, lables = next(dataiter)
# use dataiter to step through the data
# since shuffle is True, a different random batch comes back each time.
# show images
imshow(torchvision.utils.make_grid(images))
# make_grid: arranges a batch of images into a single grid image
# print lables
print(' '.join(classes[lables[j]] for j in range(4)))
# ' '.join(...) joins the strings with single spaces
'''
import torch.nn as nn
import torch.nn.functional as F
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        # self.conv1 = nn.Conv2d(3, 6, 5)
        self.conv1 = nn.Conv2d(3, 100, 5)
        self.pool = nn.MaxPool2d(2, 2)
        # self.conv2 = nn.Conv2d(6, 16, 5)
        self.conv2 = nn.Conv2d(100, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
net = Net()
import torch.optim as optim
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr = 0.001, momentum = 0.9)
# the network's parameters / learning rate / momentum factor (see the short sketch below)
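# A rough sketch of what the momentum factor does (illustrative pseudo-update
# only; optim.SGD applies an equivalent recurrence internally):
velocity = torch.zeros(1)
grad = torch.ones(1)                  # pretend the gradient stays constant
for step in range(3):
    velocity = 0.9 * velocity + grad  # momentum accumulates past gradients
    print(-0.001 * velocity)          # the step actually taken: -lr * velocity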
path = './cifar_net.pth'
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net.to(device)
# this recursively moves every module's parameters and buffers to the GPU
if (input() == "Train"):
    for epoch in range(10): # loop over the dataset multiple times
        running_loss = 0.0
        for i, data in enumerate(trainloader, 0):
            # enumerate(sequence, start = 0)
            # i is the batch index, data is the (inputs, lables) pair
            inputs, lables = data
            inputs = inputs.to(device)
            lables = lables.to(device)
            # since the network itself is small, the GPU speedup is modest
            # zero the parameter gradients first
            optimizer.zero_grad()
            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, lables)
            loss.backward()
            optimizer.step()
            # print statistics
            running_loss += loss.item()
            if i % 2000 == 1999: # print every 2000 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0
    print('Finished Training')
    torch.save(net.state_dict(), path)
    # net.state_dict() -> the current parameter values
dataiter = iter(testloader)
images, lables = next(dataiter)
#print images
imshow(torchvision.utils.make_grid(images))
print('GroundTruth: ', ' '.join(classes[lables[j]] for j in range(4)))
net = Net()
net.load_state_dict(torch.load(path))
# load back in our saved model
correct = 0
total = 0
with torch.no_grad():
    # gradients are not computed inside this block
    for data in testloader:
        images, lables = data
        outputs = net(images)
        _, predicted = torch.max(outputs.data, 1)
        # first argument: the tensor of raw network outputs (class scores)
        # second argument: the dimension to reduce; 0 -> max over each column, 1 -> max over each row
        # returns two tensors: the maximum values and the indices of those maxima
        total += lables.size(0)
        correct += (predicted == lables).sum().item()
        # tensor.sum() returns the sum of all elements
        # tensor.item() converts a one-element tensor to a Python number
print('Accuracy of the network on the 10000 test images: %d %%' % (
100 * correct / total))
class_correct = list(0. for i in range(10))
class_total = list(0. for i in range(10))
# list(0. for i in range(10)) consumes the generator and builds a list of ten entries
# 0. is a float zero
# "for i in range(10)" produces ten of these zeros
with torch.no_grad():
    for data in testloader:
        images, lables = data
        outputs = net(images)
        _, predicted = torch.max(outputs.data, 1)
        c = (predicted == lables).squeeze()
        # squeeze drops the size-1 dimensions, leaving a 1-D tensor of booleans
        for i in range(4):
            # batch_size is 4, so every batch drawn from testloader holds four samples
            lable = lables[i]
            class_correct[lable] += c[i].item()
            # True counts as 1, False as 0
            class_total[lable] += 1
for i in range(10):
    print("Accuracy of %s : %2d %%" % (
        classes[i], 100 * class_correct[i] / class_total[i]))
# Note: frustratingly, simply making the network deeper/wider does not noticeably improve accuracy, and can even keep the loss from converging.