自定义数据和数据集
数据集的准备:
使用MNIST数据集(CSV格式): 训练集中包含60000个样本; 测试集中包含10000个样本。
Dataset类
三个重要的方法:
__init__(): 初始化
__len__(): 返回数据集的长度;
__getitem__(): 根据提供的索引,返回数据集中的样本
应用一:使用Dataset类的简单示例
需要重写Dataset类函数:__len__() 和__getitem__()
"""Minimal example of a custom PyTorch ``Dataset``.

We can do amazing things with the PyTorch Dataset class. This example
overrides two of its methods:

``__len__()``: returns the size of the dataset, that is, the total number
of samples.
``__getitem__()``: when given an index, returns the data sample
corresponding to that index.
"""
import numpy as np
from torch.utils.data import Dataset


class ExampleDataset(Dataset):
    """Expose an indexable container through the ``Dataset`` interface."""

    def __init__(self, data):
        """Keep a reference to ``data``; nothing is copied or converted."""
        self.data = data

    def __len__(self):
        """Return the length of the wrapped data."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``data[idx]``; ``idx`` is forwarded unchanged."""
        return self.data[idx]
调用:
# Wrap a ten-element array in the dataset and read it back.
sample_data = np.arange(0, 10)
print('The whole data: ', sample_data)
dataset = ExampleDataset(sample_data)
print('Number of samples in the data: ', len(dataset))
print(dataset[2])    # a single sample
print(dataset[0:5])  # the slice is handed straight to the wrapped array

# Output:
# The whole data:  [0 1 2 3 4 5 6 7 8 9]
# Number of samples in the data:  10
# 2
# [0 1 2 3 4]
应用二:从CSV文件中加载数据集和DataLoaders
导入需要的包:
1 import pandas as pd 2 import numpy as np 3 import torch 4 import torchvision 5 import torch.nn as nn 6 import torch.nn.functional as F 7 import torch.optim as optim 8 from torchvision.transforms import transforms 9 from torch.utils.data import DataLoader 10 from torch.utils.data import Dataset
获取设备:
def get_device():
    """Return ``'cuda:0'`` when CUDA is available, otherwise ``'cpu'``."""
    return 'cuda:0' if torch.cuda.is_available() else 'cpu'


# Resolved once here so later code can move tensors and modules with .to(device).
device = get_device()
加载和准备数据
# Read the data.
# NOTE(review): read_csv treats the first row as a header by default --
# confirm both CSV files actually start with a header row.
df_train = pd.read_csv('mnist_train.csv')
df_test = pd.read_csv('mnist_test.csv')

# Get the image pixel values and labels: column 0 is taken as the label,
# every remaining column as pixel data.
train_labels = df_train.iloc[:, 0]
train_images = df_train.iloc[:, 1:]
test_labels = df_test.iloc[:, 0]
test_images = df_test.iloc[:, 1:]
定义图像变换:
# Define the transforms applied to each image, in order.
transform = transforms.Compose([
    transforms.ToPILImage(),
    # PIL image -> float tensor scaled to [0, 1].
    transforms.ToTensor(),
    # (x - 0.5) / 0.5 on the single channel, mapping [0, 1] to [-1, 1].
    transforms.Normalize((0.5, ), (0.5, )),
])
准备自定义Dataset和DataLoaders
# custom dataset
class MNISTDataset(Dataset):
    """Dataset over a DataFrame whose rows are flattened 28x28 images.

    Args:
        images: DataFrame with one image per row. Each row is reshaped to
            ``(28, 28, 1)``, so it must contain exactly 784 values.
        labels: Optional labels aligned position-for-position with the rows
            of ``images``. Defaults to ``None``, in which case samples are
            returned without a label.
        transforms: Optional callable applied to each ``uint8`` image array.
            Defaults to ``None`` (no transform).
    """

    def __init__(self, images, labels=None, transforms=None):
        self.X = images
        self.y = labels
        self.transforms = transforms

    def __len__(self):
        """Return the number of rows in ``images``."""
        return len(self.X)

    def __getitem__(self, i):
        """Return sample ``i`` as ``(image, label)``, or ``image`` alone
        when the dataset was built without labels."""
        data = self.X.iloc[i, :]
        data = np.asarray(data).astype(np.uint8).reshape(28, 28, 1)

        if self.transforms:
            data = self.transforms(data)

        if self.y is None:
            return data

        # Look the label up by position, exactly like the image row above.
        # Plain ``y[i]`` on a pandas Series is a label-based lookup, which
        # returns the wrong row (or raises KeyError) as soon as the index is
        # not 0..n-1, e.g. after filtering or shuffling the DataFrame.
        # Lists and arrays have no ``.iloc`` and are indexed directly.
        labels = self.y.iloc if hasattr(self.y, 'iloc') else self.y
        return (data, labels[i])


train_data = MNISTDataset(train_images, train_labels, transform)
test_data = MNISTDataset(test_images, test_labels, transform)

# dataloaders
trainloader = DataLoader(train_data, batch_size=128, shuffle=True)
# Accuracy does not depend on sample order, so the test set is not shuffled;
# this keeps evaluation batches reproducible.
testloader = DataLoader(test_data, batch_size=128, shuffle=False)
接下来就是定义网络、训练以及测试它
网络定义:
# define the neural net class
class Net(nn.Module):
    """Small CNN: two 5x5 conv layers, each followed by 2x2 max pooling,
    then two fully connected layers producing 10 raw scores (logits)."""

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=20,
                               kernel_size=5, stride=1)
        self.conv2 = nn.Conv2d(in_channels=20, out_channels=50,
                               kernel_size=5, stride=1)
        # 800 = 50 channels * 4 * 4. For a 1x28x28 input the spatial size
        # goes 28 -> 24 (conv) -> 12 (pool) -> 8 (conv) -> 4 (pool).
        self.fc1 = nn.Linear(in_features=800, out_features=500)
        self.fc2 = nn.Linear(in_features=500, out_features=10)

    def forward(self, x):
        """Run the forward pass and return the output of ``fc2``.

        No softmax is applied here; the scores are returned as-is.
        """
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, 2, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2, 2)
        # Flatten everything except the batch dimension.
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x


net = Net().to(device)
print(net)
优化器和损失函数:
# loss: cross-entropy computed on the raw scores returned by the network
criterion = nn.CrossEntropyLoss()

# optimizer: SGD with momentum over all parameters of the network
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
训练和测试网络:
def train(net, trainloader, epochs=10):
    """Train ``net`` on ``trainloader`` and print the mean batch loss per epoch.

    Uses the module-level ``device``, ``optimizer`` and ``criterion``.

    Args:
        net: The model to optimise; it is switched to training mode.
        trainloader: Iterable of batches where ``batch[0]`` is the inputs and
            ``batch[1]`` the labels.
        epochs: Number of passes over ``trainloader``. Defaults to 10.
    """
    net.train()
    # Guard the average against an empty loader (would divide by zero).
    num_batches = max(len(trainloader), 1)

    for epoch in range(epochs):
        running_loss = 0
        for data in trainloader:
            # data pixels and labels to GPU if available
            inputs = data[0].to(device, non_blocking=True)
            labels = data[1].to(device, non_blocking=True)

            # set the parameter gradients to zero
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            # propagate the loss backward
            loss.backward()
            # update the parameters using the gradients just computed
            optimizer.step()

            running_loss += loss.item()

        print('[Epoch %d] loss: %.3f' %
              (epoch + 1, running_loss / num_batches))

    print('Done Training')


def test(net, testloader):
    """Evaluate ``net`` on ``testloader``, print and return the accuracy.

    Uses the module-level ``device``. The model is switched to evaluation
    mode and left in that mode.

    Args:
        net: The model to evaluate.
        testloader: Iterable of batches where ``batch[0]`` is the inputs and
            ``batch[1]`` the labels.

    Returns:
        Accuracy as a percentage; ``0.0`` when the loader yields no samples.
    """
    net.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            inputs = data[0].to(device, non_blocking=True)
            labels = data[1].to(device, non_blocking=True)
            outputs = net(inputs)
            # Index of the highest score along dim 1 is the predicted class.
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total if total else 0.0
    print('Accuracy of the network on test images: %0.3f %%' % accuracy)
    return accuracy


train(net, trainloader)
test(net, testloader)