自定义数据和数据集

数据集的准备:

训练集

测试集

训练集中包含60000个样本;测试集中包含10000个样本;

 

Dataset类

三个重要的方法:

__init__(): 初始化

__len__(): 返回数据集的长度;

__getitem__(): 根据提供的索引,返回数据集中的样本

 

应用一:使用Dataset类的一个小示例

需要重写Dataset类函数:__len__() 和__getitem__()

 1 '''
 2 We can do amazing things with the PyTorch Dataset class. We need to ensure that we override two 
 3 of its functions:
 4 `__len__()`: returns the size of the dataset, that is, the total number of samples.
 5 `__getitem__()`: when given an index, returns the data sample corresponding to that index.
 6 '''
 7 import numpy as np
 8 from torch.utils.data import Dataset
 9 class ExampleDataset(Dataset): #自定义一个类
10     def __init__(self, data): #初始化,把数据作为一个参数传递给类;
11         self.data = data
12     def __len__(self):
13         return len(self.data)  #返回数据的长度
14     
15     def __getitem__(self, idx):
16         return self.data[idx]  #根据索引返回数据

调用:

 1 sample_data = np.arange(0, 10)
 2 print('The whole data: ', sample_data)
 3 dataset = ExampleDataset(sample_data)
 4 print('Number of samples in the data: ', len(dataset))
 5 print(dataset[2])
 6 print(dataset[0:5])
 7 
 8 输出:
 9 The whole data:  [0 1 2 3 4 5 6 7 8 9]
10 Number of samples in the data:  10
11 2
12 [0 1 2 3 4]

应用二:从CSV文件中加载数据集和DataLoaders

导入需要的包:

 1 import pandas as pd
 2 import numpy as np
 3 import torch
 4 import torchvision
 5 import torch.nn as nn
 6 import torch.nn.functional as F
 7 import torch.optim as optim 
 8 from torchvision.transforms import transforms
 9 from torch.utils.data import DataLoader
10 from torch.utils.data import Dataset

获取设备:

def get_device():
    """Return the device string to compute on.

    Returns:
        ``'cuda:0'`` when ``torch.cuda.is_available()`` is true,
        otherwise ``'cpu'``.
    """
    return 'cuda:0' if torch.cuda.is_available() else 'cpu'


# Resolved once at module level.
device = get_device()

加载和准备数据

# Read the train/test CSV files (paths are relative to the working directory).
# NOTE(review): read_csv treats the first row as a header by default --
# confirm the CSV files actually start with a header row.
df_train = pd.read_csv('mnist_train.csv')
df_test = pd.read_csv('mnist_test.csv')

# Column 0 is taken as the label; every remaining column as a pixel value.
train_labels = df_train.iloc[:, 0]
train_images = df_train.iloc[:, 1:]
test_labels = df_test.iloc[:, 0]
test_images = df_test.iloc[:, 1:]

定义图像变换:

# Per-image preprocessing, applied in order:
# array -> PIL image -> tensor -> normalize with mean 0.5 and std 0.5
# (single-element tuples: one channel).
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

准备自定义Dataset和DataLoaders

 1 # custom dataset
 2 class MNISTDataset(Dataset):
 3     def __init__(self, images, labels=None, transforms=None):#Labels和tran默认为NOne
 4         self.X = images
 5         self.y = labels
 6         self.transforms = transforms
 7          
 8     def __len__(self):
 9         return (len(self.X)) #长度和之前一样,返回数据的长度
10     
11     def __getitem__(self, i):
12         data = self.X.iloc[i, :]
13         data = np.asarray(data).astype(np.uint8).reshape(28, 28, 1)
14         
15         if self.transforms:
16             data = self.transforms(data)
17             
18         if self.y is not None:
19             return (data, self.y[i])
20         else:
21             return data
# Datasets share the same preprocessing pipeline.
train_data = MNISTDataset(train_images, train_labels, transform)
test_data = MNISTDataset(test_images, test_labels, transform)

# dataloaders
trainloader = DataLoader(train_data, batch_size=128, shuffle=True)
# Accuracy does not depend on sample order, so the test set is not shuffled.
testloader = DataLoader(test_data, batch_size=128, shuffle=False)

接下来就是定义网络、训练以及测试它

网络定义:

 1 # define the neural net class
 2 class Net(nn.Module):
 3     def __init__(self):
 4         super(Net, self).__init__()
 5         self.conv1 = nn.Conv2d(in_channels=1, out_channels=20, 
 6                                kernel_size=5, stride=1)
 7         self.conv2 = nn.Conv2d(in_channels=20, out_channels=50, 
 8                                kernel_size=5, stride=1)
 9         self.fc1 = nn.Linear(in_features=800, out_features=500)
10         self.fc2 = nn.Linear(in_features=500, out_features=10)
11     def forward(self, x):
12         x = F.relu(self.conv1(x))
13         x = F.max_pool2d(x, 2, 2)
14         x = F.relu(self.conv2(x))
15         x = F.max_pool2d(x, 2, 2)
16         x = x.view(x.size(0), -1)
17         x = F.relu(self.fc1(x))
18         x = self.fc2(x)
19         return x
# Instantiate the network, move it to the selected device, and show its layers.
net = Net().to(device)
print(net)

优化器和损失函数:

# loss: cross-entropy over the network's raw output scores
criterion = nn.CrossEntropyLoss()
# optimizer: SGD with momentum over all parameters of `net`
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

训练和测试网络:

 1 def train(net, trainloader):
 2     for epoch in range(10): # no. of epochs
 3         running_loss = 0
 4         for data in trainloader:
 5             # data pixels and labels to GPU if available
 6             inputs, labels = data[0].to(device, non_blocking=True), data[1].to(device, non_blocking=True)
 7             # set the parameter gradients to zero
 8             optimizer.zero_grad()
 9             outputs = net(inputs)
10             loss = criterion(outputs, labels)
11             # propagate the loss backward
12             loss.backward()
13             # update the gradients
14             optimizer.step()
15  
16             running_loss += loss.item()
17         print('[Epoch %d] loss: %.3f' %
18                       (epoch + 1, running_loss/len(trainloader)))
19  
20     print('Done Training')
def test(net, testloader):
    """Evaluate ``net`` on ``testloader`` and print the accuracy in percent.

    Uses the module-level ``device``.

    Args:
        net: Model to evaluate.
        testloader: Iterable of batches where ``batch[0]`` is the input
            tensor and ``batch[1]`` the label tensor.
    """
    correct = 0
    total = 0

    # Evaluate in eval mode, then put the model back in whatever mode it was
    # in so that calling test() does not affect later training.
    was_training = net.training
    net.eval()
    try:
        with torch.no_grad():
            for batch in testloader:
                inputs = batch[0].to(device, non_blocking=True)
                labels = batch[1].to(device, non_blocking=True)
                # Predicted class = index of the largest output score.
                predicted = net(inputs).argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
    finally:
        net.train(was_training)

    # An empty loader reports 0 instead of raising ZeroDivisionError.
    accuracy = 100 * correct / total if total else 0.0
    print('Accuracy of the network on test images: %0.3f %%' % accuracy)
33     
# Train first, then report accuracy on the held-out test set.
train(net, trainloader)
test(net, testloader)

 

posted @ 2021-05-21 22:58  hi_mxd  阅读(184)  评论(0编辑  收藏  举报