HW1
固定随机数种子:
def same_seed(seed):
    '''Fix the random number generator seeds for reproducibility.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU, plus every CUDA
    device when CUDA is available) and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed shared by all generators.
    '''
    import random  # Local import: the snippet's file-level imports do not include it.

    # Ask cuDNN for deterministic algorithms and turn off its auto-tuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
划分数据集(torch.utils.data.random_split):
def train_valid_split(data_set, valid_ratio, seed):
    '''Split the provided training data into a training set and a validation set.

    Args:
        data_set: Sized, indexable collection of samples (e.g. a 2-D ndarray).
        valid_ratio: Fraction of the samples, within [0, 1], assigned to the
            validation set. The validation size is
            ``int(valid_ratio * len(data_set))``, i.e. rounded down.
        seed: Seed for the generator that drives the random split.

    Returns:
        A tuple ``(train_set, valid_set)`` of NumPy arrays.

    Raises:
        ValueError: If ``valid_ratio`` is not within [0, 1].
    '''
    # A ratio outside [0, 1] would yield a negative split length, which
    # random_split slices without complaint; reject it up front instead.
    if not 0 <= valid_ratio <= 1:
        raise ValueError(
            'valid_ratio must be within [0, 1], got {!r}'.format(valid_ratio)
        )

    valid_set_size = int(valid_ratio * len(data_set))
    train_set_size = len(data_set) - valid_set_size
    train_set, valid_set = random_split(
        data_set,
        [train_set_size, valid_set_size],
        generator=torch.Generator().manual_seed(seed),
    )
    # Convert the split pieces back to ndarrays for the caller.
    return np.array(train_set), np.array(valid_set)
random_split还可以帮助我们划分成训练集、验证集和测试集:
import numpy as np
import torch
from torch.utils.data import random_split
# Build a toy dataset: 100 samples with 10 features each.
# The input is an ndarray; for a CSV file loaded as a DataFrame `df`,
# use data = df.values instead.
data = np.arange(1000).reshape(100, 10)
print(data.shape)  # (100, 10)

# Split 7:2:1 into training, validation and test sets with a seeded generator.
split_generator = torch.Generator().manual_seed(520)
data_split = random_split(data, [70, 20, 10], generator=split_generator)
train_data, valid_data, test_data = data_split

# The split pieces have to be converted back to ndarrays, so the whole
# procedure goes from ndarray to ndarray.
for subset in data_split:
    print(np.array(subset).shape)  # (70, 10), then (20, 10), then (10, 10)
early_stop策略用在模型的训练和验证上:
import math
import torch
# Initialise once, before the epoch loop: the best validation loss seen so far
# and the number of consecutive epochs without improvement.
best_loss, early_stop_count = math.inf, 0

# Run this check at the end of every epoch, after training and validation.
if mean_valid_loss < best_loss:
    best_loss = mean_valid_loss
    torch.save(model.state_dict(), config['save_path'])  # Save your best model
    print('Saving model with loss {:.3f}...'.format(best_loss))
    early_stop_count = 0
else:
    early_stop_count += 1

if early_stop_count >= config['early_stop']:
    print('\nModel is not improving, so we halt the training session.')
    # NOTE: printing alone does not stop anything. When this check sits inside
    # the epoch loop, leave the loop here (`break`, or `return` from the
    # enclosing training function).
把所有的参数放在一个config字典中,方便调整:
# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Every tunable setting is collected in one dictionary so it is easy to adjust.
config = dict(
    seed=5201314,                     # Seed number; pick your lucky number. :)
    select_all=True,                  # Whether to use all features.
    valid_ratio=0.2,                  # validation_size = train_size * valid_ratio
    n_epochs=3000,                    # Number of epochs.
    batch_size=256,
    learning_rate=1e-5,
    early_stop=400,                   # Stop training after this many consecutive epochs without improvement.
    save_path='./models/model.ckpt',  # The model is saved here.
)
测试结果保存为csv文件(需要先 import csv):
def save_pred(preds, file):
    '''Save predictions to the specified CSV file.

    Writes a header row ``id,tested_positive`` followed by one row per
    prediction, where ``id`` is the prediction's zero-based position.

    Args:
        preds: Iterable of prediction values.
        file: Path of the CSV file to create or overwrite.
    '''
    # newline='' lets the csv module control line endings itself; without it,
    # platforms that translate '\n' end up with a blank line after every row.
    with open(file, 'w', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(['id', 'tested_positive'])
        writer.writerows(enumerate(preds))