An example of training a model on a single machine with multiple GPUs
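The script below is a complete, runnable demo: it wraps the model in `nn.DataParallel` so each batch is split across all visible GPUs, and along the way covers deterministic seeding, a multi-worker `DataLoader`, an AdamW training loop with periodic loss logging, and saving/reloading the trained weights for an evaluation pass.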
```python
"""My demo train script."""
import argparse
import logging
import os
import random
import time

import numpy as np
import torch
from torch import nn, optim, Tensor
from torch.utils.data import DataLoader, Dataset


def parse_args() -> argparse.Namespace:
    """Parse arguments."""
    parser = argparse.ArgumentParser(description="Training")
    parser.add_argument("--seed", type=int, help="Fix random seed", default=123)
    parser.add_argument("--log_file", type=str, help="Log file", default="test_train.log")
    parser.add_argument("--log_path", type=str, help="Model path", default="./training_log/")
    parser.add_argument("--train_epochs", type=int, help="Epochs of training", default=5)
    parser.add_argument("--batch_size", type=int, help="Batch size", default=32)
    parser.add_argument("--learning_rate", type=float, help="Learning rate", default=1e-3)
    parser.add_argument("--device", type=str, help="Run on which device", default="cpu")
    parser.add_argument("--cuda_visible_devices", type=str, help="Cuda visible devices", default="0")
    return parser.parse_args()


def init_logging(log_file: str, level: str = "INFO") -> None:
    """Initialize logging to both a file and stderr."""
    logging.basicConfig(
        filename=log_file,
        filemode="w",
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        level=level,
    )
    logging.getLogger().addHandler(logging.StreamHandler())


def set_seed(seed: int) -> None:
    """Set seed for reproducibility."""
    os.environ["PYTHONHASHSEED"] = str(seed)
    # Required by use_deterministic_algorithms() for cuBLAS >= 10.2.
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.use_deterministic_algorithms(True)


def seed_worker(work_id: int) -> None:
    """Seed each DataLoader worker process."""
    np.random.seed(work_id)
    random.seed(work_id)


class DatasetClass(Dataset):
    """My demo dataset class: target = input[:, 0] + 1.0."""

    def __init__(self):
        self.input = np.random.rand(1000000, 2).astype(np.float32)
        # self.input[:, 1] = 0.0
        # float32 to match the model's parameters (np.zeros defaults to float64).
        self.target = np.zeros([1000000, 1], dtype=np.float32)
        self.target[:, 0] = self.input[:, 0] + 1.0

    def __len__(self):
        return len(self.input)

    def __getitem__(self, idx: int) -> tuple:
        return self.input[idx], self.target[idx]


class ModelClass(torch.nn.Module):
    """My demo model class."""

    def __init__(self):
        super().__init__()
        self.my_layer = nn.Linear(2, 1)

    def forward(self, inputs: Tensor) -> Tensor:
        """My demo forward function."""
        outputs = self.my_layer(inputs)
        return outputs


def get_loss(model_output: Tensor, target: Tensor) -> Tensor:
    """My demo loss function: summed L2 distance over the batch."""
    loss = torch.norm(model_output - target, dim=-1).sum()
    return loss


def training() -> None:
    """My demo training function."""
    train_set = DatasetClass()
    g = torch.Generator()
    g.manual_seed(args.seed)
    train_loader = DataLoader(
        dataset=train_set,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=os.cpu_count(),
        pin_memory=True,
        worker_init_fn=seed_worker,
        generator=g,
    )
    model = ModelClass()
    if args.device == "cuda":
        # DataParallel splits each batch across all visible GPUs.
        model = nn.DataParallel(model)
    model.to(args.device)
    optimizer = optim.AdamW(model.parameters(), lr=args.learning_rate)
    for epoch in range(args.train_epochs):
        model.train()
        for batch_index, (features, labels) in enumerate(train_loader):
            features = features.to(args.device)
            labels = labels.to(args.device)
            model_outputs = model(features)
            optimizer.zero_grad(set_to_none=True)
            loss = get_loss(model_outputs, labels)
            loss.backward()
            optimizer.step()
            if batch_index % 1000 == 0:
                logging.info(
                    "Epoch: %s, Batch index: %s, Loss: %s",
                    epoch,
                    batch_index,
                    loss.item(),
                )
    os.makedirs(args.log_path, exist_ok=True)  # ensure the output dir exists
    torch.save(model.state_dict(), f"{args.log_path}/trained_model.pth")


def testing() -> None:
    """My demo testing function."""
    test_set = DatasetClass()
    g = torch.Generator()
    g.manual_seed(args.seed)
    test_loader = DataLoader(
        dataset=test_set,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=os.cpu_count(),
        pin_memory=True,
        worker_init_fn=seed_worker,
        generator=g,
    )
    model = ModelClass()
    if args.device == "cuda":
        # Wrap before loading so the "module."-prefixed keys in the
        # checkpoint match the DataParallel wrapper's state_dict.
        model = nn.DataParallel(model)
    model.load_state_dict(torch.load(f"{args.log_path}/trained_model.pth"))
    model.to(args.device)
    model.eval()
    with torch.no_grad():
        for batch_index, (features, labels) in enumerate(test_loader):
            features = features.to(args.device)
            labels = labels.to(args.device)
            model_outputs = model(features)
            loss = get_loss(model_outputs, labels)
            if batch_index % 1000 == 0:
                logging.info(
                    "Batch index: %s, Loss: %s",
                    batch_index,
                    loss.item() / args.batch_size,  # per-sample loss
                )


if __name__ == "__main__":
    args = parse_args()
    set_seed(args.seed)
    init_logging(args.log_file)
    # Must be set before the first CUDA context is created; the lazy
    # CUDA seeding calls in set_seed() do not initialize CUDA.
    os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda_visible_devices
    main_start_time = time.time()
    training()
    main_end_time = time.time()
    logging.info("Main time: %s", main_end_time - main_start_time)
    testing()
```
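To exercise the multi-GPU path, the script would be launched with `--device cuda` and a comma-separated GPU list, e.g. `python test_train.py --device cuda --cuda_visible_devices 0,1` (the filename `test_train.py` is an assumption, echoing the default log file name). One practical detail when reusing the checkpoint outside this script: `nn.DataParallel` prefixes every key in the saved `state_dict` with `module.`, which is why `testing()` wraps the model before calling `load_state_dict`. To load the same checkpoint into a plain, unwrapped `ModelClass` instead, the prefix has to be stripped first. A minimal sketch, assuming the checkpoint was produced by the script above:

```python
import torch

from test_train import ModelClass  # assumed module name for the script above

# Load on CPU regardless of which device training ran on.
state_dict = torch.load("./training_log/trained_model.pth", map_location="cpu")
# Strip the "module." prefix that nn.DataParallel adds to every parameter key.
state_dict = {
    (k[len("module."):] if k.startswith("module.") else k): v
    for k, v in state_dict.items()
}

model = ModelClass()
model.load_state_dict(state_dict)
model.eval()
```

A checkpoint saved this way (from the bare `module` or with the prefix stripped) loads cleanly whether or not the consumer uses `DataParallel`, which is why many training scripts save `model.module.state_dict()` instead of the wrapper's.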