深度学习训练模型时保存Log输出信息为文件
使用logging包实现边在命令行输出结果,边保存结果为Log文件
首先自定义一个Logger类,在其中调用logging包实现功能;然后实例化一个对象logger,直接调用logger.info即可,例如:
logger.info(" ===cost time:{:.4f}s".format(end - start))
完整的示例如下,包含logging记录信息以及tensorboard的summary监控指标(https://www.cnblogs.com/ywheunji/p/10712620.html)。参照示例,直接添加logger.info语句即可将信息写入日志文件。
# NOTE(review): this listing is an abridged excerpt of a training script. Its
# import block is not shown (it uses logging, time, torch, DataLoader, cfg,
# amp, Build_Model, Evaluator, SummaryWriter, init_seeds and opt), and the
# block nesting below was reconstructed from a whitespace-collapsed paste —
# confirm against the full script before running.


class Logger(object):
    """Configure a named logger that writes to a log file and to the console.

    Retrieve the configured ``logging.Logger`` with :meth:`get_log`.
    """

    def __init__(self, log_file_name, log_level, logger_name):
        """Attach a file handler and a console handler to the named logger.

        Args:
            log_file_name: Path handed to ``logging.FileHandler``.
            log_level: Level set on the logger (e.g. ``logging.DEBUG``).
            logger_name: Name handed to ``logging.getLogger``.
        """
        # Function-scope import: the excerpt shows no module-level imports.
        import os

        # firstly, create a logger
        self.__logger = logging.getLogger(logger_name)
        self.__logger.setLevel(log_level)

        # logging.getLogger returns the same object for the same name, so a
        # second Logger(...) with that name would stack duplicate handlers and
        # every record would be emitted twice. Detect what is already attached.
        log_path = os.path.abspath(log_file_name)
        has_file_handler = False
        has_console_handler = False
        for handler in self.__logger.handlers:
            if isinstance(handler, logging.FileHandler):
                # Only a handler on the very same file counts as a duplicate;
                # a different file name still gets its own handler below.
                if handler.baseFilename == log_path:
                    has_file_handler = True
            elif type(handler) is logging.StreamHandler:
                # Exact type on purpose: FileHandler and other subclasses of
                # StreamHandler are not the plain console handler added here.
                has_console_handler = True

        # secondly, define the output form shared by both handlers
        formatter = logging.Formatter(
            "[%(asctime)s]-[%(filename)s line:%(lineno)d]:%(message)s "
        )

        # finally, create the missing handlers and add them to the logger
        if not has_file_handler:
            file_handler = logging.FileHandler(log_file_name)
            file_handler.setFormatter(formatter)
            self.__logger.addHandler(file_handler)
        if not has_console_handler:
            console_handler = logging.StreamHandler()
            console_handler.setFormatter(formatter)
            self.__logger.addHandler(console_handler)

    def get_log(self):
        """Return the configured ``logging.Logger`` instance."""
        return self.__logger


class Trainer(object):
    """Training loop that reports progress through ``logger`` and ``writer``.

    Both ``logger`` and ``writer`` are module-level names created in the
    ``__main__`` section at the bottom of this listing.
    """

    def __init__(self, weight_path, resume, gpu_id, accumulate, fp_16):
        """Build the training data loader and the model.

        Args:
            weight_path: Forwarded to ``Build_Model``.
            resume: Forwarded to ``Build_Model``.
            gpu_id: Not used in the visible excerpt.
            accumulate: Not used in the visible excerpt.
            fp_16: Not used in the visible excerpt.
        """
        init_seeds(0)
        # NOTE(review): self.train_dataset and self.device are read here, and
        # train() reads self.fp_16, self.optimizer, self.criterion,
        # self.start_epoch and self.epochs, but none of them is assigned in
        # this excerpt — presumably elided; confirm against the full script.
        self.train_dataloader = DataLoader(
            self.train_dataset,
            batch_size=cfg.TRAIN["BATCH_SIZE"],
            num_workers=cfg.TRAIN["NUMBER_WORKERS"],
            shuffle=True,
            pin_memory=True,
        )

        self.yolov4 = Build_Model(weight_path=weight_path, resume=resume).to(
            self.device
        )

    def train(self):
        """Run the epoch loop, logging to file/console and to TensorBoard."""
        logger.info(
            "Training start,img size is: {:d},batchsize is: {:d},work number is {:d}".format(
                cfg.TRAIN["TRAIN_IMG_SIZE"],
                cfg.TRAIN["BATCH_SIZE"],
                cfg.TRAIN["NUMBER_WORKERS"],
            )
        )
        logger.info(self.yolov4)
        logger.info(
            "Train datasets number is : {}".format(len(self.train_dataset))
        )

        if self.fp_16:
            self.yolov4, self.optimizer = amp.initialize(
                self.yolov4, self.optimizer, opt_level="O1", verbosity=0
            )
        logger.info(" ======= start training ====== ")
        for epoch in range(self.start_epoch, self.epochs):
            # NOTE(review): start is not read again in this excerpt —
            # presumably consumed by an elided cost-time log; confirm.
            start = time.time()
            self.yolov4.train()

            # NOTE(review): mloss is never updated in this excerpt, so the
            # values logged below stay at zero — presumably elided.
            mloss = torch.zeros(4)
            logger.info("===Epoch:[{}/{}]===".format(epoch, self.epochs))
            for i, (imgs, label_sbbox) in enumerate(self.train_dataloader):
                # NOTE(review): p and p_d are not defined in this excerpt —
                # presumably produced by an elided forward pass; confirm.
                loss, loss_ciou, loss_conf, loss_cls = self.criterion(
                    p, p_d, label_sbbox
                )

                loss.backward()
                # Print batch results
                if i % 10 == 0:
                    logger.info(
                        " === Epoch:[{:3}/{}],step:[{:3}/{}],img_size:[{:3}],total_loss:{:.4f}|loss_ciou:{:.4f}|loss_conf:{:.4f}|loss_cls:{:.4f}|lr:{:.4f}".format(
                            epoch,
                            self.epochs,
                            i,
                            len(self.train_dataloader) - 1,
                            self.train_dataset.img_size,
                            mloss[3],
                            mloss[0],
                            mloss[1],
                            mloss[2],
                            self.optimizer.param_groups[0]["lr"],
                        )
                    )
                    # Global step = batches per epoch * epoch + batch index.
                    writer.add_scalar(
                        "loss_ciou",
                        mloss[0],
                        len(self.train_dataloader) * epoch + i,
                    )
                    writer.add_scalar(
                        "train_loss",
                        mloss[3],
                        len(self.train_dataloader) * epoch + i,
                    )

            # eval
            logger.info("===== Validate =====")
            logger.info("val img size is {}".format(cfg.VAL["TEST_IMG_SIZE"]))
            with torch.no_grad():
                APs, inference_time = Evaluator(
                    self.yolov4, showatt=False
                ).APs_voc()
                # The accumulator must be bound before the += below; the
                # visible excerpt never initialised it (UnboundLocalError).
                mAP = 0.0
                for key in APs:
                    logger.info("{} --> mAP : {}".format(key, APs[key]))
                    mAP += APs[key]
                mAP = mAP / self.train_dataset.num_classes
                logger.info("mAP : {}".format(mAP))
                logger.info(
                    "inference time: {:.2f} ms".format(inference_time)
                )
                writer.add_scalar("mAP", mAP, epoch)
            # NOTE(review): __save_model_weights is not defined in this
            # excerpt — presumably an elided private method of Trainer.
            self.__save_model_weights(epoch, mAP)
            logger.info("save weights done")
            logger.info(" ===test mAP:{:.3f}".format(mAP))


if __name__ == "__main__":
    # NOTE(review): opt is not defined in this excerpt — presumably parsed
    # command-line options; confirm against the full script.
    # Assignments at module level are already global, so no `global`
    # statement is needed for Trainer.train to see writer and logger.
    writer = SummaryWriter(logdir=opt.log_path + "/event")
    logger = Logger(
        log_file_name=opt.log_path + "/log.txt",
        log_level=logging.DEBUG,
        logger_name="YOLOv4",
    ).get_log()

    Trainer(
        weight_path=opt.weight_path,
        resume=opt.resume,
        gpu_id=opt.gpu_id,
        accumulate=opt.accumulate,
        fp_16=opt.fp_16,
    ).train()
凤舞九天