Saving Log Output to a File While Training a Deep Learning Model

Use the logging package to print results to the command line and save them to a log file at the same time.

First define a custom Logger class that wraps the logging package, instantiate a logger object from it, and then call logger.info directly, for example:

            logger.info("  ===cost time:{:.4f}s".format(end - start))

A complete example follows; it covers both recording information with logging and monitoring metrics with TensorBoard summaries (https://www.cnblogs.com/ywheunji/p/10712620.html). Following this pattern, simply add logger.info calls and the messages will be written to the log file.

    import logging
    import time

    import torch
    from torch.utils.data import DataLoader
    from tensorboardX import SummaryWriter  # assumed: matches the logdir= keyword used below

    # Project-specific imports from the original training repository
    # (cfg, Build_Model, Evaluator, init_seeds, the apex amp module, the
    # opt argument namespace, ...) are omitted in this excerpt.


    class Logger(object):
        def __init__(self, log_file_name, log_level, logger_name):
            # firstly, create a logger
            self.__logger = logging.getLogger(logger_name)
            self.__logger.setLevel(log_level)
            # secondly, create one handler for the file and one for the console
            file_handler = logging.FileHandler(log_file_name)
            console_handler = logging.StreamHandler()
            # thirdly, define the output format of the handlers
            formatter = logging.Formatter(
                "[%(asctime)s]-[%(filename)s line:%(lineno)d]:%(message)s "
            )
            file_handler.setFormatter(formatter)
            console_handler.setFormatter(formatter)
            # finally, add the handlers to the logger
            self.__logger.addHandler(file_handler)
            self.__logger.addHandler(console_handler)

        def get_log(self):
            return self.__logger


    class Trainer(object):
        def __init__(self, weight_path, resume, gpu_id, accumulate, fp_16):
            init_seeds(0)
            # ... device, dataset, optimizer and criterion setup omitted in this excerpt
            self.train_dataloader = DataLoader(
                self.train_dataset,
                batch_size=cfg.TRAIN["BATCH_SIZE"],
                num_workers=cfg.TRAIN["NUMBER_WORKERS"],
                shuffle=True,
                pin_memory=True,
            )

            self.yolov4 = Build_Model(weight_path=weight_path, resume=resume).to(
                self.device
            )

        def train(self):
            global writer
            logger.info(
                "Training start, img size is: {:d}, batch size is: {:d}, worker number is {:d}".format(
                    cfg.TRAIN["TRAIN_IMG_SIZE"],
                    cfg.TRAIN["BATCH_SIZE"],
                    cfg.TRAIN["NUMBER_WORKERS"],
                )
            )
            logger.info(self.yolov4)
            logger.info(
                "Train datasets number is : {}".format(len(self.train_dataset))
            )

            if self.fp_16:
                self.yolov4, self.optimizer = amp.initialize(
                    self.yolov4, self.optimizer, opt_level="O1", verbosity=0
                )
            logger.info("        =======  start  training   ======     ")
            for epoch in range(self.start_epoch, self.epochs):
                start = time.time()
                self.yolov4.train()

                mloss = torch.zeros(4)
                logger.info("===Epoch:[{}/{}]===".format(epoch, self.epochs))
                for i, (imgs, label_sbbox) in enumerate(self.train_dataloader):
                    # ... forward pass producing p and p_d omitted in this excerpt
                    loss, loss_ciou, loss_conf, loss_cls = self.criterion(
                        p, p_d, label_sbbox
                    )

                    loss.backward()
                    # Print batch results every 10 steps
                    if i % 10 == 0:
                        logger.info(
                            "  === Epoch:[{:3}/{}],step:[{:3}/{}],img_size:[{:3}],"
                            "total_loss:{:.4f}|loss_ciou:{:.4f}|loss_conf:{:.4f}|"
                            "loss_cls:{:.4f}|lr:{:.4f}".format(
                                epoch,
                                self.epochs,
                                i,
                                len(self.train_dataloader) - 1,
                                self.train_dataset.img_size,
                                mloss[3],
                                mloss[0],
                                mloss[1],
                                mloss[2],
                                self.optimizer.param_groups[0]["lr"],
                            )
                        )
                        # mirror the same metrics to TensorBoard
                        writer.add_scalar(
                            "loss_ciou",
                            mloss[0],
                            len(self.train_dataloader) * epoch + i,
                        )
                        writer.add_scalar(
                            "train_loss",
                            mloss[3],
                            len(self.train_dataloader) * epoch + i,
                        )

                # eval
                logger.info("===== Validate =====")
                logger.info("val img size is {}".format(cfg.VAL["TEST_IMG_SIZE"]))
                with torch.no_grad():
                    APs, inference_time = Evaluator(
                        self.yolov4, showatt=False
                    ).APs_voc()
                    mAP = 0.0
                    for i in APs:
                        logger.info("{} --> mAP : {}".format(i, APs[i]))
                        mAP += APs[i]
                    mAP = mAP / self.train_dataset.num_classes
                    logger.info("mAP : {}".format(mAP))
                    logger.info(
                        "inference time: {:.2f} ms".format(inference_time)
                    )
                    writer.add_scalar("mAP", mAP, epoch)
                    self.__save_model_weights(epoch, mAP)
                    logger.info("save weights done")
                logger.info("  ===test mAP:{:.3f}".format(mAP))


    if __name__ == "__main__":
        global logger, writer
        writer = SummaryWriter(logdir=opt.log_path + "/event")
        logger = Logger(
            log_file_name=opt.log_path + "/log.txt",
            log_level=logging.DEBUG,
            logger_name="YOLOv4",
        ).get_log()

        Trainer(
            weight_path=opt.weight_path,
            resume=opt.resume,
            gpu_id=opt.gpu_id,
            accumulate=opt.accumulate,
            fp_16=opt.fp_16,
        ).train()
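
A convenient side effect of registering the handlers on a named logger: any other module in the project can write to the same log file simply by fetching the logger by its name, without passing the object around. A small sketch, assuming the main script above has already constructed the Logger (the helper function below is made up for illustration):

    # in some other module of the project (hypothetical example)
    import logging

    # same name as passed to Logger(..., logger_name="YOLOv4") above,
    # so this returns the logger that already has the file/console handlers
    logger = logging.getLogger("YOLOv4")

    def log_eval_result(mAP):
        # written to the console and to opt.log_path + "/log.txt"
        logger.info("standalone eval, mAP = {:.3f}".format(mAP))

The TensorBoard event files produced by writer are separate from the text log and can be viewed by pointing tensorboard --logdir at the event directory.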

 
