mxnet系列教程之1-第一个例子
第一个例子当然是mnist的例子
假设已经成功安装了mxnet
例子的代码如下:
cd mxnet/example/image-classification python train_mnist.py这样就会运行下去
train_mnist.py的代码为
""" Train mnist, see more explanation at http://mxnet.io/tutorials/python/mnist.html """ import os import argparse import logging logging.basicConfig(level=logging.DEBUG) from common import find_mxnet, fit from common.util import download_file import mxnet as mx import numpy as np import gzip, struct def read_data(label, image): """ download and read data into numpy """ base_url = 'http://yann.lecun.com/exdb/mnist/' with gzip.open(download_file(base_url+label, os.path.join('data',label))) as flbl: magic, num = struct.unpack(">II", flbl.read(8)) label = np.fromstring(flbl.read(), dtype=np.int8) with gzip.open(download_file(base_url+image, os.path.join('data',image)), 'rb') as fimg: magic, num, rows, cols = struct.unpack(">IIII", fimg.read(16)) image = np.fromstring(fimg.read(), dtype=np.uint8).reshape(len(label), rows, cols) return (label, image) def to4d(img): """ reshape to 4D arrays """ return img.reshape(img.shape[0], 1, 28, 28).astype(np.float32)/255 def get_mnist_iter(args, kv): """ create data iterator with NDArrayIter """ (train_lbl, train_img) = read_data( 'train-labels-idx1-ubyte.gz', 'train-images-idx3-ubyte.gz') (val_lbl, val_img) = read_data( 't10k-labels-idx1-ubyte.gz', 't10k-images-idx3-ubyte.gz') train = mx.io.NDArrayIter( to4d(train_img), train_lbl, args.batch_size, shuffle=True) val = mx.io.NDArrayIter( to4d(val_img), val_lbl, args.batch_size) return (train, val) if __name__ == '__main__': # parse args parser = argparse.ArgumentParser(description="train mnist", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--num-classes', type=int, default=10, help='the number of classes') parser.add_argument('--num-examples', type=int, default=60000, help='the number of training examples') fit.add_fit_args(parser) parser.set_defaults( # network network = 'mlp', # train gpus = '0,1', batch_size = 64, disp_batches = 100, num_epochs = 20, lr = .05, lr_step_epochs = '10', model_predix = './my' ) args = parser.parse_args() # load network from importlib import import_module net = import_module('symbols.'+args.network) sym = net.get_symbol(**vars(args)) # train fit.fit(args, sym, get_mnist_iter)
net写在了symbol文件夹中,相当于caffe的prototxt文件
model_predix相当于caffe的保存模型前缀
打印的信息为Saved checkpoint to"./my-0001.params"
""" a simple multilayer perceptron """ import mxnet as mx def get_symbol(num_classes=10, **kwargs): data = mx.symbol.Variable('data') data = mx.sym.Flatten(data=data) fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128) act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu") fc2 = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64) act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu") fc3 = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=num_classes) mlp = mx.symbol.SoftmaxOutput(data = fc3, name = 'softmax') return mlp
fit.py里面写了运行的代码
import mxnet as mx import logging import os import time def _get_lr_scheduler(args, kv): if 'lr_factor' not in args or args.lr_factor >= 1: return (args.lr, None) epoch_size = args.num_examples / args.batch_size if 'dist' in args.kv_store: epoch_size /= kv.num_workers begin_epoch = args.load_epoch if args.load_epoch else 0 step_epochs = [int(l) for l in args.lr_step_epochs.split(',')] lr = args.lr for s in step_epochs: if begin_epoch >= s: lr *= args.lr_factor if lr != args.lr: logging.info('Adjust learning rate to %e for epoch %d' %(lr, begin_epoch)) steps = [epoch_size * (x-begin_epoch) for x in step_epochs if x-begin_epoch > 0] return (lr, mx.lr_scheduler.MultiFactorScheduler(step=steps, factor=args.lr_factor)) def _load_model(args, rank=0): if 'load_epoch' not in args or args.load_epoch is None: return (None, None, None) assert args.model_prefix is not None model_prefix = args.model_prefix if rank > 0 and os.path.exists("%s-%d-symbol.json" % (model_prefix, rank)): model_prefix += "-%d" % (rank) sym, arg_params, aux_params = mx.model.load_checkpoint( model_prefix, args.load_epoch) logging.info('Loaded model %s_%04d.params', model_prefix, args.load_epoch) return (sym, arg_params, aux_params) def _save_model(args, rank=0): if args.model_prefix is None: return None dst_dir = os.path.dirname(args.model_prefix) if not os.path.isdir(dst_dir): os.mkdir(dst_dir) return mx.callback.do_checkpoint(args.model_prefix if rank == 0 else "%s-%d" % ( args.model_prefix, rank)) def add_fit_args(parser): """ parser : argparse.ArgumentParser return a parser added with args required by fit """ train = parser.add_argument_group('Training', 'model training') train.add_argument('--network', type=str, help='the neural network to use') train.add_argument('--num-layers', type=int, help='number of layers in the neural network, required by some networks such as resnet') train.add_argument('--gpus', type=str, help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu') train.add_argument('--kv-store', type=str, default='device', help='key-value store type') train.add_argument('--num-epochs', type=int, default=100, help='max num of epochs') train.add_argument('--lr', type=float, default=0.1, help='initial learning rate') train.add_argument('--lr-factor', type=float, default=0.1, help='the ratio to reduce lr on each step') train.add_argument('--lr-step-epochs', type=str, help='the epochs to reduce the lr, e.g. 30,60') train.add_argument('--optimizer', type=str, default='sgd', help='the optimizer type') train.add_argument('--mom', type=float, default=0.9, help='momentum for sgd') train.add_argument('--wd', type=float, default=0.0001, help='weight decay for sgd') train.add_argument('--batch-size', type=int, default=128, help='the batch size') train.add_argument('--disp-batches', type=int, default=20, help='show progress for every n batches') train.add_argument('--model-prefix', type=str, help='model prefix') parser.add_argument('--monitor', dest='monitor', type=int, default=0, help='log network parameters every N iters if larger than 0') train.add_argument('--load-epoch', type=int, help='load the model on an epoch using the model-load-prefix') train.add_argument('--top-k', type=int, default=0, help='report the top-k accuracy. 0 means no report.') train.add_argument('--test-io', type=int, default=0, help='1 means test reading speed without training') return train def fit(args, network, data_loader, **kwargs): """ train a model args : argparse returns network : the symbol definition of the nerual network data_loader : function that returns the train and val data iterators """ # kvstore kv = mx.kvstore.create(args.kv_store) # logging head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s' logging.basicConfig(level=logging.DEBUG, format=head) logging.info('start with arguments %s', args) # data iterators (train, val) = data_loader(args, kv) if args.test_io: tic = time.time() for i, batch in enumerate(train): for j in batch.data: j.wait_to_read() if (i+1) % args.disp_batches == 0: logging.info('Batch [%d]\tSpeed: %.2f samples/sec' % ( i, args.disp_batches*args.batch_size/(time.time()-tic))) tic = time.time() return # load model if 'arg_params' in kwargs and 'aux_params' in kwargs: arg_params = kwargs['arg_params'] aux_params = kwargs['aux_params'] else: sym, arg_params, aux_params = _load_model(args, kv.rank) if sym is not None: assert sym.tojson() == network.tojson() # save model checkpoint = _save_model(args, kv.rank) # devices for training devs = mx.cpu() if args.gpus is None or args.gpus is '' else [ mx.gpu(int(i)) for i in args.gpus.split(',')] # learning rate lr, lr_scheduler = _get_lr_scheduler(args, kv) # create model model = mx.mod.Module( context = devs, symbol = network ) lr_scheduler = lr_scheduler optimizer_params = { 'learning_rate': lr, 'momentum' : args.mom, 'wd' : args.wd, 'lr_scheduler': lr_scheduler} monitor = mx.mon.Monitor(args.monitor, pattern=".*") if args.monitor > 0 else None initializer = mx.init.Xavier( rnd_type='gaussian', factor_type="in", magnitude=2) # initializer = mx.init.Xavier(factor_type="in", magnitude=2.34), # evaluation metrices eval_metrics = ['accuracy'] if args.top_k > 0: eval_metrics.append(mx.metric.create('top_k_accuracy', top_k=args.top_k)) # callbacks that run after each batch batch_end_callbacks = [mx.callback.Speedometer(args.batch_size, args.disp_batches)] if 'batch_end_callback' in kwargs: cbs = kwargs['batch_end_callback'] batch_end_callbacks += cbs if isinstance(cbs, list) else [cbs] # run model.fit(train, begin_epoch = args.load_epoch if args.load_epoch else 0, num_epoch = args.num_epochs, eval_data = val, eval_metric = eval_metrics, kvstore = kv, optimizer = args.optimizer, optimizer_params = optimizer_params, initializer = initializer, arg_params = arg_params, aux_params = aux_params, batch_end_callback = batch_end_callbacks, epoch_end_callback = checkpoint, allow_missing = True, monitor = monitor)