Train a CIFAR-10 CNN model using MXNet's "Module" interface

In [1]:
import mxnet as mx
import train_cifar10

In [2]:
# Set up the hyper-parameters
args = train_cifar10.command_line_args(defaults=True)
args.gpus = "0"
#args.network = "lenet"  # Fast, not very accurate
#args.network = "inception-bn-28-small"  # Much more accurate & slow

In [3]:
# Configure live charts that update while training
from mxnet.notebook.callback import LiveLearningCurve
cb_args = LiveLearningCurve('accuracy', 5).callback_args()


Loading BokehJS ...
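
LiveLearningCurve renders the learning curve in the notebook with Bokeh, which is why the cell prints "Loading BokehJS ...". The dict it returns from callback_args() is presumably passed straight through to Module.fit as keyword arguments. Outside a notebook, an equivalent dict could be built from the stock console callbacks instead (a sketch, not what LiveLearningCurve actually returns):

# Plain-console alternative, assuming do_train forwards these kwargs to
# Module.fit (which accepts batch_end_callback and epoch_end_callback).
console_cb_args = {
    'batch_end_callback': mx.callback.Speedometer(args.batch_size, frequent=50),
    'epoch_end_callback': mx.callback.do_checkpoint('cifar10-cnn'),  # hypothetical checkpoint prefix
}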

In [4]:
# Start training
train_cifar10.do_train(args, callback_args=cb_args)


2016-11-04 18:05:30,714 Node[0] start with arguments Namespace(batch_size=128, data_dir='/efs/datasets/users/leodirac/code/workplace/leodirac/mxnet/example/image-classification/cifar10/', gpus='0', kv_store='local', load_epoch=None, lr=0.05, lr_factor=1, lr_factor_epoch=1, model_prefix=None, network='inception-bn-28-small', num_epochs=20, num_examples=60000, save_model_prefix=None)
2016-11-04 18:05:30,715 Node[0] running on ip-172-31-59-245
2016-11-04 18:05:32,172 Node[0] Starting with devices [gpu(0)]
2016-11-04 18:05:32,175 Node[0] start training for 20 epochs...
2016-11-04 18:07:11,468 Node[0] Epoch[0] Train-accuracy=0.566211
2016-11-04 18:07:11,469 Node[0] Epoch[0] Train-top_k_accuracy_5=0.948242
2016-11-04 18:07:11,470 Node[0] Epoch[0] Train-top_k_accuracy_10=1.000000
2016-11-04 18:07:11,471 Node[0] Epoch[0] Train-top_k_accuracy_20=1.000000
2016-11-04 18:07:11,471 Node[0] Epoch[0] Time cost=98.542
2016-11-04 18:07:17,454 Node[0] Epoch[0] Validation-accuracy=nan
2016-11-04 18:07:17,455 Node[0] Epoch[0] Validation-top_k_accuracy_5=nan
2016-11-04 18:07:17,456 Node[0] Epoch[0] Validation-top_k_accuracy_10=nan
2016-11-04 18:07:17,457 Node[0] Epoch[0] Validation-top_k_accuracy_20=nan
2016-11-04 18:08:57,069 Node[0] Epoch[1] Train-accuracy=0.679883
2016-11-04 18:08:57,069 Node[0] Epoch[1] Train-top_k_accuracy_5=0.973047
2016-11-04 18:08:57,071 Node[0] Epoch[1] Train-top_k_accuracy_10=1.000000
2016-11-04 18:08:57,072 Node[0] Epoch[1] Train-top_k_accuracy_20=1.000000
2016-11-04 18:08:57,072 Node[0] Epoch[1] Time cost=99.614
2016-11-04 18:09:02,598 Node[0] Epoch[1] Validation-accuracy=nan
2016-11-04 18:09:02,599 Node[0] Epoch[1] Validation-top_k_accuracy_5=nan
2016-11-04 18:09:02,599 Node[0] Epoch[1] Validation-top_k_accuracy_10=nan
2016-11-04 18:09:02,600 Node[0] Epoch[1] Validation-top_k_accuracy_20=nan
2016-11-04 18:10:42,219 Node[0] Epoch[2] Train-accuracy=0.742388
2016-11-04 18:10:42,221 Node[0] Epoch[2] Train-top_k_accuracy_5=0.981170
2016-11-04 18:10:42,222 Node[0] Epoch[2] Train-top_k_accuracy_10=1.000000
2016-11-04 18:10:42,223 Node[0] Epoch[2] Train-top_k_accuracy_20=1.000000
2016-11-04 18:10:42,223 Node[0] Epoch[2] Time cost=99.622
2016-11-04 18:10:47,755 Node[0] Epoch[2] Validation-accuracy=nan
2016-11-04 18:10:47,756 Node[0] Epoch[2] Validation-top_k_accuracy_5=nan
2016-11-04 18:10:47,757 Node[0] Epoch[2] Validation-top_k_accuracy_10=nan
2016-11-04 18:10:47,758 Node[0] Epoch[2] Validation-top_k_accuracy_20=nan
2016-11-04 18:12:27,771 Node[0] Epoch[3] Train-accuracy=0.775586
2016-11-04 18:12:27,772 Node[0] Epoch[3] Train-top_k_accuracy_5=0.987891
2016-11-04 18:12:27,773 Node[0] Epoch[3] Train-top_k_accuracy_10=1.000000
2016-11-04 18:12:27,774 Node[0] Epoch[3] Train-top_k_accuracy_20=1.000000
2016-11-04 18:12:27,775 Node[0] Epoch[3] Time cost=100.017
2016-11-04 18:12:33,304 Node[0] Epoch[3] Validation-accuracy=nan
2016-11-04 18:12:33,305 Node[0] Epoch[3] Validation-top_k_accuracy_5=nan
2016-11-04 18:12:33,306 Node[0] Epoch[3] Validation-top_k_accuracy_10=nan
2016-11-04 18:12:33,307 Node[0] Epoch[3] Validation-top_k_accuracy_20=nan
2016-11-04 18:14:13,252 Node[0] Epoch[4] Train-accuracy=0.793945
2016-11-04 18:14:13,253 Node[0] Epoch[4] Train-top_k_accuracy_5=0.991016
2016-11-04 18:14:13,253 Node[0] Epoch[4] Train-top_k_accuracy_10=1.000000
2016-11-04 18:14:13,254 Node[0] Epoch[4] Train-top_k_accuracy_20=1.000000
2016-11-04 18:14:13,255 Node[0] Epoch[4] Time cost=99.948
2016-11-04 18:14:18,787 Node[0] Epoch[4] Validation-accuracy=nan
2016-11-04 18:14:18,787 Node[0] Epoch[4] Validation-top_k_accuracy_5=nan
2016-11-04 18:14:18,788 Node[0] Epoch[4] Validation-top_k_accuracy_10=nan
2016-11-04 18:14:18,789 Node[0] Epoch[4] Validation-top_k_accuracy_20=nan
2016-11-04 18:15:58,508 Node[0] Epoch[5] Train-accuracy=0.819311
2016-11-04 18:15:58,509 Node[0] Epoch[5] Train-top_k_accuracy_5=0.991987
2016-11-04 18:15:58,510 Node[0] Epoch[5] Train-top_k_accuracy_10=1.000000
2016-11-04 18:15:58,511 Node[0] Epoch[5] Train-top_k_accuracy_20=1.000000
2016-11-04 18:15:58,511 Node[0] Epoch[5] Time cost=99.722
2016-11-04 18:16:04,043 Node[0] Epoch[5] Validation-accuracy=nan
2016-11-04 18:16:04,044 Node[0] Epoch[5] Validation-top_k_accuracy_5=nan
2016-11-04 18:16:04,045 Node[0] Epoch[5] Validation-top_k_accuracy_10=nan
2016-11-04 18:16:04,046 Node[0] Epoch[5] Validation-top_k_accuracy_20=nan
2016-11-04 18:17:43,899 Node[0] Epoch[6] Train-accuracy=0.829688
2016-11-04 18:17:43,900 Node[0] Epoch[6] Train-top_k_accuracy_5=0.993750
2016-11-04 18:17:43,901 Node[0] Epoch[6] Train-top_k_accuracy_10=1.000000
2016-11-04 18:17:43,902 Node[0] Epoch[6] Train-top_k_accuracy_20=1.000000
2016-11-04 18:17:43,903 Node[0] Epoch[6] Time cost=99.856
2016-11-04 18:17:49,441 Node[0] Epoch[6] Validation-accuracy=nan
2016-11-04 18:17:49,441 Node[0] Epoch[6] Validation-top_k_accuracy_5=nan
2016-11-04 18:17:49,442 Node[0] Epoch[6] Validation-top_k_accuracy_10=nan
2016-11-04 18:17:49,443 Node[0] Epoch[6] Validation-top_k_accuracy_20=nan
2016-11-04 18:19:29,163 Node[0] Epoch[7] Train-accuracy=0.844151
2016-11-04 18:19:29,164 Node[0] Epoch[7] Train-top_k_accuracy_5=0.994992
2016-11-04 18:19:29,165 Node[0] Epoch[7] Train-top_k_accuracy_10=1.000000
2016-11-04 18:19:29,166 Node[0] Epoch[7] Train-top_k_accuracy_20=1.000000
2016-11-04 18:19:29,167 Node[0] Epoch[7] Time cost=99.723
2016-11-04 18:19:34,688 Node[0] Epoch[7] Validation-accuracy=nan
2016-11-04 18:19:34,689 Node[0] Epoch[7] Validation-top_k_accuracy_5=nan
2016-11-04 18:19:34,689 Node[0] Epoch[7] Validation-top_k_accuracy_10=nan
2016-11-04 18:19:34,690 Node[0] Epoch[7] Validation-top_k_accuracy_20=nan
2016-11-04 18:21:15,117 Node[0] Epoch[8] Train-accuracy=0.862695
2016-11-04 18:21:15,118 Node[0] Epoch[8] Train-top_k_accuracy_5=0.995703
2016-11-04 18:21:15,118 Node[0] Epoch[8] Train-top_k_accuracy_10=1.000000
2016-11-04 18:21:15,120 Node[0] Epoch[8] Train-top_k_accuracy_20=1.000000
2016-11-04 18:21:15,121 Node[0] Epoch[8] Time cost=100.430
2016-11-04 18:21:21,087 Node[0] Epoch[8] Validation-accuracy=nan
2016-11-04 18:21:21,088 Node[0] Epoch[8] Validation-top_k_accuracy_5=nan
2016-11-04 18:21:21,088 Node[0] Epoch[8] Validation-top_k_accuracy_10=nan
2016-11-04 18:21:21,089 Node[0] Epoch[8] Validation-top_k_accuracy_20=nan
2016-11-04 18:23:00,978 Node[0] Epoch[9] Train-accuracy=0.872070
2016-11-04 18:23:00,979 Node[0] Epoch[9] Train-top_k_accuracy_5=0.994141
2016-11-04 18:23:00,981 Node[0] Epoch[9] Train-top_k_accuracy_10=1.000000
2016-11-04 18:23:00,982 Node[0] Epoch[9] Train-top_k_accuracy_20=1.000000
2016-11-04 18:23:00,983 Node[0] Epoch[9] Time cost=99.893
2016-11-04 18:23:06,510 Node[0] Epoch[9] Validation-accuracy=nan
2016-11-04 18:23:06,511 Node[0] Epoch[9] Validation-top_k_accuracy_5=nan
2016-11-04 18:23:06,511 Node[0] Epoch[9] Validation-top_k_accuracy_10=nan
2016-11-04 18:23:06,512 Node[0] Epoch[9] Validation-top_k_accuracy_20=nan
2016-11-04 18:24:46,159 Node[0] Epoch[10] Train-accuracy=0.878606
2016-11-04 18:24:46,160 Node[0] Epoch[10] Train-top_k_accuracy_5=0.997196
2016-11-04 18:24:46,161 Node[0] Epoch[10] Train-top_k_accuracy_10=1.000000
2016-11-04 18:24:46,162 Node[0] Epoch[10] Train-top_k_accuracy_20=1.000000
2016-11-04 18:24:46,163 Node[0] Epoch[10] Time cost=99.646
2016-11-04 18:24:51,684 Node[0] Epoch[10] Validation-accuracy=nan
2016-11-04 18:24:51,685 Node[0] Epoch[10] Validation-top_k_accuracy_5=nan
2016-11-04 18:24:51,685 Node[0] Epoch[10] Validation-top_k_accuracy_10=nan
2016-11-04 18:24:51,686 Node[0] Epoch[10] Validation-top_k_accuracy_20=nan
2016-11-04 18:26:31,614 Node[0] Epoch[11] Train-accuracy=0.889062
2016-11-04 18:26:31,615 Node[0] Epoch[11] Train-top_k_accuracy_5=0.996094
2016-11-04 18:26:31,616 Node[0] Epoch[11] Train-top_k_accuracy_10=1.000000
2016-11-04 18:26:31,617 Node[0] Epoch[11] Train-top_k_accuracy_20=1.000000
2016-11-04 18:26:31,618 Node[0] Epoch[11] Time cost=99.931
2016-11-04 18:26:37,150 Node[0] Epoch[11] Validation-accuracy=nan
2016-11-04 18:26:37,151 Node[0] Epoch[11] Validation-top_k_accuracy_5=nan
2016-11-04 18:26:37,152 Node[0] Epoch[11] Validation-top_k_accuracy_10=nan
2016-11-04 18:26:37,152 Node[0] Epoch[11] Validation-top_k_accuracy_20=nan
2016-11-04 18:28:17,021 Node[0] Epoch[12] Train-accuracy=0.895117
2016-11-04 18:28:17,023 Node[0] Epoch[12] Train-top_k_accuracy_5=0.997266
2016-11-04 18:28:17,024 Node[0] Epoch[12] Train-top_k_accuracy_10=1.000000
2016-11-04 18:28:17,025 Node[0] Epoch[12] Train-top_k_accuracy_20=1.000000
2016-11-04 18:28:17,026 Node[0] Epoch[12] Time cost=99.873
2016-11-04 18:28:22,553 Node[0] Epoch[12] Validation-accuracy=nan
2016-11-04 18:28:22,554 Node[0] Epoch[12] Validation-top_k_accuracy_5=nan
2016-11-04 18:28:22,555 Node[0] Epoch[12] Validation-top_k_accuracy_10=nan
2016-11-04 18:28:22,556 Node[0] Epoch[12] Validation-top_k_accuracy_20=nan
2016-11-04 18:30:02,235 Node[0] Epoch[13] Train-accuracy=0.904447
2016-11-04 18:30:02,236 Node[0] Epoch[13] Train-top_k_accuracy_5=0.997997
2016-11-04 18:30:02,237 Node[0] Epoch[13] Train-top_k_accuracy_10=1.000000
2016-11-04 18:30:02,238 Node[0] Epoch[13] Train-top_k_accuracy_20=1.000000
2016-11-04 18:30:02,239 Node[0] Epoch[13] Time cost=99.683
2016-11-04 18:30:07,772 Node[0] Epoch[13] Validation-accuracy=nan
2016-11-04 18:30:07,773 Node[0] Epoch[13] Validation-top_k_accuracy_5=nan
2016-11-04 18:30:07,774 Node[0] Epoch[13] Validation-top_k_accuracy_10=nan
2016-11-04 18:30:07,774 Node[0] Epoch[13] Validation-top_k_accuracy_20=nan
2016-11-04 18:31:47,692 Node[0] Epoch[14] Train-accuracy=0.902734
2016-11-04 18:31:47,693 Node[0] Epoch[14] Train-top_k_accuracy_5=0.998633
2016-11-04 18:31:47,694 Node[0] Epoch[14] Train-top_k_accuracy_10=1.000000
2016-11-04 18:31:47,695 Node[0] Epoch[14] Train-top_k_accuracy_20=1.000000
2016-11-04 18:31:47,696 Node[0] Epoch[14] Time cost=99.921
2016-11-04 18:31:53,226 Node[0] Epoch[14] Validation-accuracy=nan
2016-11-04 18:31:53,227 Node[0] Epoch[14] Validation-top_k_accuracy_5=nan
2016-11-04 18:31:53,228 Node[0] Epoch[14] Validation-top_k_accuracy_10=nan
2016-11-04 18:31:53,228 Node[0] Epoch[14] Validation-top_k_accuracy_20=nan
2016-11-04 18:33:32,868 Node[0] Epoch[15] Train-accuracy=0.919471
2016-11-04 18:33:32,869 Node[0] Epoch[15] Train-top_k_accuracy_5=0.998798
2016-11-04 18:33:32,870 Node[0] Epoch[15] Train-top_k_accuracy_10=1.000000
2016-11-04 18:33:32,871 Node[0] Epoch[15] Train-top_k_accuracy_20=1.000000
2016-11-04 18:33:32,872 Node[0] Epoch[15] Time cost=99.643
2016-11-04 18:33:38,396 Node[0] Epoch[15] Validation-accuracy=nan
2016-11-04 18:33:38,397 Node[0] Epoch[15] Validation-top_k_accuracy_5=nan
2016-11-04 18:33:38,398 Node[0] Epoch[15] Validation-top_k_accuracy_10=nan
2016-11-04 18:33:38,398 Node[0] Epoch[15] Validation-top_k_accuracy_20=nan
2016-11-04 18:35:18,826 Node[0] Epoch[16] Train-accuracy=0.915234
2016-11-04 18:35:18,827 Node[0] Epoch[16] Train-top_k_accuracy_5=0.997656
2016-11-04 18:35:18,828 Node[0] Epoch[16] Train-top_k_accuracy_10=1.000000
2016-11-04 18:35:18,829 Node[0] Epoch[16] Train-top_k_accuracy_20=1.000000
2016-11-04 18:35:18,830 Node[0] Epoch[16] Time cost=100.431
2016-11-04 18:35:24,759 Node[0] Epoch[16] Validation-accuracy=nan
2016-11-04 18:35:24,759 Node[0] Epoch[16] Validation-top_k_accuracy_5=nan
2016-11-04 18:35:24,760 Node[0] Epoch[16] Validation-top_k_accuracy_10=nan
2016-11-04 18:35:24,761 Node[0] Epoch[16] Validation-top_k_accuracy_20=nan
2016-11-04 18:37:04,619 Node[0] Epoch[17] Train-accuracy=0.918555
2016-11-04 18:37:04,620 Node[0] Epoch[17] Train-top_k_accuracy_5=0.998437
2016-11-04 18:37:04,621 Node[0] Epoch[17] Train-top_k_accuracy_10=1.000000
2016-11-04 18:37:04,621 Node[0] Epoch[17] Train-top_k_accuracy_20=1.000000
2016-11-04 18:37:04,622 Node[0] Epoch[17] Time cost=99.861
2016-11-04 18:37:10,154 Node[0] Epoch[17] Validation-accuracy=nan
2016-11-04 18:37:10,155 Node[0] Epoch[17] Validation-top_k_accuracy_5=nan
2016-11-04 18:37:10,156 Node[0] Epoch[17] Validation-top_k_accuracy_10=nan
2016-11-04 18:37:10,156 Node[0] Epoch[17] Validation-top_k_accuracy_20=nan
2016-11-04 18:38:49,811 Node[0] Epoch[18] Train-accuracy=0.924479
2016-11-04 18:38:49,812 Node[0] Epoch[18] Train-top_k_accuracy_5=0.998598
2016-11-04 18:38:49,813 Node[0] Epoch[18] Train-top_k_accuracy_10=1.000000
2016-11-04 18:38:49,814 Node[0] Epoch[18] Train-top_k_accuracy_20=1.000000
2016-11-04 18:38:49,814 Node[0] Epoch[18] Time cost=99.658
2016-11-04 18:38:55,339 Node[0] Epoch[18] Validation-accuracy=nan
2016-11-04 18:38:55,340 Node[0] Epoch[18] Validation-top_k_accuracy_5=nan
2016-11-04 18:38:55,340 Node[0] Epoch[18] Validation-top_k_accuracy_10=nan
2016-11-04 18:38:55,341 Node[0] Epoch[18] Validation-top_k_accuracy_20=nan
2016-11-04 18:40:35,220 Node[0] Epoch[19] Train-accuracy=0.927148
2016-11-04 18:40:35,222 Node[0] Epoch[19] Train-top_k_accuracy_5=0.998828
2016-11-04 18:40:35,222 Node[0] Epoch[19] Train-top_k_accuracy_10=1.000000
2016-11-04 18:40:35,223 Node[0] Epoch[19] Train-top_k_accuracy_20=1.000000
2016-11-04 18:40:35,224 Node[0] Epoch[19] Time cost=99.880
2016-11-04 18:40:40,751 Node[0] Epoch[19] Validation-accuracy=nan
2016-11-04 18:40:40,752 Node[0] Epoch[19] Validation-top_k_accuracy_5=nan
2016-11-04 18:40:40,753 Node[0] Epoch[19] Validation-top_k_accuracy_10=nan
2016-11-04 18:40:40,753 Node[0] Epoch[19] Validation-top_k_accuracy_20=nan
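
A couple of notes on the log above. Every Validation-* metric is reported as nan, which usually means the evaluation iterator yielded no samples, so it is worth checking that the validation .rec file actually exists under args.data_dir. Also, do_train is wiring the parsed arguments into MXNet's Module interface; the following is a rough sketch of what that amounts to (an assumption about the script's internals, continuing from the cells above; the toy symbol, the .rec file names, and the 28x28 data shape are illustrative, not taken from train_cifar10):

def build_toy_net(num_classes=10):
    # train_cifar10 picks the real symbol (lenet / inception-bn-28-small) from
    # args.network; this tiny LeNet-style net is just a stand-in.
    data = mx.sym.Variable('data')
    net = mx.sym.Convolution(data=data, kernel=(5, 5), num_filter=20)
    net = mx.sym.Activation(data=net, act_type='relu')
    net = mx.sym.Pooling(data=net, pool_type='max', kernel=(2, 2), stride=(2, 2))
    net = mx.sym.FullyConnected(data=mx.sym.Flatten(data=net), num_hidden=num_classes)
    return mx.sym.SoftmaxOutput(data=net, name='softmax')

train_iter = mx.io.ImageRecordIter(
    path_imgrec=args.data_dir + 'cifar10_train.rec',  # assumed file name
    data_shape=(3, 28, 28),                           # 28x28 crops, matching the "-28-" network
    batch_size=args.batch_size,
    rand_crop=True, rand_mirror=True)
val_iter = mx.io.ImageRecordIter(
    path_imgrec=args.data_dir + 'cifar10_val.rec',    # assumed file name
    data_shape=(3, 28, 28),
    batch_size=args.batch_size)

devices = [mx.gpu(int(i)) for i in args.gpus.split(',')] if args.gpus else [mx.cpu()]
mod = mx.mod.Module(symbol=build_toy_net(), context=devices)
mod.fit(train_iter,
        eval_data=val_iter,
        optimizer='sgd',
        optimizer_params={'learning_rate': args.lr},
        eval_metric='accuracy',
        num_epoch=args.num_epochs,
        **cb_args)  # assumes callback_args() yields kwargs Module.fit accepts

The real script also handles kv_store, learning-rate decay (lr_factor / lr_factor_epoch), resuming from load_epoch, and checkpointing via model_prefix, all of which this sketch omits.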
