In [1]:
# --- Run configuration ---------------------------------------------------
# Volume dimensions (depth, height, width, channels) of the preprocessed
# lung scans. Smaller alternate resolutions are kept commented out for
# quick switching between experiment sizes.
#IMAGE_DIMS = (50,34,50,1)
IMAGE_DIMS = (224,152,224,1)
#IMAGE_DIMS = (112,76,112,1)
# Where checkpoints and the run log are written.
OUTPUT_DIR = '../../../output/kaggle-bowl/step10/'
# Input dir must match the resolution chosen in IMAGE_DIMS above.
#INPUT_DIR = '../../../input/step5-50/'
INPUT_DIR = '../../../input/step5-224/'
#INPUT_DIR = '../../../input/step5-112/'
# Suffix used when building the HDF5 dataset file names (see utils.dataset_path).
DATASET_NAME_SUFFIX = '-centered-rotated'
# Optional checkpoint to resume training from; None starts from scratch.
# NOTE(review): this constant is not referenced by the later visible cells —
# cell 4 passes model_file=None directly. Wire it through or remove it.
LOAD_MODEL_FILE = None
#LOAD_MODEL_FILE = OUTPUT_DIR + 'tf-checkpoint-best5556'

In [2]:
import csv
import h5py
import numpy as np # linear algebra
import os
import logging
import tflearn

from modules.logging import logger
import modules.logging
import modules.lungprepare as lungprepare
import modules.utils as utils
import modules.cnn as cnn
from modules.utils import Timer

In [3]:
def start_training(model, input_dir, dataset_name_suffix, image_dims, output_dir, batch_size=50, n_epoch=10):
    """Train a tflearn model on the 'train' split, validating on the 'validate' split.

    Parameters
    ----------
    model : tflearn.DNN
        Prepared model (see cnn.prepare_cnn_model).
    input_dir : str
        Directory containing the HDF5 dataset files.
    dataset_name_suffix : str
        Suffix appended to the split name when resolving dataset file names.
    image_dims : tuple
        Volume dimensions used to resolve the dataset file names
        (and embedded in the tflearn run_id).
    output_dir : str
        Directory for logs/checkpoints; created if missing (not recreated).
    batch_size : int
        Mini-batch size passed to model.fit.
    n_epoch : int
        Number of training epochs.
    """
    utils.mkdirs(output_dir, recreate=False)

    # Mirror logger output into a file inside the run's output directory.
    modules.logging.setup_file_logger(output_dir + 'out.log')

    # BUGFIX: this previously used the global IMAGE_DIMS instead of the
    # image_dims parameter, so callers passing different dims silently
    # resolved the wrong training dataset file.
    dataset_path = utils.dataset_path(input_dir, 'train' + dataset_name_suffix, image_dims)
    with h5py.File(dataset_path, 'r') as train_hdf5:
        # X/Y stay as HDF5 datasets (not loaded into RAM); tflearn reads
        # batches from them while the files remain open.
        X = train_hdf5['X']
        Y = train_hdf5['Y']
        logger.info('X shape ' + str(X.shape))
        logger.info('Y shape ' + str(Y.shape))

        dataset_path = utils.dataset_path(input_dir, 'validate' + dataset_name_suffix, image_dims)
        with h5py.File(dataset_path, 'r') as validate_hdf5:
            X_validate = validate_hdf5['X']
            Y_validate = validate_hdf5['Y']
            logger.info('X_validate shape ' + str(X_validate.shape))
            logger.info('Y_validate shape ' + str(Y_validate.shape))

            logger.info('Starting CNN training...')
            model.fit(X, Y, validation_set=(X_validate, Y_validate),
                      shuffle=True, batch_size=batch_size, n_epoch=n_epoch,
                      show_metric=True,
                      run_id='simplest1-' + str(image_dims))

In [4]:
logger.info('Prepare CNN for training')
# Build the network graph for the configured volume size.
network = cnn.net_simplest1(IMAGE_DIMS)
#network = cnn.net_deepmedic_simple(IMAGE_DIMS)
# network = cnn.net_alzheimer_cnn(IMAGE_DIMS)
# CONSISTENCY FIX: use the LOAD_MODEL_FILE constant from the config cell
# instead of a hard-coded None. Behavior is unchanged while the constant
# is None, but resuming from a checkpoint now only requires editing the
# config cell.
model = cnn.prepare_cnn_model(network, OUTPUT_DIR, model_file=LOAD_MODEL_FILE)


2017-03-09 01:22:08,942 INFO Prepare CNN for training
2017-03-09 01:22:09,028 INFO Prepare CNN
2017-03-09 01:22:09,029 INFO Preparing output dir
2017-03-09 01:22:09,029 INFO Initializing network...

In [5]:
logger.info('Train CNN')
# NOTE(review): the traceback captured below shows this run dies with a GPU
# ResourceExhaustedError (OOM) on the 224x152x224 volumes even at
# batch_size=8 — reduce IMAGE_DIMS (e.g. the 112 config) or the batch size
# to get a complete run. As saved, model.save never executes.
start_training(model, INPUT_DIR, DATASET_NAME_SUFFIX, IMAGE_DIMS, OUTPUT_DIR, batch_size=8, n_epoch=5)
model.save(OUTPUT_DIR + 'final')
logger.info('==== ALL DONE ====')


2017-03-09 01:22:16,743 INFO Train CNN
2017-03-09 01:22:16,746 INFO X shape (538, 224, 152, 224, 1)
2017-03-09 01:22:16,747 INFO Y shape (538, 2)
2017-03-09 01:22:16,748 INFO X_validate shape (90, 224, 152, 224, 1)
2017-03-09 01:22:16,749 INFO Y_validate shape (90, 2)
2017-03-09 01:22:16,750 INFO Starting CNN training...
---------------------------------
Run id: simplest1-(224, 152, 224, 1)
Log directory: ../../../output/kaggle-bowl/step10/tf-logs/
INFO:tensorflow:Summary name Accuracy/ (raw) is illegal; using Accuracy/__raw_ instead.
2017-03-09 01:22:16,929 INFO Summary name Accuracy/ (raw) is illegal; using Accuracy/__raw_ instead.
---------------------------------
Training samples: 538
Validation samples: 90
--
---------------------------------------------------------------------------
ResourceExhaustedError                    Traceback (most recent call last)
/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
   1021     try:
-> 1022       return fn(*args)
   1023     except errors.OpError as e:

/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
   1003                                  feed_dict, fetch_list, target_list,
-> 1004                                  status, run_metadata)
   1005 

/usr/lib/python3.5/contextlib.py in __exit__(self, type, value, traceback)
     65             try:
---> 66                 next(self.gen)
     67             except StopIteration:

/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/errors_impl.py in raise_exception_on_not_ok_status()
    468           compat.as_text(pywrap_tensorflow.TF_Message(status)),
--> 469           pywrap_tensorflow.TF_GetCode(status))
    470   finally:

ResourceExhaustedError: OOM when allocating tensor with shape[8,8,224,152,224]
	 [[Node: Adam/gradients/MaxPool3D/MaxPool3D_grad/MaxPool3DGrad = MaxPool3DGrad[T=DT_FLOAT, ksize=[1, 2, 2, 2, 1], padding="SAME", strides=[1, 2, 2, 2, 1], _device="/job:localhost/replica:0/task:0/gpu:0"](Conv3D/Relu, MaxPool3D/MaxPool3D, Adam/gradients/Conv3D_1/Conv3D_grad/Conv3DBackpropInputV2)]]

During handling of the above exception, another exception occurred:

ResourceExhaustedError                    Traceback (most recent call last)
<ipython-input-5-52c510550d23> in <module>()
      1 logger.info('Train CNN')
----> 2 start_training(model, INPUT_DIR, DATASET_NAME_SUFFIX, IMAGE_DIMS, OUTPUT_DIR, batch_size=8, n_epoch=5)
      3 model.save(OUTPUT_DIR + 'final')
      4 logger.info('==== ALL DONE ====')

<ipython-input-3-a58c4e3462d4> in start_training(model, input_dir, dataset_name_suffix, image_dims, output_dir, batch_size, n_epoch)
     23                       shuffle=True, batch_size=batch_size, n_epoch=n_epoch,
     24                       show_metric=True,
---> 25                       run_id='simplest1-'+str(image_dims))

/usr/local/lib/python3.5/dist-packages/tflearn/models/dnn.py in fit(self, X_inputs, Y_targets, n_epoch, validation_set, show_metric, batch_size, shuffle, snapshot_epoch, snapshot_step, excl_trainops, validation_batch_size, run_id, callbacks)
    213                          excl_trainops=excl_trainops,
    214                          run_id=run_id,
--> 215                          callbacks=callbacks)
    216 
    217     def predict(self, X):

/usr/local/lib/python3.5/dist-packages/tflearn/helpers/trainer.py in fit(self, feed_dicts, n_epoch, val_feed_dicts, show_metric, snapshot_step, snapshot_epoch, shuffle_all, dprep_dict, daug_dict, excl_trainops, run_id, callbacks)
    331                                                        (bool(self.best_checkpoint_path) | snapshot_epoch),
    332                                                        snapshot_step,
--> 333                                                        show_metric)
    334 
    335                             # Update training state

/usr/local/lib/python3.5/dist-packages/tflearn/helpers/trainer.py in _train(self, training_step, snapshot_epoch, snapshot_step, show_metric)
    772         tflearn.is_training(True, session=self.session)
    773         _, train_summ_str = self.session.run([self.train, self.summ_op],
--> 774                                              feed_batch)
    775 
    776         # Retrieve loss value from summary string

/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
    765     try:
    766       result = self._run(None, fetches, feed_dict, options_ptr,
--> 767                          run_metadata_ptr)
    768       if run_metadata:
    769         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
    963     if final_fetches or final_targets:
    964       results = self._do_run(handle, final_targets, final_fetches,
--> 965                              feed_dict_string, options, run_metadata)
    966     else:
    967       results = []

/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
   1013     if handle is None:
   1014       return self._do_call(_run_fn, self._session, feed_dict, fetch_list,
-> 1015                            target_list, options, run_metadata)
   1016     else:
   1017       return self._do_call(_prun_fn, self._session, handle, feed_dict,

/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
   1033         except KeyError:
   1034           pass
-> 1035       raise type(e)(node_def, op, message)
   1036 
   1037   def _extend_graph(self):

ResourceExhaustedError: OOM when allocating tensor with shape[8,8,224,152,224]
	 [[Node: Adam/gradients/MaxPool3D/MaxPool3D_grad/MaxPool3DGrad = MaxPool3DGrad[T=DT_FLOAT, ksize=[1, 2, 2, 2, 1], padding="SAME", strides=[1, 2, 2, 2, 1], _device="/job:localhost/replica:0/task:0/gpu:0"](Conv3D/Relu, MaxPool3D/MaxPool3D, Adam/gradients/Conv3D_1/Conv3D_grad/Conv3DBackpropInputV2)]]

Caused by op 'Adam/gradients/MaxPool3D/MaxPool3D_grad/MaxPool3DGrad', defined at:
  File "/usr/lib/python3.5/runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.5/dist-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelapp.py", line 474, in start
    ioloop.IOLoop.instance().start()
  File "/usr/local/lib/python3.5/dist-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/usr/local/lib/python3.5/dist-packages/tornado/ioloop.py", line 887, in start
    handler_func(fd_obj, events)
  File "/usr/local/lib/python3.5/dist-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/usr/local/lib/python3.5/dist-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/usr/local/lib/python3.5/dist-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 390, in execute_request
    user_expressions, allow_stdin)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/zmqshell.py", line 501, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-4-df5ece654981>", line 5, in <module>
    model = cnn.prepare_cnn_model(network, OUTPUT_DIR, model_file=None)
  File "/notebooks/datascience-snippets/ipython-notebooks/kaggle-lung-cancer-detection/modules/cnn.py", line 164, in prepare_cnn_model
    best_checkpoint_path=dir_checkpoint_best)
  File "/usr/local/lib/python3.5/dist-packages/tflearn/models/dnn.py", line 64, in __init__
    best_val_accuracy=best_val_accuracy)
  File "/usr/local/lib/python3.5/dist-packages/tflearn/helpers/trainer.py", line 131, in __init__
    clip_gradients)
  File "/usr/local/lib/python3.5/dist-packages/tflearn/helpers/trainer.py", line 655, in initialize_training_ops
    self.grad = tf.gradients(total_loss, self.train_vars)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gradients_impl.py", line 482, in gradients
    in_grads = grad_fn(op, *out_grads)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/nn_grad.py", line 130, in _MaxPool3DGrad
    padding=op.get_attr("padding"))
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gen_nn_ops.py", line 1657, in max_pool3d_grad
    name=name)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py", line 763, in apply_op
    op_def=op_def)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 2395, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 1264, in __init__
    self._traceback = _extract_stack()

...which was originally created as op 'MaxPool3D/MaxPool3D', defined at:
  File "/usr/lib/python3.5/runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
[elided 18 identical lines from previous traceback]
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-4-df5ece654981>", line 4, in <module>
    network = cnn.net_alzheimer_cnn(IMAGE_DIMS)
  File "/notebooks/datascience-snippets/ipython-notebooks/kaggle-lung-cancer-detection/modules/cnn.py", line 73, in net_alzheimer_cnn
    net = layers.conv.max_pool_3d(net, [1,2,2,2,1], strides=[1,2,2,2,1])
  File "/usr/local/lib/python3.5/dist-packages/tflearn/layers/conv.py", line 959, in max_pool_3d
    inference = tf.nn.max_pool3d(incoming, kernel, strides, padding)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gen_nn_ops.py", line 1625, in max_pool3d
    strides=strides, padding=padding, name=name)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py", line 763, in apply_op
    op_def=op_def)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 2395, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 1264, in __init__
    self._traceback = _extract_stack()

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[8,8,224,152,224]
	 [[Node: Adam/gradients/MaxPool3D/MaxPool3D_grad/MaxPool3DGrad = MaxPool3DGrad[T=DT_FLOAT, ksize=[1, 2, 2, 2, 1], padding="SAME", strides=[1, 2, 2, 2, 1], _device="/job:localhost/replica:0/task:0/gpu:0"](Conv3D/Relu, MaxPool3D/MaxPool3D, Adam/gradients/Conv3D_1/Conv3D_grad/Conv3DBackpropInputV2)]]

In [10]:
logger.info('Evaluate model from dataset')
# Resolve the 'test' split HDF5 file for the configured resolution and
# score the trained model on it.
dataset_path = utils.dataset_path(INPUT_DIR, 'test' + DATASET_NAME_SUFFIX, IMAGE_DIMS)
# NOTE(review): the printed output below reports 112-dim shapes and an
# earlier timestamp than the training cell, so it is stale — it came from a
# previous kernel session using the 112 config. This cell will not
# reproduce under Restart & Run All with the current 224 config (training
# above OOMs, so `model` is untrained).
cnn.evaluate_dataset(dataset_path, model)


2017-03-08 22:12:24,576 INFO Evaluate model from dataset
2017-03-08 22:12:24,580 DEBUG X_test shape (764, 112, 76, 112, 1)
2017-03-08 22:12:24,581 DEBUG Y_test shape (764, 2)
2017-03-08 22:12:24,582 INFO Evaluate performance on dataset ../../../input/step5-112/test-centered-rotated-112-76-112.h5...
2017-03-08 22:12:57,432 INFO Accuracy: [0.058900524574424587]

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: