In [6]:
# --- Locations -------------------------------------------------------------
# Per-shard results are read from numbered sub-folders of INPUT_FOLDER
# (INPUT_FOLDER + '<shard_id>/'); the merged dataset goes to OUTPUT_FOLDER.
INPUT_FOLDER = '../../../output/kaggle-bowl/step3/'
OUTPUT_FOLDER = '../../../output/kaggle-bowl/step4/'

# --- Dataset description ---------------------------------------------------
# Name handed to utils.dataset_path() to locate each dataset file.
DATASET_NAME = 'data-centered-rotated'

# Volume dimensions in (depth, height, width, channels) order.
IMAGE_DIMS = (312, 212, 312, 1)

# --- Sharding --------------------------------------------------------------
# Shard ids run from 1 to NR_SHARDS inclusive.
NR_SHARDS = 700

In [7]:
import sys
import h5py
from random import shuffle
import numpy as np
from numpy import ndarray
import datetime
import logging

from modules.logging import logger
import modules.logging
import modules.lungprepare as lungprepare
import modules.utils as utils
from modules.utils import Timer
import modules.logging

In [8]:
def start_processing(input_dir, nr_shards, image_dims, output_dir):
    """Merge the per-shard HDF5 datasets under ``input_dir`` into a single dataset.

    The work is done in two passes over shard ids ``1..nr_shards``:

    1. Count pass: open every shard file, add the length of its ``X`` dataset
       to the running total and validate the shard with
       ``utils.validate_dataset``. Shards whose ``X`` dataset cannot be read
       are logged, remembered as unusable and skipped (not validated).
    2. Merge pass: create the output file with an ``X`` dataset of shape
       ``(total_patients,) + image_dims`` (one chunk per patient) and a ``Y``
       dataset of shape ``(total_patients, 2)``, then copy every usable
       shard's rows into consecutive positions.

    Finally the merged dataset is validated, with ``save_dir`` pointing at
    ``output_dir + 'images'``.

    Args:
        input_dir: Folder containing one sub-folder per shard, named after the
            shard id. Must end with a path separator because paths are built
            by string concatenation.
        nr_shards: Number of shards; ids ``1..nr_shards`` inclusive are read.
        image_dims: Sequence of four sizes used as the trailing dimensions of
            the ``X`` dataset and passed on to the ``utils`` helpers.
        output_dir: Destination folder (must end with a path separator). It is
            passed to ``utils.mkdirs(..., recreate=True)`` -- presumably this
            wipes existing content; confirm against ``modules.utils``.

    Raises:
        Exception: with message ``'Validation ERROR!'`` when
            ``utils.validate_dataset`` returns a falsy value for a usable
            shard or for the merged output.
    """
    logger.info('Merging shard results. nr_shards=' + str(nr_shards) + ' input_dir='+ str(input_dir) + ' output_dir=' + output_dir)

    t = Timer('Preparing output dir')
    utils.mkdirs(output_dir, dirs=['images'], recreate=True)
    # The original never stopped this timer, so its elapsed time was lost.
    t.stop()

    modules.logging.setup_file_logger(output_dir + 'out.log')

    dataset_name = DATASET_NAME
    shard_ids = range(1, nr_shards + 1)

    t = Timer('Count total patients among shards')
    total_patients = 0
    # A set, because it is only used for membership tests in the merge pass.
    unusable_shards = set()
    for shard_id in shard_ids:
        dataset_dir = input_dir + str(shard_id) + '/'
        dataset_file = utils.dataset_path(dataset_dir, dataset_name, image_dims)
        with h5py.File(dataset_file, 'r') as h5f:
            try:
                logger.info('shard_id={} shape={}'.format(shard_id,h5f['X'].shape))
                total_patients = total_patients + len(h5f['X'])
            except Exception as e:
                # Catch Exception rather than everything so that Ctrl-C /
                # SystemExit still abort the run, and keep the reason around
                # for diagnosis instead of discarding it.
                logger.warning('no data on shard ' + str(shard_id))
                logger.debug('shard ' + str(shard_id) + ' could not be read: ' + repr(e))
                unusable_shards.add(shard_id)
                # Skip validation: it would fail on the same missing dataset.
                continue
        if(not utils.validate_dataset(dataset_dir, dataset_name, image_dims)):
            raise Exception('Validation ERROR!')
    t.stop()

    logger.info('total_patients=' + str(total_patients))

    t = Timer('Creating output merged dataset')
    output_dataset_file = utils.dataset_path(output_dir, dataset_name, image_dims)
    with h5py.File(output_dataset_file, 'w') as h5f:
        # One chunk per patient volume.
        x_ds = h5f.create_dataset('X', (total_patients, image_dims[0], image_dims[1], image_dims[2], image_dims[3]), chunks=(1, image_dims[0], image_dims[1], image_dims[2], image_dims[3]), dtype='f')
        y_ds = h5f.create_dataset('Y', (total_patients, 2), dtype='f')

        logger.info('Merging shards')
        # pb / pe: begin / end row of the current shard inside the output.
        pb = 0
        for shard_id in shard_ids:
            if(shard_id in unusable_shards):
                logger.warning('skipping unusable shard ' + str(shard_id))
                continue
            ts = Timer('Processing shard' + str(shard_id))
            dataset_file = utils.dataset_path(input_dir + str(shard_id) + '/', dataset_name, image_dims)
            with h5py.File(dataset_file, 'r') as sh5f:
                shard_x_ds = sh5f['X']
                shard_y_ds = sh5f['Y']
                le = len(shard_x_ds)
                if(le>0):
                    pe = pb + le
                    # Separators added: the original printed e.g. "output0 3 input 03".
                    logger.debug('output ' + str(pb) + ' ' + str(pe) + ' input ' + str(0) + ' ' + str(le))
                    x_ds[pb:pe] = shard_x_ds[0:le]
                    y_ds[pb:pe] = shard_y_ds[0:le]
                    pb = pe
                else:
                    logger.warning('shard ' + str(shard_id) + ' skipped because it has no data')
            ts.stop()
    t.stop()

    t = Timer('Output dataset validations')
    if(not utils.validate_dataset(output_dir, dataset_name, image_dims, save_dir=output_dir + 'images')):
        raise Exception('Validation ERROR!')
    t.stop()

In [9]:
# Run the merge using the constants from the configuration cell; the
# banner lines delimit this run in the log.
logger.info('==== PROCESSING SHARDS MERGE ====')
start_processing(
    input_dir=INPUT_FOLDER,
    nr_shards=NR_SHARDS,
    image_dims=IMAGE_DIMS,
    output_dir=OUTPUT_FOLDER
)
logger.info('==== ALL DONE ====')


2017-03-12 23:46:32,969 INFO ==== PROCESSING SHARDS MERGE ====
2017-03-12 23:46:32,970 INFO Merging shard results. nr_shards=700 input_dir=../../../output/kaggle-bowl/step3/ output_dir=../../../output/kaggle-bowl/step4/
2017-03-12 23:46:32,971 INFO > [started] Preparing output dir...
2017-03-12 23:46:32,972 INFO > [started] Count total patients among shards...
2017-03-12 23:46:32,973 INFO shard_id=1 shape=(3, 312, 212, 312, 1)
2017-03-12 23:46:32,974 INFO VALIDATING DATASET ../../../output/kaggle-bowl/step3/1/data-centered-rotated-312-212-312.h5
2017-03-12 23:46:33,207 INFO Summary
2017-03-12 23:46:33,208 INFO X shape=(3, 312, 212, 312, 1)
2017-03-12 23:46:33,208 INFO Y shape=(3, 2)
2017-03-12 23:46:33,209 INFO Y: total: 3
2017-03-12 23:46:33,210 INFO Y: label 0: 3.0 100.0%
2017-03-12 23:46:33,210 INFO Y: label 1: 0.0 0.0%
2017-03-12 23:46:33,211 INFO Recording sample data
2017-03-12 23:46:33,212 INFO patient_index 0
2017-03-12 23:46:33,212 INFO x=
2017-03-12 23:46:33,213 INFO patient_index 1
2017-03-12 23:46:33,214 INFO x=
2017-03-12 23:46:33,214 INFO patient_index 2
2017-03-12 23:46:33,215 INFO x=
2017-03-12 23:46:33,216 INFO shard_id=2 shape=(3, 312, 212, 312, 1)
2017-03-12 23:46:33,217 INFO VALIDATING DATASET ../../../output/kaggle-bowl/step3/2/data-centered-rotated-312-212-312.h5
2017-03-12 23:46:33,451 INFO Summary
2017-03-12 23:46:33,452 INFO X shape=(3, 312, 212, 312, 1)
2017-03-12 23:46:33,453 INFO Y shape=(3, 2)
2017-03-12 23:46:33,454 INFO Y: total: 3
2017-03-12 23:46:33,454 INFO Y: label 0: 3.0 100.0%
2017-03-12 23:46:33,455 INFO Y: label 1: 0.0 0.0%
2017-03-12 23:46:33,456 INFO Recording sample data
2017-03-12 23:46:33,456 INFO patient_index 0
2017-03-12 23:46:33,457 INFO x=
2017-03-12 23:46:33,458 INFO patient_index 1
2017-03-12 23:46:33,458 INFO x=
2017-03-12 23:46:33,459 INFO patient_index 2
2017-03-12 23:46:33,460 INFO x=
2017-03-12 23:46:33,461 INFO shard_id=3 shape=(3, 312, 212, 312, 1)
2017-03-12 23:46:33,462 INFO VALIDATING DATASET ../../../output/kaggle-bowl/step3/3/data-centered-rotated-312-212-312.h5
2017-03-12 23:46:33,695 INFO Summary
2017-03-12 23:46:33,696 INFO X shape=(3, 312, 212, 312, 1)
2017-03-12 23:46:33,696 INFO Y shape=(3, 2)
2017-03-12 23:46:33,697 INFO Y: total: 3
2017-03-12 23:46:33,698 INFO Y: label 0: 2.0 66.6666666667%
2017-03-12 23:46:33,699 INFO Y: label 1: 1.0 33.3333333333%
2017-03-12 23:46:33,699 INFO Recording sample data
2017-03-12 23:46:33,700 INFO patient_index 0
2017-03-12 23:46:33,701 INFO x=
2017-03-12 23:46:33,701 INFO patient_index 1
2017-03-12 23:46:33,702 INFO x=
2017-03-12 23:46:33,703 INFO patient_index 2
2017-03-12 23:46:33,703 INFO x=
2017-03-12 23:46:33,705 INFO shard_id=4 shape=(3, 312, 212, 312, 1)
2017-03-12 23:46:33,706 INFO VALIDATING DATASET ../../../output/kaggle-bowl/step3/4/data-centered-rotated-312-212-312.h5
2017-03-12 23:46:33,940 INFO Summary
2017-03-12 23:46:33,941 INFO X shape=(3, 312, 212, 312, 1)
2017-03-12 23:46:33,942 INFO Y shape=(3, 2)
2017-03-12 23:46:33,942 INFO Y: total: 3
2017-03-12 23:46:33,943 INFO Y: label 0: 3.0 100.0%
2017-03-12 23:46:33,944 INFO Y: label 1: 0.0 0.0%
2017-03-12 23:46:33,945 INFO Recording sample data
2017-03-12 23:46:33,945 INFO patient_index 0
2017-03-12 23:46:33,946 INFO x=
2017-03-12 23:46:33,947 INFO patient_index 1
2017-03-12 23:46:33,947 INFO x=
2017-03-12 23:46:33,948 INFO patient_index 2
2017-03-12 23:46:33,949 INFO x=
2017-03-12 23:46:33,950 INFO shard_id=5 shape=(3, 312, 212, 312, 1)
2017-03-12 23:46:33,951 INFO VALIDATING DATASET ../../../output/kaggle-bowl/step3/5/data-centered-rotated-312-212-312.h5
2017-03-12 23:46:34,184 INFO Summary
2017-03-12 23:46:34,185 INFO X shape=(3, 312, 212, 312, 1)
2017-03-12 23:46:34,186 INFO Y shape=(3, 2)
2017-03-12 23:46:34,187 INFO Y: total: 3
2017-03-12 23:46:34,188 INFO Y: label 0: 2.0 66.6666666667%
2017-03-12 23:46:34,188 INFO Y: label 1: 1.0 33.3333333333%
2017-03-12 23:46:34,189 INFO Recording sample data
2017-03-12 23:46:34,190 INFO patient_index 0
2017-03-12 23:46:34,190 INFO x=
2017-03-12 23:46:34,191 INFO patient_index 1
2017-03-12 23:46:34,192 INFO x=
2017-03-12 23:46:34,192 INFO patient_index 2
2017-03-12 23:46:34,193 INFO x=
2017-03-12 23:46:34,194 WARNING no data on shard 6
2017-03-12 23:46:34,195 INFO VALIDATING DATASET ../../../output/kaggle-bowl/step3/6/data-centered-rotated-312-212-312.h5
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-9-62ccd5fa6403> in <module>()
      1 logger.info('==== PROCESSING SHARDS MERGE ====')
----> 2 start_processing(INPUT_FOLDER, NR_SHARDS, IMAGE_DIMS, OUTPUT_FOLDER)
      3 logger.info('==== ALL DONE ====')

<ipython-input-8-b3100492d5d1> in start_processing(input_dir, nr_shards, image_dims, output_dir)
     22                 logger.warning('no data on shard ' + str(shard_id))
     23                 unusable_shards.append(shard_id)
---> 24         if(not utils.validate_dataset(dataset_dir, dataset_name, image_dims)):
     25             raise Exception('Validation ERROR!')
     26     t.stop()

/notebooks/datascience-snippets/ipython-notebooks/kaggle-lung-cancer-detection/modules/utils.py in validate_dataset(dataset_dir, name, image_dims, save_dir)
     53 
     54     with h5py.File(dataset_file, 'r') as h5f:
---> 55         x_ds = h5f['X']
     56         y_ds = h5f['Y']
     57 

h5py/_objects.pyx in h5py._objects.with_phil.wrapper (/tmp/pip-eeirwumi-build/h5py/_objects.c:2684)()

h5py/_objects.pyx in h5py._objects.with_phil.wrapper (/tmp/pip-eeirwumi-build/h5py/_objects.c:2642)()

/usr/local/lib/python3.4/dist-packages/h5py/_hl/group.py in __getitem__(self, name)
    164                 raise ValueError("Invalid HDF5 object reference")
    165         else:
--> 166             oid = h5o.open(self.id, self._e(name), lapl=self._lapl)
    167 
    168         otype = h5i.get_type(oid)

h5py/_objects.pyx in h5py._objects.with_phil.wrapper (/tmp/pip-eeirwumi-build/h5py/_objects.c:2684)()

h5py/_objects.pyx in h5py._objects.with_phil.wrapper (/tmp/pip-eeirwumi-build/h5py/_objects.c:2642)()

h5py/h5o.pyx in h5py.h5o.open (/tmp/pip-eeirwumi-build/h5py/h5o.c:3570)()

KeyError: "Unable to open object (Object 'x' doesn't exist)"

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: