In [2]:

import os
import numpy as np
import dicom
from scipy.misc import imresize
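# Note: `dicom` is the pre-1.0 import name of the pydicom package, and
# scipy.misc.imresize was removed in newer SciPy releases; this notebook
# assumes the older versions of both libraries.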

In [17]:
RAWDATA = '/storage/hpc_dmytro/Kaggle/SDSB/images/raw/'
PREPROCDATA = '/storage/hpc_dmytro/Kaggle/SDSB/images/keras_size64/'

def crop_resize(img):
    """
    Crop the center square of the image and resize it to the global img_shape.
    :param img: image to be cropped and resized.
    """
    if img.shape[0] < img.shape[1]:
        img = img.T
    # crop the largest centered square out of the image
    short_edge = min(img.shape[:2])
    yy = int((img.shape[0] - short_edge) / 2)
    xx = int((img.shape[1] - short_edge) / 2)
    img = img[yy: yy + short_edge, xx: xx + short_edge]
    img = imresize(img, img_shape)
    return img
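
# A quick illustration of the center-crop arithmetic above (a sketch, not part
# of the original pipeline): a hypothetical 256x192 frame is not transposed
# (rows >= cols), its short edge is 192, so the kept square is rows 32:224,
# cols 0:192, which imresize then scales down to img_shape.
_frame = np.zeros((256, 192), dtype=np.uint8)
_edge = min(_frame.shape[:2])              # 192
_yy = int((_frame.shape[0] - _edge) / 2)   # 32
_xx = int((_frame.shape[1] - _edge) / 2)   # 0
assert _frame[_yy:_yy + _edge, _xx:_xx + _edge].shape == (192, 192)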


def load_images(from_dir, verbose=True):
    """
    Load images in the form study x slices x width x height.
    Each sax slice is padded or truncated to 30 time-series frames so that it is
    ready for the convolutional network.
    :param from_dir: directory with images (train or validate)
    :param verbose: if true then print progress
    """
    print('-'*50)
    print('Loading all DICOM images from {0}...'.format(from_dir))
    print('-'*50)

    current_study_sub = ''  # saves the current study sub_folder
    current_study = ''  # saves the current study folder
    current_study_images = []  # holds current study images
    ids = []  # keeps the ids of the studies
    study_to_images = dict()  # dictionary for studies to images
    total = 0
    images = []  # saves 30-frame-images
    from_dir = from_dir if from_dir.endswith('/') else from_dir + '/'
    for subdir, _, files in os.walk(from_dir):
        
        subdir = subdir.replace('\\', '/')  # windows path fix
        subdir_split = subdir.split('/')
        study_id = subdir_split[-3]  # the patient/study folder two levels above a sax folder
        if "sax" in subdir:
            for f in files:
                image_path = os.path.join(subdir, f)
                if not image_path.endswith('.dcm'):
                    continue

                image = dicom.read_file(image_path)
                image = image.pixel_array
                if img_resize:
                    image = crop_resize(image)

                if current_study_sub != subdir:
                    # a new sax folder starts: pad the previous slice's frame
                    # list to 30 frames by repeating frames from its start, or
                    # truncate it to the first 30
                    x = 0
                    try:
                        while len(images) < 30:
                            images.append(images[x])
                            x += 1
                        if len(images) > 30:
                            images = images[0:30]
                    except IndexError:
                        # images is empty on the very first sax folder
                        pass
                    current_study_sub = subdir
                    current_study_images.append(images)
                    images = []

                if current_study != study_id:
                    study_to_images[current_study] = np.array(current_study_images)
                    if current_study != "":
                        ids.append(current_study)
                    current_study = study_id
                    current_study_images = []
                images.append(image)
                if verbose:
                    if total % 1000 == 0:
                        print('Images processed {0}'.format(total))
                total += 1
    # pad or truncate the frame list of the final slice to 30 frames as well
    x = 0
    try:
        while len(images) < 30:
            images.append(images[x])
            x += 1
        if len(images) > 30:
            images = images[0:30]
    except IndexError:
        pass

    print('-'*50)
    print('All DICOM images in {0} loaded.'.format(from_dir))
    print('-'*50)

    current_study_images.append(images)
    study_to_images[current_study] = np.array(current_study_images)
    if current_study != "":
        ids.append(current_study)

    return ids, study_to_images
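
# The 30-frame rule used above, in isolation (illustration only): a slice with
# fewer than 30 frames is padded by repeating frames from its start, and a
# slice with more than 30 frames would be truncated to the first 30.
_frames = list(range(20))
_x = 0
while len(_frames) < 30:
    _frames.append(_frames[_x])
    _x += 1
assert _frames == list(range(20)) + list(range(10))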


def map_studies_results():
    """
    Maps studies to their respective targets from train.csv.
    """
    id_to_results = dict()
    with open(RAWDATA + '/train.csv') as train_csv:
        lines = train_csv.readlines()
    for item in lines[1:]:  # skip the header row
        study_id, diastole, systole = item.replace('\n', '').split(',')
        id_to_results[study_id] = [float(diastole), float(systole)]

    return id_to_results
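
# Each non-header row of train.csv is assumed to look like
# '<study id>,<value>,<value>'; a hypothetical row parses as follows
# (illustration only, not a real record):
_sid, _d, _s = '123,120.5,60.3'.split(',')
assert _sid == '123' and float(_d) == 120.5 and float(_s) == 60.3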


def write_train_npy():
    """
    Loads the training data set (X and y) and saves them to .npy files.
    """
    print('-'*50)
    print('Writing training data to .npy file...')
    print('-'*50)

    study_ids, images = load_images(RAWDATA + '/train')  # load images and their ids
    studies_to_results = map_studies_results()  # load the dictionary of studies to targets
    X = []
    y = []

    for study_id in study_ids:
        study = images[study_id]
        outputs = studies_to_results[study_id]
        for i in range(study.shape[0]):
            X.append(study[i, :, :, :])
            y.append(outputs)

    X = np.array(X, dtype=np.uint8)
    y = np.array(y)
    np.save(PREPROCDATA + '/X_train.npy', X)
    np.save(PREPROCDATA + '/y_train.npy', y)
    print('Done.')


def write_validation_npy():
    """
    Loads the validation data set (X and study ids) and saves them to .npy files.
    """
    print('-'*50)
    print('Writing validation data to .npy file...')
    print('-'*50)

    ids, images = load_images(RAWDATA + '/val')
    study_ids = []
    X = []

    for study_id in ids:
        study = images[study_id]
        for i in range(study.shape[0]):
            study_ids.append(study_id)
            X.append(study[i, :, :, :])

    X = np.array(X, dtype=np.uint8)
    np.save(PREPROCDATA + '/X_validate.npy', X)
    np.save(PREPROCDATA + '/ids_validate.npy', study_ids)
    print('Done.')

In [16]:
img_resize = True
img_shape = (64, 64)
write_train_npy()
write_validation_npy()


--------------------------------------------------
Writing training data to .npy file...
--------------------------------------------------
--------------------------------------------------
Loading all DICOM images from /storage/hpc_dmytro/Kaggle/SDSB/images/raw//train...
--------------------------------------------------
Images processed 0
Images processed 1000
--------------------------------------------------
All DICOM images in /storage/hpc_dmytro/Kaggle/SDSB/images/raw//train/ loaded.
--------------------------------------------------
Done.
--------------------------------------------------
Writing validation data to .npy file...
--------------------------------------------------
--------------------------------------------------
Loading all DICOM images from /storage/hpc_dmytro/Kaggle/SDSB/images/raw//val...
--------------------------------------------------
Images processed 0
Images processed 1000
--------------------------------------------------
All DICOM images in /storage/hpc_dmytro/Kaggle/SDSB/images/raw//val/ loaded.
--------------------------------------------------
Done.

In [ ]:
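# Quick sanity check (a sketch, not part of the original run): reload the saved
# arrays and confirm the expected shapes, assuming the cells above have run.
# X_train should be (total_sax_slices, 30, 64, 64) and y_train
# (total_sax_slices, 2), one target pair per slice.
X_train = np.load(PREPROCDATA + '/X_train.npy')
y_train = np.load(PREPROCDATA + '/y_train.npy')
X_val = np.load(PREPROCDATA + '/X_validate.npy')
ids_val = np.load(PREPROCDATA + '/ids_validate.npy')
print(X_train.shape, y_train.shape, X_val.shape, ids_val.shape)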