Get patients image size and mask boundings into CSV

  • Each script uses only a single GPU, so we will distribute patients among shards in order to distribute or paralellize execution
  • For each shard, duplicate this script, set a unique SHARD_ID and execute it
  • This script:
    • Creates a directory named "patients-[shard_id]" with all its results
    • Creates a file "patients-analysis.csv" with all imaging analysis data
    • Snapshots 3 slides for each patient to directory "samples"

In [308]:
#only (patient_id%NR_SHARDS) == SHARD_ID will be processed here
#choose a value between 1-4
SHARD_ID = 1
NR_SHARDS = 4

#Patient DICOM images folder
INPUT_FOLDER = '../../input/sample_images/'
OUTPUT_FOLDER = '../../output/' + str(SHARD_ID) + '/'

In [309]:
%matplotlib inline

import numpy as np # linear algebra
from numpy import ndarray
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import statistics
import csv
import dicom
from time import time
import os
import shutil
import scipy.ndimage
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import scipy.ndimage as ndimage
import itertools
from itertools import product, combinations
from skimage import measure, morphology
from mpl_toolkits.mplot3d.art3d import Poly3DCollection

In [310]:
class Timer:
    def __init__(self, name, debug=True):
        self._name = name
        self._debug = debug
        self.start()
    
    def start(self):
        self._start = time()
        if(self._debug):
            log('> [started] ' + self._name + '...')

    def stop(self):
        self._lastElapsed = (time()-self._start)
        if(self._debug):
            log('> [done]    {} ({:.3f} ms)'.format(self._name, self._lastElapsed*1000))
            
    def elapsed(self):
        if(self._lastElapsed != None):
            return (self._lastElapsed)
        else:
            return (time()-self._start)

In [311]:
import datetime
def log(message):
    print('{} {}'.format(datetime.datetime.now(), message))

In [337]:
def get_patient_ids(shard_id, input_folder):
  shard_patients = []
  patients = os.listdir(input_folder)
  patients.sort()
  for p in patients:
    if(int(p,16)%NR_SHARDS == (shard_id-1)):
      shard_patients.append(p)
  return shard_patients

In [313]:
# Load the scans in given folder path
#image pixels dimensions: z, y, x
def load_scan(path):
    t = Timer('load_scan ' + path)
    
    slices = [dicom.read_file(path + '/' + s) for s in os.listdir(path)]
    slices.sort(key = lambda x: int(x.ImagePositionPatient[2]))
    try:
        slice_thickness = np.abs(slices[0].ImagePositionPatient[2] - slices[1].ImagePositionPatient[2])
    except:
        slice_thickness = np.abs(slices[0].SliceLocation - slices[1].SliceLocation)
        
    for s in slices:
        s.slice_thickness = slice_thickness

    t.stop()
    return slices

In [326]:
#image pixels dimensions: z, y, x
def get_pixels_hu(slices):
    image = np.stack([s.pixel_array for s in slices])
    # Convert to int16 (from sometimes int16), 
    # should be possible as values should always be low enough (<32k)
    image = image.astype(np.int16)

    # Set outside-of-scan pixels to 0
    # The intercept is usually -1024, so air is approximately 0
    image[image == -2000] = 0
    
    # Convert to Hounsfield units (HU)
    for slice_number in range(len(slices)):
        
        intercept = slices[slice_number].RescaleIntercept
        slope = slices[slice_number].RescaleSlope
        
        if slope != 1:
            image[slice_number] = slope * image[slice_number].astype(np.float64)
            image[slice_number] = image[slice_number].astype(np.int16)
            
        image[slice_number] += np.int16(intercept)
    
    return np.array(image, dtype=np.int16)

In [327]:
#image pixels dimensions: z, y, x
def resample(image, scan, new_spacing=[1,1,1]):
    t = Timer('resample')
    # Determine current pixel spacing
    spacing = np.array([scan[0].slice_thickness] + scan[0].PixelSpacing, dtype=np.float32)

    resize_factor = spacing / new_spacing
    new_real_shape = image.shape * resize_factor
    new_shape = np.round(new_real_shape)
    real_resize_factor = new_shape / image.shape
    new_spacing = spacing / real_resize_factor
    
    image = scipy.ndimage.interpolation.zoom(image, real_resize_factor, mode='nearest')
    t.stop()
    
    return image, new_spacing

In [328]:
def largest_label_volume(im, bg=-1):
    vals, counts = np.unique(im, return_counts=True)

    counts = counts[vals != bg]
    vals = vals[vals != bg]

    if len(counts) > 0:
        return vals[np.argmax(counts)]
    else:
        return None

In [329]:
def segment_lung_mask(image, fill_lung_structures=True):
    t = Timer('segment_lung_mask')

    # 0 is treated as background, which we do not want
    binary_image = np.array(image > -320, dtype=np.int8)+1
    labels = measure.label(binary_image)

    # Pick the pixel in the very corner to determine which label is air.
    #   Improvement: Pick multiple background labels from around the patient
    #   More resistant to "trays" on which the patient lays cutting the air 
    #   around the person in half
    background_label = labels[0,0,0] 
    
    #Fill the air around the person
    binary_image[background_label == labels] = 2

    # Method of filling the lung structures (that is superior to something like 
    # morphological closing)
    if fill_lung_structures:
        # For every slice we determine the largest solid structure
        for i, axial_slice in enumerate(binary_image):
            axial_slice = axial_slice - 1
            labeling = measure.label(axial_slice)
            l_max = largest_label_volume(labeling, bg=0)
            
            if l_max is not None: #This slice contains some lung
                binary_image[i][labeling != l_max] = 1
    
    binary_image -= 1 #Make the image actual binary
    binary_image = 1-binary_image # Invert it, lungs are now 1
    
    # Remove other air pockets insided body
    labels = measure.label(binary_image, background=0)
    
    l_max = largest_label_volume(labels, bg=0)
    if l_max is not None: # There are air pockets
        binary_image[labels != l_max] = 0

    #dilate mask
    binary_image = scipy.ndimage.morphology.grey_dilation(binary_image, size=(10,10,10))
    t.stop()
    
    return binary_image

In [330]:
#returns ((x1, y1, z1), (x2, y2, z2))
def bounding_box(img):
    N = img.ndim
    out = []
    for ax in itertools.combinations(range(N), N - 1):
        nonzero = np.any(img, axis=ax)
        out.extend(np.where(nonzero)[0][[0, -1]])
    r = np.reshape(np.asarray(tuple(out)), (-1, 2)).T
    return [tuple(r[0]), tuple(r[1])]

In [331]:
#return bounding box center in (x,y,z)
def bounding_box_center(bounds):
    return (int(round((bounds[0][0] + (bounds[1][0]-bounds[0][0])/2))), int(round((bounds[0][1] + (bounds[1][1]-bounds[0][1])/2))), int(round((bounds[0][2] + (bounds[1][2]-bounds[0][2])/2))))

In [332]:
def generate_slice_shot(patient_pixels, patient_lung_mask, patient_id, slice_pos, output_dir):
    t = Timer('generate_slice_shot ' + str(slice_pos))
    fig1, ax1 = plt.subplots(1)
    fig1.set_size_inches(6,6)

    masked_img = np.ma.masked_where(patient_lung_mask[slice_pos]==0, patient_pixels[slice_pos])
    ax1.imshow(masked_img, cmap=plt.cm.gray)

    file = output_dir + patient_id + '-' + 'slice-' + str(slice_pos) + '.jpg'
    plt.savefig(file)
        
    plt.close(fig1)
#    plt.show()
    t.stop()

In [333]:
def generate_patient_info(patient_pixels, patient_lung_mask, patient_scan, patient_id, append_to_csv_file):
    t = Timer('generate_patient_info')
    info = []
    
    #patient_id
    info.append(patient_id)
    
    #image w,h,d
    info.append(np.shape(patient_pixels)[2])
    info.append(np.shape(patient_pixels)[1])
    info.append(np.shape(patient_pixels)[0])
    
    #image volume mean
    t1 = Timer('flatten pixels')
    data = list(ndarray.flatten(patient_pixels))
    t1.stop()
    t1 = Timer('calc mean')
    info.append(statistics.mean(data))
    t1.stop()
    
    #slice original scan qtty,thickness
    info.append(len(patient_scan))
    info.append(patient_scan[0].slice_thickness)
    
    #mask cx,cy,cz,w,h,d
    box = bounding_box(patient_lung_mask)
    box_center = bounding_box_center(box)
    info.append(box_center[0])
    info.append(box_center[1])
    info.append(box_center[2])
    info.append(box[1][0]-box[0][0])
    info.append(box[1][1]-box[0][1])
    info.append(box[1][2]-box[0][2])
    
    #append data to csv file
    with open(append_to_csv_file, 'a') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='\'', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(info)
        
    t.stop()

In [334]:
def process_patient(input_dir, patient_id, output_shots_dir, output_csv_file):
    patient_dir = input_dir + patient_id
    patient_scan = load_scan(patient_dir)
    patient_pixels = get_pixels_hu(patient_scan)
    patient_pixels_resampled, spacing = resample(patient_pixels, patient_scan, [1,1,1])
    patient_lung_mask = segment_lung_mask(patient_pixels_resampled, True)
    
    generate_patient_info(patient_pixels_resampled, patient_lung_mask, patient_scan, patient_id, output_csv_file)
    
    ln = np.shape(patient_pixels_resampled)[0]
    generate_slice_shot(patient_pixels_resampled, patient_lung_mask, patient_id, int(ln/4), output_shots_dir)
    generate_slice_shot(patient_pixels_resampled, patient_lung_mask, patient_id, int(ln/4*2), output_shots_dir)
    generate_slice_shot(patient_pixels_resampled, patient_lung_mask, patient_id, int(ln/4*3), output_shots_dir)

In [335]:
def start_processing(input_dir, shard_id, max_patients, output_dir):
    log('Processing patients. shard_id=' + str(shard_id) + ' max_patients='+ str(max_patients) + ' input_dir=' + input_dir + ' output_dir=' + output_dir)
    patient_ids = get_patient_ids(shard_id, input_dir)
    log('Number of patients: ' + str(len(patient_ids)))
    patients_count = 0
    shutil.rmtree(output_dir, True)
    try:
        os.makedirs(output_dir)
    except:
        pass
    
    for patient_id in patient_ids:
        patients_count = patients_count + 1
        if(patients_count>max_patients):
            break
        t = Timer('>>> PATIENT PROCESSING ' + patient_id + ' (count=' + str(patients_count) + '; output_dir=' + output_dir + ')')
        process_patient(input_dir, patient_id, output_dir + 'shots/', output_dir + 'patients.csv')
        t.stop()
        print('')

In [336]:
start_processing(INPUT_FOLDER, SHARD_ID, 2, OUTPUT_FOLDER)


2017-02-11 21:45:48.116358 Processing patients. shard_id=0 max_patients=2 input_dir=../../input/sample_images/ output_dir=../../output/0/
2017-02-11 21:45:48.117019 Number of patients: 8
2017-02-11 21:45:48.117175 > [started] >>> PATIENT PROCESSING 0a0c32c9e08cc2ea76a71649de56be6d (count=1; output_dir=../../output/0/)...
2017-02-11 21:45:48.117212 > [started] load_scan ../../input/sample_images/0a0c32c9e08cc2ea76a71649de56be6d...
2017-02-11 21:45:48.441553 > [done]    load_scan ../../input/sample_images/0a0c32c9e08cc2ea76a71649de56be6d (324.326 ms)
2017-02-11 21:45:48.591113 > [started] resample...
2017-02-11 21:46:06.796787 > [done]    resample (18205.633 ms)
2017-02-11 21:46:06.797046 > [started] segment_lung_mask...
2017-02-11 21:46:13.502048 > [done]    segment_lung_mask (6704.965 ms)
2017-02-11 21:46:13.515391 > [started] generate_patient_info...
2017-02-11 21:46:13.515476 > [started] flatten pixels...
2017-02-11 21:46:15.237238 > [done]    flatten pixels (1721.740 ms)
2017-02-11 21:46:15.237429 > [started] calc mean...
2017-02-11 21:46:39.201633 > [done]    calc mean (23964.194 ms)
2017-02-11 21:46:39.277857 > [done]    generate_patient_info (25762.457 ms)
2017-02-11 21:46:39.604787 > [started] generate_slice_shot 83...

NameErrorTraceback (most recent call last)
<ipython-input-336-21303b130ce6> in <module>()
----> 1 start_processing(INPUT_FOLDER, SHARD_ID, 2, OUTPUT_FOLDER)

<ipython-input-335-eb50fa7eac14> in start_processing(input_dir, shard_id, max_patients, output_dir)
     15             break
     16         t = Timer('>>> PATIENT PROCESSING ' + patient_id + ' (count=' + str(patients_count) + '; output_dir=' + output_dir + ')')
---> 17         process_patient(input_dir, patient_id, output_dir + 'shots/', output_dir + 'patients.csv')
     18         t.stop()
     19         print('')

<ipython-input-334-96b7031f0ca7> in process_patient(input_dir, patient_id, output_shots_dir, output_csv_file)
      9 
     10     ln = np.shape(patient_pixels_resampled)[0]
---> 11     generate_slice_shot(patient_pixels_resampled, patient_lung_mask, patient_id, int(ln/4), output_shots_dir)
     12     generate_slice_shot(patient_pixels_resampled, patient_lung_mask, patient_id, int(ln/4*2), output_shots_dir)
     13     generate_slice_shot(patient_pixels_resampled, patient_lung_mask, patient_id, int(ln/4*3), output_shots_dir)

<ipython-input-332-d3e413251b5b> in generate_slice_shot(patient_pixels, patient_lung_mask, patient_id, slice_pos, output_dir)
      7     ax1.imshow(masked_img, cmap=plt.cm.gray)
      8 
----> 9     plt.savefig(file)
     10 
     11     plt.close(fig1)

NameError: name 'file' is not defined

In [256]:



2017-02-11 21:23:05.759082 test

In [ ]:


In [ ]:


In [ ]: