In [308]:
#only (patient_id%NR_SHARDS) == SHARD_ID will be processed here
#choose a value between 1-4
SHARD_ID = 1
NR_SHARDS = 4
#Patient DICOM images folder
INPUT_FOLDER = '../../input/sample_images/'
OUTPUT_FOLDER = '../../output/' + str(SHARD_ID) + '/'
In [309]:
%matplotlib inline
import numpy as np # linear algebra
from numpy import ndarray
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import statistics
import csv
import dicom
from time import time
import os
import shutil
import scipy.ndimage
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import scipy.ndimage as ndimage
import itertools
from itertools import product, combinations
from skimage import measure, morphology
from mpl_toolkits.mplot3d.art3d import Poly3DCollection
In [310]:
class Timer:
def __init__(self, name, debug=True):
self._name = name
self._debug = debug
self.start()
def start(self):
self._start = time()
if(self._debug):
log('> [started] ' + self._name + '...')
def stop(self):
self._lastElapsed = (time()-self._start)
if(self._debug):
log('> [done] {} ({:.3f} ms)'.format(self._name, self._lastElapsed*1000))
def elapsed(self):
if(self._lastElapsed != None):
return (self._lastElapsed)
else:
return (time()-self._start)
In [311]:
import datetime
def log(message):
print('{} {}'.format(datetime.datetime.now(), message))
In [337]:
def get_patient_ids(shard_id, input_folder):
shard_patients = []
patients = os.listdir(input_folder)
patients.sort()
for p in patients:
if(int(p,16)%NR_SHARDS == (shard_id-1)):
shard_patients.append(p)
return shard_patients
In [313]:
# Load the scans in given folder path
#image pixels dimensions: z, y, x
def load_scan(path):
t = Timer('load_scan ' + path)
slices = [dicom.read_file(path + '/' + s) for s in os.listdir(path)]
slices.sort(key = lambda x: int(x.ImagePositionPatient[2]))
try:
slice_thickness = np.abs(slices[0].ImagePositionPatient[2] - slices[1].ImagePositionPatient[2])
except:
slice_thickness = np.abs(slices[0].SliceLocation - slices[1].SliceLocation)
for s in slices:
s.slice_thickness = slice_thickness
t.stop()
return slices
In [326]:
#image pixels dimensions: z, y, x
def get_pixels_hu(slices):
image = np.stack([s.pixel_array for s in slices])
# Convert to int16 (from sometimes int16),
# should be possible as values should always be low enough (<32k)
image = image.astype(np.int16)
# Set outside-of-scan pixels to 0
# The intercept is usually -1024, so air is approximately 0
image[image == -2000] = 0
# Convert to Hounsfield units (HU)
for slice_number in range(len(slices)):
intercept = slices[slice_number].RescaleIntercept
slope = slices[slice_number].RescaleSlope
if slope != 1:
image[slice_number] = slope * image[slice_number].astype(np.float64)
image[slice_number] = image[slice_number].astype(np.int16)
image[slice_number] += np.int16(intercept)
return np.array(image, dtype=np.int16)
In [327]:
#image pixels dimensions: z, y, x
def resample(image, scan, new_spacing=[1,1,1]):
t = Timer('resample')
# Determine current pixel spacing
spacing = np.array([scan[0].slice_thickness] + scan[0].PixelSpacing, dtype=np.float32)
resize_factor = spacing / new_spacing
new_real_shape = image.shape * resize_factor
new_shape = np.round(new_real_shape)
real_resize_factor = new_shape / image.shape
new_spacing = spacing / real_resize_factor
image = scipy.ndimage.interpolation.zoom(image, real_resize_factor, mode='nearest')
t.stop()
return image, new_spacing
In [328]:
def largest_label_volume(im, bg=-1):
vals, counts = np.unique(im, return_counts=True)
counts = counts[vals != bg]
vals = vals[vals != bg]
if len(counts) > 0:
return vals[np.argmax(counts)]
else:
return None
In [329]:
def segment_lung_mask(image, fill_lung_structures=True):
t = Timer('segment_lung_mask')
# 0 is treated as background, which we do not want
binary_image = np.array(image > -320, dtype=np.int8)+1
labels = measure.label(binary_image)
# Pick the pixel in the very corner to determine which label is air.
# Improvement: Pick multiple background labels from around the patient
# More resistant to "trays" on which the patient lays cutting the air
# around the person in half
background_label = labels[0,0,0]
#Fill the air around the person
binary_image[background_label == labels] = 2
# Method of filling the lung structures (that is superior to something like
# morphological closing)
if fill_lung_structures:
# For every slice we determine the largest solid structure
for i, axial_slice in enumerate(binary_image):
axial_slice = axial_slice - 1
labeling = measure.label(axial_slice)
l_max = largest_label_volume(labeling, bg=0)
if l_max is not None: #This slice contains some lung
binary_image[i][labeling != l_max] = 1
binary_image -= 1 #Make the image actual binary
binary_image = 1-binary_image # Invert it, lungs are now 1
# Remove other air pockets insided body
labels = measure.label(binary_image, background=0)
l_max = largest_label_volume(labels, bg=0)
if l_max is not None: # There are air pockets
binary_image[labels != l_max] = 0
#dilate mask
binary_image = scipy.ndimage.morphology.grey_dilation(binary_image, size=(10,10,10))
t.stop()
return binary_image
In [330]:
#returns ((x1, y1, z1), (x2, y2, z2))
def bounding_box(img):
N = img.ndim
out = []
for ax in itertools.combinations(range(N), N - 1):
nonzero = np.any(img, axis=ax)
out.extend(np.where(nonzero)[0][[0, -1]])
r = np.reshape(np.asarray(tuple(out)), (-1, 2)).T
return [tuple(r[0]), tuple(r[1])]
In [331]:
#return bounding box center in (x,y,z)
def bounding_box_center(bounds):
return (int(round((bounds[0][0] + (bounds[1][0]-bounds[0][0])/2))), int(round((bounds[0][1] + (bounds[1][1]-bounds[0][1])/2))), int(round((bounds[0][2] + (bounds[1][2]-bounds[0][2])/2))))
In [332]:
def generate_slice_shot(patient_pixels, patient_lung_mask, patient_id, slice_pos, output_dir):
t = Timer('generate_slice_shot ' + str(slice_pos))
fig1, ax1 = plt.subplots(1)
fig1.set_size_inches(6,6)
masked_img = np.ma.masked_where(patient_lung_mask[slice_pos]==0, patient_pixels[slice_pos])
ax1.imshow(masked_img, cmap=plt.cm.gray)
file = output_dir + patient_id + '-' + 'slice-' + str(slice_pos) + '.jpg'
plt.savefig(file)
plt.close(fig1)
# plt.show()
t.stop()
In [333]:
def generate_patient_info(patient_pixels, patient_lung_mask, patient_scan, patient_id, append_to_csv_file):
t = Timer('generate_patient_info')
info = []
#patient_id
info.append(patient_id)
#image w,h,d
info.append(np.shape(patient_pixels)[2])
info.append(np.shape(patient_pixels)[1])
info.append(np.shape(patient_pixels)[0])
#image volume mean
t1 = Timer('flatten pixels')
data = list(ndarray.flatten(patient_pixels))
t1.stop()
t1 = Timer('calc mean')
info.append(statistics.mean(data))
t1.stop()
#slice original scan qtty,thickness
info.append(len(patient_scan))
info.append(patient_scan[0].slice_thickness)
#mask cx,cy,cz,w,h,d
box = bounding_box(patient_lung_mask)
box_center = bounding_box_center(box)
info.append(box_center[0])
info.append(box_center[1])
info.append(box_center[2])
info.append(box[1][0]-box[0][0])
info.append(box[1][1]-box[0][1])
info.append(box[1][2]-box[0][2])
#append data to csv file
with open(append_to_csv_file, 'a') as csvfile:
writer = csv.writer(csvfile, delimiter=',', quotechar='\'', quoting=csv.QUOTE_MINIMAL)
writer.writerow(info)
t.stop()
In [334]:
def process_patient(input_dir, patient_id, output_shots_dir, output_csv_file):
patient_dir = input_dir + patient_id
patient_scan = load_scan(patient_dir)
patient_pixels = get_pixels_hu(patient_scan)
patient_pixels_resampled, spacing = resample(patient_pixels, patient_scan, [1,1,1])
patient_lung_mask = segment_lung_mask(patient_pixels_resampled, True)
generate_patient_info(patient_pixels_resampled, patient_lung_mask, patient_scan, patient_id, output_csv_file)
ln = np.shape(patient_pixels_resampled)[0]
generate_slice_shot(patient_pixels_resampled, patient_lung_mask, patient_id, int(ln/4), output_shots_dir)
generate_slice_shot(patient_pixels_resampled, patient_lung_mask, patient_id, int(ln/4*2), output_shots_dir)
generate_slice_shot(patient_pixels_resampled, patient_lung_mask, patient_id, int(ln/4*3), output_shots_dir)
In [335]:
def start_processing(input_dir, shard_id, max_patients, output_dir):
log('Processing patients. shard_id=' + str(shard_id) + ' max_patients='+ str(max_patients) + ' input_dir=' + input_dir + ' output_dir=' + output_dir)
patient_ids = get_patient_ids(shard_id, input_dir)
log('Number of patients: ' + str(len(patient_ids)))
patients_count = 0
shutil.rmtree(output_dir, True)
try:
os.makedirs(output_dir)
except:
pass
for patient_id in patient_ids:
patients_count = patients_count + 1
if(patients_count>max_patients):
break
t = Timer('>>> PATIENT PROCESSING ' + patient_id + ' (count=' + str(patients_count) + '; output_dir=' + output_dir + ')')
process_patient(input_dir, patient_id, output_dir + 'shots/', output_dir + 'patients.csv')
t.stop()
print('')
In [336]:
start_processing(INPUT_FOLDER, SHARD_ID, 2, OUTPUT_FOLDER)
In [256]:
In [ ]:
In [ ]:
In [ ]: