cf. https://www.kaggle.com/c/data-science-bowl-2017/data
I had to sudo dnf install p7zip and install p7zip (yet another software repo) just to extract the images.
data_password.txt.zip - contains decryption key for the image files; needed to extract. 9$kAsfpQ*FtH sample_images.7z - smaller subset of full dataset, provided for people who wish to preview images before large file stage1.7z, File size 781.39 MB
In [1]:
import os, sys
os.getcwd()
os.listdir( os.getcwd() )
Out[1]:
In [7]:
os.listdir( os.getcwd() + "/2017datascibowl/sample_images/")
Out[7]:
In [9]:
os.getcwd()
Out[9]:
In [2]:
%matplotlib inline
In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import dicom
import os
import scipy.ndimage
import matplotlib.pyplot as plt
In [45]:
from skimage import measure, morphology
from mpl_toolkits.mplot3d.art3d import Poly3DCollection
In [3]:
# Some constants
INPUT_FOLDER = './2017datascibowl/sample_images/'
patients=os.listdir(INPUT_FOLDER)
patients.sort()
In [13]:
patients
Out[13]:
In [14]:
# Load the scans in given folder path
def load_scan(path):
"""
INPUTS/ARGUMENTS
================
@type path : Python string
@type slices : Python list (for each file in a folder/directory of dicom.dataset.FileDataset
each is a slice of the single patient's lung
"""
slices = [dicom.read_file(path + '/' + s) for s in os.listdir(path)]
slices.sort(key = lambda x: float(x.ImagePositionPatient[2]))
try:
slice_thickness = np.abs(slices[0].ImagePositionPatient[2] - slices[1].ImagePositionPatient[2])
except:
slice_thickness = np.abs(slices[0].SliceLocation - slices[1].SliceLocation)
for s in slices:
s.SliceThickness = slice_thickness
return slices
In [69]:
image_test = np.stack([s.pixel_array for s in load_scan(INPUT_FOLDER+patients[0])])
In [71]:
print(image_test.shape);print(image_test.dtype)
In [72]:
pd.DataFrame( image_test[0]).describe()
Out[72]:
In [73]:
pd.DataFrame( image_test[1]).describe()
Out[73]:
In [74]:
len(image_test == -2000) # set outside-of-scan pixels to 0
Out[74]:
In [75]:
print( image_test.min() )
$N_x=512$, so $N_x\in \mathbb{Z}^+$
$N_y=512$, so $N_y\in \mathbb{Z}^+$
For each patient, $m_{\text{slice}} \in \mathbb{Z}^+$ represents the number of "slices", image "slices" of the single patient's lung.
So first make, for each patient, $(i_{\text{slice}}, P_{\text{arr}}) \in \lbrace 0,1,\dots m_{\text{slice}}-1 \rbrace \times \mathbb{K}^{N_x \times N_y } $
In [37]:
def get_pixels_hu(slices):
"""
INPUTS/ARGUMENTS
================
@type slices : Python list of dicom.dataset.FileDataset,
@param slices : each dicom.dataset.FileDataset representing an image "slice" of a single patient's lung
"""
image = np.stack([s.pixel_array for s in slices]) # np.array image.shape (134,512,512)
# Convert to int16 (from sometimes int16),
# should be possible as values should always be low enough (<32k)
image = image.astype(np.int16)
# Set outside-of-scan pixels to 0
# The intercept is usually -1024, so air is approximately 0
# suggested by Henry Wolf to avoid -2048 values
outside_of_image_val = image.min()
image[image == outside_of_image_val] = 0
image[image == -2000] = 0
# Convert to Hounsfield units (HU)
for slice_number in range(len(slices)):
intercept = slices[slice_number].RescaleIntercept
slope = slices[slice_number].RescaleSlope
if slope != 1:
image[slice_number] = slope * image[slice_number].astype(np.float64)
image[slice_number] = image[slice_number].astype(np.int16)
image[slice_number] += np.int16(intercept)
return np.array(image, dtype=np.int16)
Let's take a look at 1 of the patients.
In [57]:
patients[0]
Out[57]:
In [58]:
first_patient = load_scan(INPUT_FOLDER + patients[0])
In [59]:
print(type(first_patient));print(len(first_patient));print(type(first_patient[0]))
print(type(first_patient[0].pixel_array)); print(first_patient[0].pixel_array.shape)
In [94]:
first_patient[0]
Out[94]:
In [95]:
dir( first_patient[0] )
Out[95]:
In [59]:
first_patient_pixels = get_pixels_hu(first_patient)
In [60]:
print(type(first_patient_pixels));print(first_patient_pixels.shape)
In [78]:
pd.DataFrame( first_patient[0].pixel_array ).describe()
Out[78]:
In [79]:
pd.DataFrame( first_patient_pixels[0] ).describe()
Out[79]:
In [61]:
plt.hist(first_patient_pixels.flatten(),bins=80,color='c')
plt.xlabel("Hounsfield Units (HU)")
plt.ylabel("Frequency")
plt.show()
# Show some slice in the middle
plt.imshow(first_patient_pixels[80],cmap=plt.cm.gray)
plt.show()
In [32]:
print(type(first_patient_pixels));print(first_patient_pixels.shape);print(first_patient_pixels.dtype)
In [35]:
pd.DataFrame(first_patient_pixels[0]).describe()
Out[35]:
In [62]:
def resample(image, scan, new_spacing=[1,1,1]):
# Determine current pixel spacing
spacing = np.array([scan[0].SliceThickness] + scan[0].PixelSpacing, dtype=np.float32)
resize_factor = spacing / new_spacing
new_real_shape = image.shape * resize_factor
new_shape = np.round(new_real_shape)
real_resize_factor = new_shape / image.shape
new_spacing = spacing / real_resize_factor
image = scipy.ndimage.interpolation.zoom(image, real_resize_factor, mode='nearest')
return image, new_spacing
Let's resample our patient's pixels to an isomorphic resolution of 1 by 1 by 1 mm.
In [63]:
pix_resampled, spacing = resample(first_patient_pixels, first_patient, [1,1,1])
In [64]:
print("Shape before resampling\t", first_patient_pixels.shape)
print("Shape after resample\t",pix_resampled.shape)
In [39]:
print(first_patient[0].SliceThickness);print(first_patient[0].PixelSpacing)
In [40]:
np.array( [2.5]+first_patient[0].PixelSpacing)/[1,1,1]
Out[40]:
In [42]:
first_patient_pixels.shape
Out[42]:
In [81]:
# My stuff, each scan or image "slice" is a dicom.dataSet.FileDataSet
print(first_patient[0].SliceThickness)
print(first_patient[0].PixelSpacing)
In [83]:
print( np.array( [first_patient[0].SliceThickness]+first_patient[0].PixelSpacing)/ [1,1,1] )
In [82]:
first_patient_pixels.shape
Out[82]:
In [100]:
first_patient_pixels.shape * np.array( [first_patient[0].SliceThickness]+first_patient[0].PixelSpacing)/ [1,1,1]
Out[100]:
In [101]:
np.round( first_patient_pixels.shape * np.array( [first_patient[0].SliceThickness]+first_patient[0].PixelSpacing)/ [1,1,1] )
Out[101]:
In [103]:
np.round( first_patient_pixels.shape * np.array( [first_patient[0].SliceThickness]+first_patient[0].PixelSpacing)/ [1,1,1] ) / first_patient_pixels.shape
Out[103]:
In [102]:
np.round( first_patient_pixels.shape * np.array( [first_patient[0].SliceThickness]+first_patient[0].PixelSpacing)/ [1,1,1] ) / first_patient_pixels.shape
Out[102]:
In [86]:
print(type(pix_resampled));print(type(spacing));print(pix_resampled.shape);print(spacing.shape); print(spacing)
In [88]:
for image_slice in first_patient:
print(image_slice.pixel_array.shape)
In [107]:
[1000,2000,3000]/np.array(first_patient_pixels.shape).astype("float32")
Out[107]:
In [79]:
def resample_given_dims(image, scan, new_shape):
# Determine current pixel spacing
spacing = np.array([scan[0].SliceThickness] + scan[0].PixelSpacing, dtype=np.float32) # (\Delta z,\Delta x,\Delta y)
# print(spacing)
real_resize_factor=new_shape/np.array(image.shape).astype("float32")
# print(real_resize_factor)
new_spacing = spacing/real_resize_factor # (\Delta z',\Delta x',\Delta y')
# real_resize_factor_zoom=np.round(real_resize_factor) # original
real_resize_factor_zoom = real_resize_factor
print(real_resize_factor_zoom)
image = scipy.ndimage.interpolation.zoom(image, real_resize_factor_zoom, mode='nearest')
return image, new_spacing
In [80]:
# pix_resampled, spacing = resample(first_patient_pixels, first_patient, [1,1,1])
pix_resampled_given,spacing_given=resample_given_dims( first_patient_pixels, first_patient, [167,128,128])
In [81]:
pd.DataFrame(pix_resampled_given[0]).describe()
Out[81]:
In [74]:
spacing_given
Out[74]:
In [ ]:
#def resample(image, scan, new_spacing=[1,1,1]):
Use marching cubes to create an approximate mesh for our 3D object, and plot this with matplotlib. Quite slow and ugly, but the best we can do.
In [47]:
def plot_3d(image,threshold=-300):
# Position the scan upright,
# so the head of the patient would be at the top facing the camera
p = image.transpose(2,1,0)
verts, faces = measure.marching_cubes(p, threshold)
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111, projection='3d')
# Fancy indexing: `verts[faces]` to generate a collection of triangles
mesh = Poly3DCollection(verts[faces], alpha=0.70)
face_color = [0.45, 0.45, 0.75]
mesh.set_facecolor(face_color)
ax.add_collection3d(mesh)
ax.set_xlim(0, p.shape[0])
ax.set_ylim(0, p.shape[1])
ax.set_zlim(0, p.shape[2])
plt.show()
Our plot function tkaes a threshold argument which we can use to plot certain structures, such as all tissue or only the bones. 400 is a good threshold for showing the bones only (see Hounsfield unit table above).
In [48]:
plot_3d(pix_resampled,400)
In order to reduce the problem space, we can segment the lungs (and usually some tissue around it).
The steps:
In [92]:
help(measure.label)
In [49]:
def largest_label_volume(im, bg=-1):
vals, counts = np.unique(im, return_counts=True)
counts = counts[vals != bg]
vals = vals[vals != bg]
if len(counts) > 0:
return vals[np.argmax(counts)]
else:
return None
def segment_lung_mask(image, fill_lung_structures=True):
# no actually binary, but 1 and 2.
# 0 is treated as background, which we do not want
binary_image = np.array(image > -320, dtype=np.int8)+1
labels=measure.label(binary_image)
# Pick the pixel in the very corner to determine which label is air.
# Improvement: Pick multiple background labels from around the patient
# More resitant to "trays" on which the patient lays cutting the air
# around the person in half
background_label = labels[0,0,0]
# Fill the air around the person
binary_image[background_label == labels] = 2
# Method of filling the lung structures (that is superior to something like
# morphological closing)
if fill_lung_structures:
# For every slice we determine the largest solid structure
for i, axial_slice in enumerate(binary_image):
axial_slice = axial_slice - 1
labeling = measure.label(axial_slice)
l_max = largest_label_volume(labeling, bg=0)
if l_max is not None: # This slice contains some lung
binary_image[i][labeling != l_max] = 1
binary_image -= 1 # Make the image actual binary
binary_image = 1-binary_image # Invert it, lungs are now 1
# Remove other air pockets insided body
labels = measure.label(binary_image, background=0)
l_max = largest_label_volume(labels, bg=0)
if l_max is not None: # There are air pockets
binary_image[labels != l_max] =0
return binary_image
In [ ]:
# look at bolek's comment
# l_max = largest_label_volume(labels, bg=-1)
In [ ]:
# Gerome Pistre
def largest_label_volume(im, bg=-1):
vals, counts = np.unique(im, return_counts=True)
counts = counts[vals != bg]
vals = vals[vals != bg]
biggest=vals[np.argmax(counts)]
return biggest
In [ ]:
def segment_lung_mask(image, fill_lung_structures=True):
# no actually binary, but 1 and 2.
# 0 is treated as background, which we do not want
binary_image = np.array(image > -320, dtype=np.int8)+1
labels=measure.label(binary_image)
# Pick the pixel in the very corner to determine which label is air.
# Improvement: Pick multiple background labels from around the patient
# More resitant to "trays" on which the patient lays cutting the air
# around the person in half
background_label = labels[0,0,0]
# Fill the air around the person
binary_image[background_label == labels] = 2
# Method of filling the lung structures (that is superior to something like
# morphological closing)
if fill_lung_structures:
# For every slice we determine the largest solid structure
for i, axial_slice in enumerate(binary_image):
axial_slice = axial_slice - 1
labeling = measure.label(axial_slice)
l_max = largest_label_volume(labeling, bg=0)
if l_max is not None: # This slice contains some lung
binary_image[i][labeling != l_max] = 1
binary_image -= 1 # Make the image actual binary
binary_image = 1-binary_image # Invert it, lungs are now 1
# Remove other air pockets insided body
labels = measure.label(binary_image, background=0)
l_max = largest_label_volume(labels, bg=0)
if l_max is not None: # There are air pockets
binary_image[labels != l_max] =0
return binary_image
In [50]:
segmented_lungs = segment_lung_mask(pix_resampled, False)
segmented_lungs_fill = segment_lung_mask(pix_resampled, True)
In [51]:
plot_3d(segmented_lungs,0)
In [52]:
plot_3d(segmented_lungs_fill,0)
In [53]:
plot_3d(segmented_lungs_fill - segmented_lungs,0)
In [91]:
plot_3d(pix_resampled,0)
In [90]:
print(type(segmented_lungs));print(type(segmented_lungs_fill));
print(segmented_lungs.shape);print(segmented_lungs_fill.shape)
In [54]:
# MANUALLY change MIN_BOUND, MAX_BOUND
MIN_BOUND=-1000.0
MAX_BOUND=400.0
def normalize(image):
image = (image - MIN_BOUND)/(MAX_BOUND-MIN_BOUND)
image[image>1]=1.
image[image<0]=0.
return image
As a final preprocessing step, it's advisory to 0 center your data so that your mean value is 0. To do this you simply subtract the mean pixel value from all pixels.
To determine this mean you simply average all images in the whole dataset. If that sounds like a lot of work, we found this to be around 0.25 in the LUNA16 competition.
Warning: Do not zero center with the mean per image (like is done in some kernels on here). The CT scanners are calibrated to return accurate HU measurements. There is no such thing as an image with lower contrast or brightness like in normal pictures.
In [55]:
PIXEL_MEAN = 0.25
def zero_center(image):
image = image - PIXEL_MEAN
return image
In [ ]:
# from rolanddog
# https://www.kaggle.com/gzuidhof/data-science-bowl-2017/full-preprocessing-tutorial
# zero-center, and preserve compressibility
MIN_BOUND = -1000.0
MAX_BOUND = 400.0
PIXEL_MEAN = 0.25
PIXEL_CORR = int((MAX_BOUND -MIN_BOUND)*PIXEL_MEAN) # in this case, 350
def zero_center_w_compress(image):
image = image - PIXEL_CORR
return image
In [ ]:
def normalize(image):
image = (image - MIN_BOUND)/(MAX_BOUND-MIN_BOUND)
image[image>(1-PIXEL_MEAN)]=1.
image[image<(0-PIXEL_MEAN)]=0.
return image
In [159]:
def normalize_to_bounds(image,MIN_BOUND,MAX_BOUND,PIXEL_MEAN=0.25):
image = (image - MIN_BOUND)/(MAX_BOUND-MIN_BOUND)
image[image>(1-PIXEL_MEAN)]=1.
image[image<(0-PIXEL_MEAN)]=0.
return image
In [44]:
def normalize(image,MIN_BOUND,MAX_BOUND):
image = (image - MIN_BOUND)/(MAX_BOUND-MIN_BOUND)
image[image>1]=1.
image[image<0]=0.
return image
In [160]:
def zero_center_w_compress_to_bounds(image,MIN_BOUND,MAX_BOUND,PIXEL_MEAN=0.25):
PIXEL_CORR = int((MAX_BOUND -MIN_BOUND)*PIXEL_MEAN)
image = image - PIXEL_CORR
return image
In [45]:
def zero_center(image, PIXEL_MEAN):
image = image - PIXEL_MEAN
return image
In [65]:
import skimage
In [66]:
skimage.morphology.binary_dilation
Out[66]:
General advice from Guido Zuidhof: http://pubs.rsna.org/doi/pdf/10.1148/radiol.11091710
In [ ]:
In [104]:
len(first_patient)
Out[104]:
In [105]:
%time patients_lst = [load_scan(INPUT_FOLDER+samplepatient) for samplepatient in patients]
In [112]:
pd.DataFrame( [len(patient) for patient in patients_lst] ).describe()
Out[112]:
In [116]:
print( pd.DataFrame( [len(patient) for patient in patients_lst]).mean() )
print( pd.DataFrame( [len(patient) for patient in patients_lst]).median() )
In [117]:
INPUT_FOLDER+patients[0]
Out[117]:
In [119]:
len( os.listdir(INPUT_FOLDER+patients[0]) )
Out[119]:
In [120]:
len(patients_lst[0])
Out[120]:
In [30]:
def slices_per_patient(input_folder_path):
""" slices_per_patient
@type input_folder_path : Python string
"""
patients = os.listdir(input_folder_path)
patients.sort()
patient_slices_lst = []
for patient in patients:
Nz = len( os.listdir(input_folder_path + patient))
patient_slices_lst.append(Nz)
return patient_slices_lst
In [122]:
slices_per_patient_sample = slices_per_patient( INPUT_FOLDER)
In [125]:
print(len(slices_per_patient_sample))
pd.DataFrame( slices_per_patient_sample).describe()
Out[125]:
In [ ]:
# Load the scans in given folder path
def load_scan(path):
"""
INPUTS/ARGUMENTS
================
@type path : Python string
@type slices : Python list (for each file in a folder/directory of dicom.dataset.FileDataset
each is a slice of the single patient's lung
"""
slices = [dicom.read_file(path + '/' + s) for s in os.listdir(path)]
slices.sort(key = lambda x: float(x.ImagePositionPatient[2]))
try:
slice_thickness = np.abs(slices[0].ImagePositionPatient[2] - slices[1].ImagePositionPatient[2])
except:
slice_thickness = np.abs(slices[0].SliceLocation - slices[1].SliceLocation)
for s in slices:
s.SliceThickness = slice_thickness
return slices
In [158]:
print(type(patients_resampled));print(len(patients_resampled));
print(type(patients_resampled[0]));print(patients_resampled[0].shape);print(patients_resampled[7].shape)
In [93]:
os.listdir( os.getcwd() + "/2017datascibowl/sample_images/")
Out[93]:
In [97]:
# Some constants
INPUT_FOLDER = './2017datascibowl/sample_images/'
patients=os.listdir(INPUT_FOLDER)
patients.sort()
In [98]:
first_patient = load_scan(INPUT_FOLDER + patients[0])
In [99]:
# from Python list of 2-dim. numpy array/matrix to 3-dim. numpy array/matrix
first_patient_pixels = get_pixels_hu(first_patient)
In [163]:
patients_resampled[0][0].shape
Out[163]:
In [165]:
patients_lst[0][0].shape
In [167]:
print(type(patients_resampled[0].shape))
def norm_zero_images(patient_image,MIN_BOUND,MAX_BOUND,PIXEL_MEAN=0.25):
""" norm_zero_images - due normalization and zero center on a 3-dim. numpy array/matrix that
represents all (total) 2-dim. image slices for a single patient"""
Nz = patient_image.shape[0]
new_patient_image = []
for z_idx in range(Nz):
patient_normed=normalize_to_bounds(patient_image[z_idx],MIN_BOUND,MAX_BOUND,PIXEL_MEAN)
patient_normed=zero_center_w_compress_to_bounds(patient_normed,MIN_BOUND,MAX_BOUND,PIXEL_MEAN)
new_patient_image.append( patient_normed)
new_patient_image=np.array(new_patient_image)
return new_patient_image
In [176]:
def norm_zero_images(patient_image,MIN_BOUND,MAX_BOUND,PIXEL_MEAN=0.25):
""" norm_zero_images - due normalization and zero center on a 3-dim. numpy array/matrix that
represents all (total) 2-dim. image slices for a single patient"""
Nz = patient_image.shape[0]
new_patient_image = []
for z_idx in range(Nz):
patient_normed=normalize_to_bounds(patient_image[z_idx],MIN_BOUND,MAX_BOUND,PIXEL_MEAN)
new_patient_image.append( patient_normed)
new_patient_image=np.array(new_patient_image)
return new_patient_image
In [42]:
def norm_zero_images(patient_image,MIN_BOUND,MAX_BOUND,PIXEL_MEAN=0.25):
""" norm_zero_images - due normalization and zero center on a 3-dim. numpy array/matrix that
represents all (total) 2-dim. image slices for a single patient"""
Nz = patient_image.shape[0]
new_patient_image = []
for z_idx in range(Nz):
patient_normed=normalize(patient_image[z_idx],MIN_BOUND,MAX_BOUND)
patient_normed=zero_center(patient_normed,PIXEL_MEAN)
new_patient_image.append( patient_normed)
new_patient_image=np.array(new_patient_image)
return new_patient_image
In [169]:
print(type(patients_resampled))
In [185]:
print(len( patients_resampled_normed ))
print(pd.DataFrame(patients_resampled_normed[0][0]).describe())
pd.DataFrame(patients_resampled_normed[4][6]).describe()
Out[185]:
In [186]:
os.listdir('./2017datascibowl/')
Out[186]:
In [192]:
yhat_ids = pd.read_csv('./2017datascibowl/stage1_labels.csv')
In [193]:
yhat_ids.head()
Out[193]:
In [201]:
yhat_ids_found = yhat_ids.loc[yhat_ids['id'].isin(patients)]
In [209]:
yhat_ids_found;
In [212]:
#yhat_ids_found['id']-pd.DataFrame(patients)
In [218]:
Set( yhat_ids_found['id']).symmetric_difference(patients)
Out[218]:
In [225]:
yhat_ids['id'].iloc[:70];
In [231]:
pd.DataFrame(patients)
Out[231]:
In [245]:
# pd.concat([pd.DataFrame(patients), yhat_ids_found['id']],join="inner",ignore_index=True)
patients[0] in yhat_ids_found['id'].as_matrix()
Out[245]:
In [256]:
m = len(patients)
found_indices =[]
for i in range(m):
if patients[i] in yhat_ids_found['id'].as_matrix():
found_indices.append(i)
In [257]:
patients_resampled_found = [patients[i] for i in found_indices]
In [263]:
for i in range(len(found_indices)):
print(patients_resampled_found[i] in yhat_ids_found['id'].as_matrix())
In [264]:
y_found=[]
for i in range(len(found_indices)):
if (patients_resampled_found[i] in yhat_ids_found['id'].as_matrix()):
y_found.append( yhat_ids_found['cancer'] )
In [294]:
os.listdir(os.getcwd())
Out[294]:
In [126]:
# Some constants
INPUT_FOLDER = './2017datascibowl/sample_images/'
patients=os.listdir(INPUT_FOLDER)
patients.sort()
In [133]:
Nz_lst = slices_per_patient(INPUT_FOLDER)
Nz_tot= int( pd.DataFrame(Nz_lst).median() )
In [134]:
%time patients_lst = [load_scan(INPUT_FOLDER+samplepatient) for samplepatient in patients]
Out[134]:
In [136]:
%time patients_pixels = [get_pixels_hu(patient) for patient in patients_lst]
In [140]:
pixels_and_patients=zip(patients_pixels,patients_lst)
In [152]:
Nx=128
Ny=128
# pix_resampled_given,spacing_given=resample_given_dims( first_patient_pixels, first_patient, [167,128,128])
%time patients_resampled,spacings_new=map(list, \
zip(*[resample_given_dims(patient_pixels,\
patient,\
[Nz_tot,Nx,Ny]) for patient_pixels,patient in pixels_and_patients]))
In [168]:
MIN_BOUND=-1000.0
MAX_BOUND=500.0
PIXEL_MEAN=0.25
In [182]:
patients_resampled_normed = [norm_zero_images(patient_image,\
MIN_BOUND,\
MAX_BOUND,\
PIXEL_MEAN) for patient_image in patients_resampled]
In [292]:
y_found
Out[292]:
In [290]:
print(len(patients_resampled_normed_found))
In [265]:
os.listdir('./2017datascibowl/');
y_ids = pd.read_csv('./2017datascibowl/stage1_labels.csv')
In [266]:
y_ids_found=y_ids.loc[yhat_ids['id'].isin(patients)]
In [267]:
m = len(patients)
found_indices =[]
for i in range(m):
if patients[i] in y_ids_found['id'].as_matrix():
found_indices.append(i)
In [268]:
patients_resampled_found = [patients[i] for i in found_indices]
In [282]:
y_found=[]
for i in range(len(found_indices)):
if (patients_resampled_found[i] in yhat_ids_found['id'].as_matrix()):
cancer_val = y_ids_found.loc[y_ids_found['id']==patients_resampled_found[i]]['cancer'].as_matrix()
y_found.append( cancer_val )
In [288]:
y_found=np.array(y_found).flatten()
In [289]:
patients_resampled_normed_found=[patients_resampled_normed[idx] for idx in found_indices]
In [275]:
patients_resampled_found[2]
Out[275]:
In [287]:
np.array(y_found).flatten()
Out[287]:
In [2]:
os.listdir('./2017datascibowl/')
Out[2]:
In [4]:
stage1_labels_csv = pd.read_csv("./2017datascibowl/stage1_labels.csv")
In [7]:
stage1_labels_csv.head()
Out[7]:
In [8]:
stage1_labels_csv.describe()
Out[8]:
In [12]:
stage1_labels_csv.as_matrix().shape
Out[12]:
In [9]:
stage1_sample_submission_csv = pd.read_csv("./2017datascibowl/stage1_sample_submission.csv")
In [10]:
stage1_sample_submission_csv.head()
Out[10]:
In [11]:
stage1_sample_submission_csv.describe()
Out[11]:
In [13]:
stage1_sample_submission_csv.as_matrix().shape
Out[13]:
In [14]:
stage1_labels_csv.cancer.median()
Out[14]:
In [15]:
stage1_sample_submission_csv["cancer"]
Out[15]:
In [17]:
patients_stage1 = os.listdir('./2017datascibowl/stage1')
In [18]:
print(len(patients_stage1))
In [19]:
stage1_labels_csv["id"].describe()
Out[19]:
In [21]:
stage1_labels_csv["id"].as_matrix()
Out[21]:
In [23]:
len( set(patients_stage1).difference(set(stage1_labels_csv["id"].as_matrix())) )
Out[23]:
In [24]:
set(stage1_labels_csv["id"].as_matrix()).difference( set(patients_stage1))
Out[24]:
In [26]:
len( set(patients_stage1).symmetric_difference(set(stage1_labels_csv["id"].as_matrix())) )
Out[26]:
In [28]:
len(patients_stage1)
Out[28]:
In [31]:
Nz_lst = slices_per_patient('./2017datascibowl/stage1/')
Nz_tot= int( pd.DataFrame(Nz_lst).median() )
In [35]:
patient_0 = load_scan('./2017datascibowl/stage1/'+patients_stage1[0])
In [38]:
patient_pixels_0 = get_pixels_hu(patient_0)
In [41]:
Nx=128
Ny=128
# pix_resampled_given,spacing_given=resample_given_dims( first_patient_pixels, first_patient, [167,128,128])
%time patient_resampled_0,spacing_new_0=resample_given_dims(patient_pixels_0,\
patient_0,\
[Nz_tot,Nx,Ny])
In [47]:
MIN_BOUND=-1000.0
MAX_BOUND=500.0
PIXEL_MEAN=0.25
patient_resampled_norm_0=norm_zero_images(patient_resampled_0,\
MIN_BOUND,\
MAX_BOUND,\
PIXEL_MEAN)
In [51]:
patient_resampled_norm_0.flatten().shape
Out[51]:
In [50]:
spacing_new_0
Out[50]:
In [65]:
print( patient_0[0].ImagePositionPatient )
print( patient_0[0].ImageOrientationPatient )
print( patient_0[1].ImagePositionPatient )
print( patient_0[1].ImageOrientationPatient )
print( patient_0[2].ImagePositionPatient )
print( patient_0[2].ImageOrientationPatient )
In [66]:
patient_0[0].ImageOrientationPatient
Out[66]:
In [54]:
patient_0[1]
Out[54]:
In [61]:
print(patient_0[0].ImagePositionPatient)
print(patient_0[0].SliceThickness)
print(patient_0[0].PixelSpacing)
dir(patient_0[0]);
In [62]:
print(patient_0[0].ImageOrientationPatient)
In [63]:
patient_0[0]
Out[63]:
In [68]:
np.array(patient_0[0].ImagePositionPatient[:2])
Out[68]:
In [67]:
np.array(patient_0[0].ImageOrientationPatient)
Out[67]:
In [71]:
patient_resampled_norm_0.flatten()
Out[71]:
In [70]:
np.concatenate([patient_resampled_norm_0.flatten(),
np.array(patient_0[0].ImagePositionPatient[:2]),
np.array( patient_0[0].ImageOrientationPatient) ] ).shape
Out[70]:
In [54]:
def process_patient(patientname,
MIN_BOUND=-1000.0,
MAX_BOUND=500.0,
PIXEL_MEAN=0.25,
INPUT_FOLDER='./2017datascibowl/stage1/',
N_x=128,N_y=128,):
""" process_patient - process single patient """
patient = load_scan(INPUT_FOLDER +patientname)
Nz_lst = slices_per_patient( INPUT_FOLDER )
print( "Nz_lst: %d" % len(Nz_lst))
Nz_tot= int( pd.DataFrame(Nz_lst).median() )
print( "\n Nz_tot : %d" % Nz_tot)
patient_pixels = get_pixels_hu(patient)
print( patient_pixels.shape)
patient_resampled,spacing_new=resample_given_dims(patient_pixels,patient,[Nz_tot,N_x,N_y])
print(patient_resampled.shape)
patient_resampled_norm=norm_zero_images(patient_resampled,MIN_BOUND,MAX_BOUND,PIXEL_MEAN)
patient_feature_vec = np.concatenate([patient_resampled_norm.flatten(),
np.array(patient[0].ImagePositionPatient[:2]),
np.array( patient[0].ImageOrientationPatient) ] )
return patient_feature_vec
In [41]:
def save_feat_vec(patient_feature_vec,patientname,sub_name="stage1_feat"):
#def save_feat_vec(patient_feature_vec,patientname): original
# f=file( "./2017datascibowl/stage1_feat/"+patientname + "feat_vec" ,"wb")
f=file( "./2017datascibowl/"+sub_name+"/"+patientname + "feat_vec" ,"wb")
np.save(f,patient_feature_vec)
f.close()
In [55]:
patient0_feature_vec = process_patient(patients_stage1[0])
In [43]:
save_feat_vec( patient0_feature_vec, patients_stage1[0])
In [90]:
os.getcwd()
Out[90]:
In [44]:
patient0_feature_vec
Out[44]:
In [93]:
test_patient0_feat_vec=np.load("./2017datascibowl/stage1_feat/"+patients_stage1[0]+"feat_vec")
In [94]:
test_patient0_feat_vec
Out[94]:
In [95]:
test_patient0_feat_vec.shape
Out[95]:
In [96]:
for patient_name in patients_stage1:
patient_feature_vec = process_patient(patient_name)
save_feat_vec( patient_feature_vec, patient_name)
In [97]:
len(patients_stage1)
Out[97]:
In [98]:
patients_stage1_feat = os.listdir('./2017datascibowl/stage1_feat')
print(len(patients_stage1_feat))
In [2]:
# Load the scans in given folder path
def load_scan(path):
"""
INPUTS/ARGUMENTS
================
@type path : Python string
@type slices : Python list (for each file in a folder/directory of dicom.dataset.FileDataset
each is a slice of the single patient's lung
"""
slices = [dicom.read_file(path + '/' + s) for s in os.listdir(path)]
slices.sort(key = lambda x: float(x.ImagePositionPatient[2]))
try:
slice_thickness = np.abs(slices[0].ImagePositionPatient[2] - slices[1].ImagePositionPatient[2])
except:
slice_thickness = np.abs(slices[0].SliceLocation - slices[1].SliceLocation)
for s in slices:
s.SliceThickness = slice_thickness
return slices
In [3]:
def slices_per_patient(input_folder_path):
""" slices_per_patient
@type input_folder_path : Python string
"""
patients = os.listdir(input_folder_path)
patients.sort()
patient_slices_lst = []
for patient in patients:
Nz = len( os.listdir(input_folder_path + patient))
patient_slices_lst.append(Nz)
return patient_slices_lst
In [4]:
def get_pixels_hu(slices):
"""
INPUTS/ARGUMENTS
================
@type slices : Python list of dicom.dataset.FileDataset,
@param slices : each dicom.dataset.FileDataset representing an image "slice" of a single patient's lung
"""
image = np.stack([s.pixel_array for s in slices]) # np.array image.shape (134,512,512)
# Convert to int16 (from sometimes int16),
# should be possible as values should always be low enough (<32k)
image = image.astype(np.int16)
# Set outside-of-scan pixels to 0
# The intercept is usually -1024, so air is approximately 0
# suggested by Henry Wolf to avoid -2048 values
outside_of_image_val = image.min()
image[image == outside_of_image_val] = 0
image[image == -2000] = 0
# Convert to Hounsfield units (HU)
for slice_number in range(len(slices)):
intercept = slices[slice_number].RescaleIntercept
slope = slices[slice_number].RescaleSlope
if slope != 1:
image[slice_number] = slope * image[slice_number].astype(np.float64)
image[slice_number] = image[slice_number].astype(np.int16)
image[slice_number] += np.int16(intercept)
return np.array(image, dtype=np.int16)
In [5]:
def resample_given_dims(image, scan, new_shape):
# Determine current pixel spacing
spacing = np.array([scan[0].SliceThickness] + scan[0].PixelSpacing, dtype=np.float32) # (\Delta z,\Delta x,\Delta y)
real_resize_factor=new_shape/np.array(image.shape).astype("float32")
new_spacing = spacing/real_resize_factor # (\Delta z',\Delta x',\Delta y')
image = scipy.ndimage.interpolation.zoom(image, real_resize_factor, mode='nearest')
return image, new_spacing
In [6]:
def norm_zero_images(patient_image,MIN_BOUND,MAX_BOUND,PIXEL_MEAN=0.25):
""" norm_zero_images - due normalization and zero center on a 3-dim. numpy array/matrix that
represents all (total) 2-dim. image slices for a single patient"""
Nz = patient_image.shape[0]
new_patient_image = []
for z_idx in range(Nz):
patient_normed=normalize(patient_image[z_idx],MIN_BOUND,MAX_BOUND)
patient_normed=zero_center(patient_normed,PIXEL_MEAN)
new_patient_image.append( patient_normed)
new_patient_image=np.array(new_patient_image)
return new_patient_image
In [7]:
def normalize(image,MIN_BOUND,MAX_BOUND):
image = (image - MIN_BOUND)/(MAX_BOUND-MIN_BOUND)
image[image>1]=1.
image[image<0]=0.
return image
In [8]:
def zero_center(image, PIXEL_MEAN):
image = image - PIXEL_MEAN
return image
In [9]:
def process_patient(patientname,
MIN_BOUND=-1000.0,
MAX_BOUND=500.0,
PIXEL_MEAN=0.25,
INPUT_FOLDER='./2017datascibowl/stage1/',
N_x=128,N_y=128,):
""" process_patient - process single patient """
patient = load_scan(INPUT_FOLDER +patientname)
Nz_lst = slices_per_patient( INPUT_FOLDER )
Nz_tot= int( pd.DataFrame(Nz_lst).median() )
patient_pixels = get_pixels_hu(patient)
patient_resampled,spacing_new=resample_given_dims(patient_pixels,patient,[Nz_tot,N_x,N_y])
patient_resampled_norm=norm_zero_images(patient_resampled,MIN_BOUND,MAX_BOUND,PIXEL_MEAN)
patient_feature_vec = np.concatenate([patient_resampled_norm.flatten(),
np.array(patient[0].ImagePositionPatient[:2]),
np.array( patient[0].ImageOrientationPatient) ] )
return patient_feature_vec
In [10]:
def save_feat_vec(patient_feature_vec,patientname,sub_name="stage1_feat"):
#def save_feat_vec(patient_feature_vec,patientname): original
# f=file( "./2017datascibowl/stage1_feat/"+patientname + "feat_vec" ,"wb")
f=file( "./2017datascibowl/"+sub_name+"/"+patientname + "feat_vec" ,"wb")
np.save(f,patient_feature_vec)
f.close()
In [11]:
import os, sys
os.getcwd()
os.listdir( os.getcwd() )
Out[11]:
In [12]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import dicom
import os
import scipy.ndimage
import matplotlib.pyplot as plt
In [13]:
patients_stage1 = os.listdir('./2017datascibowl/stage1')
print(len(patients_stage1))
In [14]:
N_x=32
N_y=32
In [15]:
%time
for patient_name in patients_stage1:
patient_feature_vec = process_patient(patient_name,N_x=N_x,N_y=N_y)
save_feat_vec( patient_feature_vec, patient_name,sub_name="stage1_feat_lowres32")
In [117]:
%time
for patient_name in patients_stage1:
patient_feature_vec = process_patient(patient_name,N_x=N_x,N_y=N_y)
save_feat_vec( patient_feature_vec, patient_name,sub_name="stage1_feat_lowres32")
In [16]:
import time
In [17]:
N_x=64
N_y=64
t1 = time.time()
for patient_name in patients_stage1:
patient_feature_vec = process_patient(patient_name,N_x=N_x,N_y=N_y)
save_feat_vec( patient_feature_vec, patient_name,sub_name="stage1_feat_lowres"+str(N_x))
t2=time.time()
print((t2-t1)*1000*1000.) # in seconds
In [115]:
patient_feature_vec_test = process_patient(patients_stage1[0],N_x=N_x,N_y=N_y)
In [116]:
patient_feature_vec_test.shape
Out[116]:
In [ ]: