Simple script for extracting patches from LUNA16 dataset

This is a first pass. Let's keep things simple. The goal is to just extract 64x64 pixel patches around just the transverse slices with the candidates in the center. We'll have some 700k images. Only about 1,100 patches will have class 1 (true nodule). The remainder will be class 0 (non-nodule). We'll take this data and run it though a modified VGG classifier (done in a second script). If the classifier can make a good class prediction, then we know we've got data that will work with more advanced models (e.g. Faster R-CNN to both localize and classify the candidates in the full slice images)


In [1]:
import SimpleITK as sitk
import numpy as np
import pandas as pd
import os

import ntpath

In [2]:
# To get the data:
# wget https://www.dropbox.com/sh/mtip9dx6zt9nb3z/AAAs2wbJxbNM44-uafZyoMVca/subset5.zip
# The files are 7-zipped. Regular linux unzip won't work to uncompress them. Use 7za instead.
# 7za e subset5.zip

DATA_DIR = "/Volumes/data/tonyr/dicom/LUNA16/"
cand_path = 'CSVFILES/candidates_with_annotations.csv'

In [3]:
def extractCandidates(img_file):
    
    # Get the name of the file
    subjectName = ntpath.splitext(ntpath.basename(img_file))[0]  # Strip off the .mhd extension
    
    # Read the list of candidate ROI
    dfCandidates = pd.read_csv(DATA_DIR+cand_path)
    
    numCandidates = dfCandidates[dfCandidates['seriesuid']==subjectName].shape[0]
    print('There are {} candidate nodules in this file.'.format(numCandidates))
    
    numNonNodules = sum(dfCandidates[dfCandidates['seriesuid']==subjectName]['class'] == 0)
    numNodules = sum(dfCandidates[dfCandidates['seriesuid']==subjectName]['class'] == 1)
    #print('{} are true nodules (class 1) and {} are non-nodules (class 0)'.format(numNodules, numNonNodules))
    
    # Read if the candidate ROI is a nodule (1) or non-nodule (0)
    candidateValues = dfCandidates[dfCandidates['seriesuid']==subjectName]['class'].values
    
    # Get the world coordinates (mm) of the candidate ROI center
    worldCoords = dfCandidates[dfCandidates['seriesuid']==subjectName][['coordX', 'coordY', 'coordZ']].values
    
    # Use SimpleITK to read the mhd image
    itkimage = sitk.ReadImage(img_file)
    
    # Get the real world origin (mm) for this image
    originMatrix = np.tile(itkimage.GetOrigin(), (numCandidates,1))  # Real world origin for this image (0,0)
    
    # Subtract the real world origin and scale by the real world (mm per pixel)
    # This should give us the X,Y,Z coordinates for the candidates
    candidatesPixels = (np.round(np.absolute(worldCoords - originMatrix) / itkimage.GetSpacing())).astype(int)
    
    # Replace the missing diameters with the 50th percentile diameter 
    
    candidateDiameter = dfCandidates['diameter_mm'].fillna(dfCandidates['diameter_mm'].quantile(0.5)).values / itkimage.GetSpacing()[1]
     
    candidatePatches = []
    
    imgAll = sitk.GetArrayFromImage(itkimage) # Read the image volume
    
    for candNum in range(numCandidates):
        
        #print('Extracting candidate patch #{}'.format(candNum))
        candidateVoxel = candidatesPixels[candNum,:]
        xpos = int(candidateVoxel[0])
        ypos = int(candidateVoxel[1])
        zpos = int(candidateVoxel[2])
        
        # Need to handle the candidates where the window would extend beyond the image boundaries
        windowSize = 32
        x_lower = np.max([0, xpos - windowSize])  # Return 0 if position off image
        x_upper = np.min([xpos + windowSize, itkimage.GetWidth()]) # Return  maxWidth if position off image
        
        y_lower = np.max([0, ypos - windowSize])  # Return 0 if position off image
        y_upper = np.min([ypos + windowSize, itkimage.GetHeight()]) # Return  maxHeight if position off image
         
        # SimpleITK is x,y,z. Numpy is z, y, x.
        imgPatch = imgAll[zpos, y_lower:y_upper, x_lower:x_upper]
        
        # Normalize to the Hounsfield units
        # TODO: I don't think we should normalize into Housefield units
        imgPatchNorm = imgPatch #normalizePlanes(imgPatch)
        
        candidatePatches.append(imgPatchNorm)  # Append the candidate image patches to a python list

    return candidatePatches, candidateValues, candidateDiameter

In [ ]:


In [4]:
from scipy.misc import toimage

# We need to save the array as an image.
# This is the easiest way. Matplotlib seems to like adding a white border that is hard to kill.

def SavePatches(img_file, patchesArray, valuesArray):
    
    saveDir = ntpath.dirname(img_file) + '/patches'

    try:
        os.stat(saveDir)
    except:
        os.mkdir(saveDir) 

    subjectName = ntpath.splitext(ntpath.basename(img_file))[0]

    print('Saving image patches for file {}.'.format(subjectName))
    for i in range(len(valuesArray)):

        print('\r{} of {}'.format(i+1, len(valuesArray))),
        im = toimage(patchesArray[i])
        im.save(saveDir + '/{}_{}_{}.png'.format(subjectName, i, valuesArray[i]))

    print('Finished {}'.format(subjectName))

In [5]:
i = 0

for root, dirs, files in os.walk(DATA_DIR):
    for file in files:
        if (file.endswith('.mhd')) & ('__MACOSX' not in root):  # Don't get the Macintosh directory
            
            img_file = os.path.join(root, file)
            patchesArray, valuesArray, noduleDiameter = extractCandidates(img_file)   
             
            SavePatches(img_file, patchesArray, valuesArray)


There are 1129 candidate nodules in this file.
Saving image patches for file 1.3.6.1.4.1.14519.5.2.1.6279.6001.126264578931778258890371755354.
1129 of 1129                                                                                                                                                                                                                                             
Finished 1.3.6.1.4.1.14519.5.2.1.6279.6001.126264578931778258890371755354
There are 1262 candidate nodules in this file.
Saving image patches for file 1.3.6.1.4.1.14519.5.2.1.6279.6001.130438550890816550994739120843.
601 of 1262                                                                                                                 
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-5-b76582fee06c> in <module>()
      8             patchesArray, valuesArray, noduleDiameter = extractCandidates(img_file)
      9 
---> 10             SavePatches(img_file, patchesArray, valuesArray)
     11 
     12             i += 1

<ipython-input-4-3fafdfad72d9> in SavePatches(img_file, patchesArray, valuesArray)
     20         print('\r{} of {}'.format(i+1, len(valuesArray))),
     21         im = toimage(patchesArray[i])
---> 22         im.save(saveDir + '/{}_{}_{}.png'.format(subjectName, valuesArray[i], i))
     23 
     24     print('')

/Users/ganthony/anaconda/envs/neon/lib/python2.7/site-packages/PIL/Image.pyc in save(self, fp, format, **params)
   1687             # do what we can to clean up
   1688             if open_fp:
-> 1689                 fp.close()
   1690 
   1691     def seek(self, frame):

KeyboardInterrupt: 

In [ ]: