Data, Data Science Bowl 2017

cf. https://www.kaggle.com/c/data-science-bowl-2017/data

I had to sudo dnf install p7zip and install p7zip (yet another software repo) just to extract the images.

data_password.txt.zip - contains decryption key for the image files; needed to extract. 9$kAsfpQ*FtH
sample_images.7z - smaller subset of full dataset, provided for people who wish to preview images before large file stage1.7z, File size 781.39 MB



In [1]:

    
import os, sys
os.getcwd()
os.listdir( os.getcwd() )









    Out[1]:





['2017datascibowl',
 'data_password.txt',
 'LSTM_model201702271930.save',
 'cleaning_dueSigmaFin.pyc',
 'LSTM_model201702280608.save',
 '.ipynb_checkpoints',
 'dueSigmaFinancial_kaggle.py',
 'LSTM_model.save',
 'LSTM_model201703012346.save',
 'DatSciBow2017_FullPreprocessTutorial.ipynb',
 'LSTM_model201702282350.save',
 'GRU_model201703022010.save',
 'DueSigmaFin_runs.ipynb',
 'ImagePreprocessing.ipynb',
 'dueSigmaFinancial_local.ipynb',
 'GRU_model201703012348.save',
 'GRU_model201703050709.save',
 'GRU_model201703021741.save',
 'kaggle.ipynb',
 'glass.csv',
 'DatSciBow2017_SVM.ipynb',
 '__init__.py',
 'train.h5',
 'dueSigmaFinancial_local_GRUs.ipynb',
 'cleaning_dueSigmaFin.py']



In [7]:

    
os.listdir( os.getcwd() + "/2017datascibowl/sample_images/")









    Out[7]:





['0d2fcf787026fece4e57be167d079383',
 '0a0c32c9e08cc2ea76a71649de56be6d',
 '0de72529c30fe642bc60dcb75c87f6bd',
 '0b20184e0cd497028bdd155d9fb42dc9',
 '00cba091fa4ad62cc3200a657aeb957e',
 '0acbebb8d463b4b9ca88cf38431aac69',
 '0c0de3749d4fe175b7a5098b060982a1',
 '0ca943d821204ceb089510f836a367fd',
 '0d19f1c627df49eb223771c28548350e',
 '0ddeb08e9c97227853422bd71a2a695e',
 '0c98fcb55e3f36d0c2b6507f62f4c5f1',
 '0a38e7597ca26f9374f8ea2770ba870d',
 '0a099f2549429d29b32f349e95fb2244',
 '0c59313f52304e25d5a7dcf9877633b1',
 '0c9d8314f9c69840e25febabb1229fa4',
 '0c37613214faddf8701ca41e6d43f56e',
 '0d941a3ad6c889ac451caf89c46cb92a',
 '0d06d764d3c07572074d468b4cff954f',
 '0bd0e3056cbf23a1cb7f0f0b18446068',
 '0c60f4b87afcb3e2dfa65abbbf3ef2f9']



In [9]:

    
os.getcwd()









    Out[9]:





'/home/topolo/PropD/MLgrabbag/kaggle'

cf. Full Preprocessing Tutorial



In [2]:

    
%matplotlib inline



In [3]:

    
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import dicom
import os
import scipy.ndimage
import matplotlib.pyplot as plt



In [45]:

    
from skimage import measure, morphology
from mpl_toolkits.mplot3d.art3d import Poly3DCollection



In [3]:

    
# Some constants
INPUT_FOLDER = './2017datascibowl/sample_images/'
patients=os.listdir(INPUT_FOLDER)
patients.sort()



In [13]:

    
patients









    Out[13]:





['00cba091fa4ad62cc3200a657aeb957e',
 '0a099f2549429d29b32f349e95fb2244',
 '0a0c32c9e08cc2ea76a71649de56be6d',
 '0a38e7597ca26f9374f8ea2770ba870d',
 '0acbebb8d463b4b9ca88cf38431aac69',
 '0b20184e0cd497028bdd155d9fb42dc9',
 '0bd0e3056cbf23a1cb7f0f0b18446068',
 '0c0de3749d4fe175b7a5098b060982a1',
 '0c37613214faddf8701ca41e6d43f56e',
 '0c59313f52304e25d5a7dcf9877633b1',
 '0c60f4b87afcb3e2dfa65abbbf3ef2f9',
 '0c98fcb55e3f36d0c2b6507f62f4c5f1',
 '0c9d8314f9c69840e25febabb1229fa4',
 '0ca943d821204ceb089510f836a367fd',
 '0d06d764d3c07572074d468b4cff954f',
 '0d19f1c627df49eb223771c28548350e',
 '0d2fcf787026fece4e57be167d079383',
 '0d941a3ad6c889ac451caf89c46cb92a',
 '0ddeb08e9c97227853422bd71a2a695e',
 '0de72529c30fe642bc60dcb75c87f6bd']

Loading the files



In [14]:

    
# Load the scans in given folder path
def load_scan(path):
    """
    INPUTS/ARGUMENTS 
    ================
    @type path : Python string
    
    @type slices : Python list (for each file in a folder/directory of dicom.dataset.FileDataset  
    each is a slice of the single patient's lung
    """
    slices = [dicom.read_file(path + '/' + s) for s in os.listdir(path)]
    slices.sort(key = lambda x: float(x.ImagePositionPatient[2]))
    try:
        slice_thickness = np.abs(slices[0].ImagePositionPatient[2] - slices[1].ImagePositionPatient[2])
    except:
        slice_thickness = np.abs(slices[0].SliceLocation - slices[1].SliceLocation)
        
    for s in slices:
        s.SliceThickness = slice_thickness
        
    return slices



In [69]:

    
image_test = np.stack([s.pixel_array for s in load_scan(INPUT_FOLDER+patients[0])])



In [71]:

    
print(image_test.shape);print(image_test.dtype)









    



(134, 512, 512)
int16



In [72]:

    
pd.DataFrame( image_test[0]).describe()









    Out[72]:






  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      502
      503
      504
      505
      506
      507
      508
      509
      510
      511
    
  
  
    
      count
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      ...
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
    
    
      mean
      -1817.480469
      -1687.929688
      -1600.953125
      -1523.867188
      -1467.464844
      -1413.097656
      -1359.531250
      -1316.011719
      -1271.882812
      -1227.421875
      ...
      -1234.949219
      -1278.269531
      -1321.933594
      -1366.171875
      -1419.917969
      -1472.910156
      -1527.789062
      -1602.140625
      -1686.277344
      -1816.578125
    
    
      std
      707.955168
      893.832547
      991.304186
      1065.634957
      1114.995762
      1155.741711
      1191.352015
      1217.971285
      1243.673746
      1267.897664
      ...
      1257.301007
      1234.392217
      1209.006865
      1180.214144
      1143.906186
      1104.989222
      1057.894768
      988.719528
      897.840990
      711.176932
    
    
      min
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      ...
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
    
    
      25%
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      ...
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
    
    
      50%
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      ...
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
    
    
      75%
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      58.500000
      178.000000
      ...
      101.500000
      28.500000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
    
    
      max
      1062.000000
      1090.000000
      1068.000000
      1076.000000
      1076.000000
      1084.000000
      1118.000000
      1110.000000
      1118.000000
      1070.000000
      ...
      1070.000000
      1040.000000
      1046.000000
      1108.000000
      1032.000000
      1076.000000
      1092.000000
      1070.000000
      1018.000000
      1038.000000
    
  

8 rows × 512 columns



In [73]:

    
pd.DataFrame( image_test[1]).describe()









    Out[73]:






  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      502
      503
      504
      505
      506
      507
      508
      509
      510
      511
    
  
  
    
      count
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      ...
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
    
    
      mean
      -1820.785156
      -1693.468750
      -1605.542969
      -1526.933594
      -1469.355469
      -1413.863281
      -1360.519531
      -1316.566406
      -1272.644531
      -1228.250000
      ...
      -1234.457031
      -1277.929688
      -1321.339844
      -1366.078125
      -1420.300781
      -1474.128906
      -1528.585938
      -1602.785156
      -1687.871094
      -1816.843750
    
    
      std
      696.348879
      880.072154
      981.704601
      1059.962254
      1111.883407
      1155.108780
      1190.254072
      1217.500664
      1242.583521
      1266.651168
      ...
      1257.511898
      1234.274330
      1209.383344
      1180.197301
      1143.005183
      1102.343928
      1056.282430
      987.581466
      894.045774
      710.258754
    
    
      min
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      ...
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
    
    
      25%
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      ...
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
    
    
      50%
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      ...
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
    
    
      75%
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      51.000000
      165.000000
      ...
      129.500000
      42.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
    
    
      max
      1070.000000
      1048.000000
      1088.000000
      1136.000000
      1072.000000
      1090.000000
      1080.000000
      1102.000000
      1070.000000
      1108.000000
      ...
      1044.000000
      1074.000000
      1042.000000
      1040.000000
      1042.000000
      1050.000000
      1072.000000
      1086.000000
      1064.000000
      1064.000000
    
  

8 rows × 512 columns



In [74]:

    
len(image_test == -2000) # set outside-of-scan pixels to 0









    Out[74]:





134



In [75]:

    
print( image_test.min() )

$N_x=512$, so $N_x\in \mathbb{Z}^+$
$N_y=512$, so $N_y\in \mathbb{Z}^+$
For each patient, $m_{\text{slice}} \in \mathbb{Z}^+$ represents the number of "slices", image "slices" of the single patient's lung.

So first make, for each patient, $(i_{\text{slice}}, P_{\text{arr}}) \in \lbrace 0,1,\dots m_{\text{slice}}-1 \rbrace \times \mathbb{K}^{N_x \times N_y } $



In [37]:

    
def get_pixels_hu(slices):
    """
    INPUTS/ARGUMENTS
    ================
    @type slices : Python list of dicom.dataset.FileDataset, 
    @param slices : each dicom.dataset.FileDataset representing an image "slice" of a single patient's lung
    """
    image = np.stack([s.pixel_array for s in slices]) # np.array image.shape (134,512,512)
    # Convert to int16 (from sometimes int16),
    # should be possible as values should always be low enough (<32k)
    image = image.astype(np.int16)
    
    # Set outside-of-scan pixels to 0
    # The intercept is usually -1024, so air is approximately 0
    # suggested by Henry Wolf to avoid -2048 values
    outside_of_image_val = image.min()
    image[image == outside_of_image_val] = 0 
    
    image[image == -2000] = 0 
    
    # Convert to Hounsfield units (HU)
    for slice_number in range(len(slices)):
        
        intercept = slices[slice_number].RescaleIntercept
        slope = slices[slice_number].RescaleSlope
        
        if slope != 1:
            image[slice_number] = slope * image[slice_number].astype(np.float64)
            image[slice_number] = image[slice_number].astype(np.int16)
            
        image[slice_number] += np.int16(intercept)
        
    return np.array(image, dtype=np.int16)

Let's take a look at 1 of the patients.



In [57]:

    
patients[0]









    Out[57]:





'00cba091fa4ad62cc3200a657aeb957e'



In [58]:

    
first_patient = load_scan(INPUT_FOLDER + patients[0])



In [59]:

    
print(type(first_patient));print(len(first_patient));print(type(first_patient[0]))
print(type(first_patient[0].pixel_array)); print(first_patient[0].pixel_array.shape)









    



<type 'list'>
134
<class 'dicom.dataset.FileDataset'>
<type 'numpy.ndarray'>
(512, 512)



In [94]:

    
first_patient[0]









    Out[94]:





(0008, 0005) Specific Character Set              CS: 'ISO_IR 100'
(0008, 0016) SOP Class UID                       UI: CT Image Storage
(0008, 0018) SOP Instance UID                    UI: 1.2.840.113654.2.55.247817952625791837963403492891187883824
(0008, 0060) Modality                            CS: 'CT'
(0008, 103e) Series Description                  LO: 'Axial'
(0010, 0010) Patient's Name                      PN: '00cba091fa4ad62cc3200a657aeb957e'
(0010, 0020) Patient ID                          LO: '00cba091fa4ad62cc3200a657aeb957e'
(0010, 0030) Patient's Birth Date                DA: '19000101'
(0018, 0050) Slice Thickness                     DS: '2.5'
(0018, 0060) KVP                                 DS: ''
(0020, 000d) Study Instance UID                  UI: 2.25.86208730140539712382771890501772734277950692397709007305473
(0020, 000e) Series Instance UID                 UI: 2.25.11575877329635228925808596800269974740893519451784626046614
(0020, 0011) Series Number                       IS: '3'
(0020, 0012) Acquisition Number                  IS: '1'
(0020, 0013) Instance Number                     IS: '134'
(0020, 0020) Patient Orientation                 CS: ''
(0020, 0032) Image Position (Patient)            DS: ['-145.500000', '-158.199997', '-356.200012']
(0020, 0037) Image Orientation (Patient)         DS: ['1.000000', '0.000000', '0.000000', '0.000000', '1.000000', '0.000000']
(0020, 0052) Frame of Reference UID              UI: 2.25.83033509634441686385652073462983801840121916678417719669650
(0020, 1040) Position Reference Indicator        LO: 'SN'
(0020, 1041) Slice Location                      DS: '-356.200012'
(0028, 0002) Samples per Pixel                   US: 1
(0028, 0004) Photometric Interpretation          CS: 'MONOCHROME2'
(0028, 0010) Rows                                US: 512
(0028, 0011) Columns                             US: 512
(0028, 0030) Pixel Spacing                       DS: ['0.597656', '0.597656']
(0028, 0100) Bits Allocated                      US: 16
(0028, 0101) Bits Stored                         US: 16
(0028, 0102) High Bit                            US: 15
(0028, 0103) Pixel Representation                US: 1
(0028, 0120) Pixel Padding Value                 US: 63536
(0028, 1050) Window Center                       DS: '40'
(0028, 1051) Window Width                        DS: '400'
(0028, 1052) Rescale Intercept                   DS: '-1024'
(0028, 1053) Rescale Slope                       DS: '1'
(7fe0, 0010) Pixel Data                          OW: Array of 524288 bytes



In [95]:

    
dir( first_patient[0] )









    Out[95]:





['AcquisitionNumber',
 'BitsAllocated',
 'BitsStored',
 'Columns',
 'FrameOfReferenceUID',
 'HighBit',
 'ImageOrientationPatient',
 'ImagePositionPatient',
 'InstanceNumber',
 'KVP',
 'Modality',
 'PatientBirthDate',
 'PatientID',
 'PatientName',
 'PatientOrientation',
 'PhotometricInterpretation',
 'PixelData',
 'PixelPaddingValue',
 'PixelRepresentation',
 'PixelSpacing',
 'PositionReferenceIndicator',
 'RescaleIntercept',
 'RescaleSlope',
 'Rows',
 'SOPClassUID',
 'SOPInstanceUID',
 'SamplesPerPixel',
 'SeriesDescription',
 'SeriesInstanceUID',
 'SeriesNumber',
 'SliceLocation',
 'SliceThickness',
 'SpecificCharacterSet',
 'StudyInstanceUID',
 'WindowCenter',
 'WindowWidth',
 '__cmp__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__init__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_character_set',
 '_get_pixel_array',
 '_pixel_data_numpy',
 '_pretty_str',
 'add',
 'add_new',
 'clear',
 'copy',
 'data_element',
 'decode',
 'dir',
 'formatted_lines',
 'fromkeys',
 'get',
 'get_item',
 'group_dataset',
 'has_key',
 'items',
 'iterall',
 'iteritems',
 'iterkeys',
 'itervalues',
 'keys',
 'pixel_array',
 'pop',
 'popitem',
 'remove_private_tags',
 'save_as',
 'setdefault',
 'top',
 'trait_names',
 'update',
 'values',
 'viewitems',
 'viewkeys',
 'viewvalues',
 'walk']



In [59]:

    
first_patient_pixels = get_pixels_hu(first_patient)



In [60]:

    
print(type(first_patient_pixels));print(first_patient_pixels.shape)









    



<type 'numpy.ndarray'>
(134, 512, 512)



In [78]:

    
pd.DataFrame( first_patient[0].pixel_array ).describe()









    Out[78]:






  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      502
      503
      504
      505
      506
      507
      508
      509
      510
      511
    
  
  
    
      count
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      ...
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
    
    
      mean
      -1817.480469
      -1687.929688
      -1600.953125
      -1523.867188
      -1467.464844
      -1413.097656
      -1359.531250
      -1316.011719
      -1271.882812
      -1227.421875
      ...
      -1234.949219
      -1278.269531
      -1321.933594
      -1366.171875
      -1419.917969
      -1472.910156
      -1527.789062
      -1602.140625
      -1686.277344
      -1816.578125
    
    
      std
      707.955168
      893.832547
      991.304186
      1065.634957
      1114.995762
      1155.741711
      1191.352015
      1217.971285
      1243.673746
      1267.897664
      ...
      1257.301007
      1234.392217
      1209.006865
      1180.214144
      1143.906186
      1104.989222
      1057.894768
      988.719528
      897.840990
      711.176932
    
    
      min
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      ...
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
    
    
      25%
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      ...
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
    
    
      50%
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      ...
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
    
    
      75%
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      58.500000
      178.000000
      ...
      101.500000
      28.500000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
      -2000.000000
    
    
      max
      1062.000000
      1090.000000
      1068.000000
      1076.000000
      1076.000000
      1084.000000
      1118.000000
      1110.000000
      1118.000000
      1070.000000
      ...
      1070.000000
      1040.000000
      1046.000000
      1108.000000
      1032.000000
      1076.000000
      1092.000000
      1070.000000
      1018.000000
      1038.000000
    
  

8 rows × 512 columns



In [79]:

    
pd.DataFrame( first_patient_pixels[0] ).describe()









    Out[79]:






  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      502
      503
      504
      505
      506
      507
      508
      509
      510
      511
    
  
  
    
      count
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      ...
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
    
    
      mean
      -966.480469
      -930.679688
      -906.203125
      -883.804688
      -866.464844
      -851.160156
      -836.656250
      -824.386719
      -811.507812
      -798.296875
      ...
      -805.824219
      -817.894531
      -830.308594
      -843.296875
      -857.980469
      -871.910156
      -887.726562
      -907.390625
      -929.027344
      -965.578125
    
    
      std
      224.157330
      274.629669
      304.335981
      328.576139
      345.914498
      358.437364
      369.176445
      376.330536
      384.415168
      392.553492
      ...
      386.340177
      379.173388
      371.286996
      361.194807
      350.621332
      339.373499
      323.425095
      302.642898
      276.957186
      226.777876
    
    
      min
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      ...
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
    
    
      25%
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      ...
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
    
    
      50%
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      ...
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
    
    
      75%
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -965.500000
      -846.000000
      ...
      -922.500000
      -995.500000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
    
    
      max
      38.000000
      66.000000
      44.000000
      52.000000
      52.000000
      60.000000
      94.000000
      86.000000
      94.000000
      46.000000
      ...
      46.000000
      16.000000
      22.000000
      84.000000
      8.000000
      52.000000
      68.000000
      46.000000
      -6.000000
      14.000000
    
  

8 rows × 512 columns



In [61]:

    
plt.hist(first_patient_pixels.flatten(),bins=80,color='c')
plt.xlabel("Hounsfield Units (HU)")
plt.ylabel("Frequency")
plt.show()  

# Show some slice in the middle
plt.imshow(first_patient_pixels[80],cmap=plt.cm.gray)
plt.show()



In [32]:

    
print(type(first_patient_pixels));print(first_patient_pixels.shape);print(first_patient_pixels.dtype)









    



<type 'numpy.ndarray'>
(134, 512, 512)
int16



In [35]:

    
pd.DataFrame(first_patient_pixels[0]).describe()









    Out[35]:






  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      502
      503
      504
      505
      506
      507
      508
      509
      510
      511
    
  
  
    
      count
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      ...
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
      512.000000
    
    
      mean
      -966.480469
      -930.679688
      -906.203125
      -883.804688
      -866.464844
      -851.160156
      -836.656250
      -824.386719
      -811.507812
      -798.296875
      ...
      -805.824219
      -817.894531
      -830.308594
      -843.296875
      -857.980469
      -871.910156
      -887.726562
      -907.390625
      -929.027344
      -965.578125
    
    
      std
      224.157330
      274.629669
      304.335981
      328.576139
      345.914498
      358.437364
      369.176445
      376.330536
      384.415168
      392.553492
      ...
      386.340177
      379.173388
      371.286996
      361.194807
      350.621332
      339.373499
      323.425095
      302.642898
      276.957186
      226.777876
    
    
      min
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      ...
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
    
    
      25%
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      ...
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
    
    
      50%
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      ...
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
    
    
      75%
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -965.500000
      -846.000000
      ...
      -922.500000
      -995.500000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
    
    
      max
      38.000000
      66.000000
      44.000000
      52.000000
      52.000000
      60.000000
      94.000000
      86.000000
      94.000000
      46.000000
      ...
      46.000000
      16.000000
      22.000000
      84.000000
      8.000000
      52.000000
      68.000000
      46.000000
      -6.000000
      14.000000
    
  

8 rows × 512 columns

Resampling

A scan may have a pixel spacing of [2.5, 0.5, 0.5], which means that the distance between slices is 2.5 millimeters. For a different scan this may be [1.5, 0.725, 0.725], this can be problematic for automatic analysis (e.g. using ConvNets)!



In [62]:

    
def resample(image, scan, new_spacing=[1,1,1]):
    # Determine current pixel spacing
    spacing = np.array([scan[0].SliceThickness] + scan[0].PixelSpacing, dtype=np.float32)
    
    resize_factor = spacing / new_spacing
    new_real_shape = image.shape * resize_factor
    new_shape = np.round(new_real_shape)
    real_resize_factor = new_shape / image.shape
    new_spacing = spacing / real_resize_factor
    
    image = scipy.ndimage.interpolation.zoom(image, real_resize_factor, mode='nearest')
    
    return image, new_spacing

Let's resample our patient's pixels to an isomorphic resolution of 1 by 1 by 1 mm.



In [63]:

    
pix_resampled, spacing = resample(first_patient_pixels, first_patient, [1,1,1])



In [64]:

    
print("Shape before resampling\t", first_patient_pixels.shape)
print("Shape after resample\t",pix_resampled.shape)









    



('Shape before resampling\t', (134, 512, 512))
('Shape after resample\t', (335, 306, 306))



In [39]:

    
print(first_patient[0].SliceThickness);print(first_patient[0].PixelSpacing)









    



2.5
['0.597656', '0.597656']



In [40]:

    
np.array( [2.5]+first_patient[0].PixelSpacing)/[1,1,1]









    Out[40]:





array([ 2.5     ,  0.597656,  0.597656])



In [42]:

    
first_patient_pixels.shape









    Out[42]:





(134, 512, 512)



In [81]:

    
# My stuff, each scan or image "slice" is a dicom.dataSet.FileDataSet 
print(first_patient[0].SliceThickness)
print(first_patient[0].PixelSpacing)









    



2.5
['0.597656', '0.597656']



In [83]:

    
print( np.array( [first_patient[0].SliceThickness]+first_patient[0].PixelSpacing)/ [1,1,1] )









    



[ 2.5       0.597656  0.597656]



In [82]:

    
first_patient_pixels.shape









    Out[82]:





(134, 512, 512)



In [100]:

    
first_patient_pixels.shape * np.array( [first_patient[0].SliceThickness]+first_patient[0].PixelSpacing)/ [1,1,1]









    Out[100]:





array([ 335.      ,  305.999872,  305.999872])



In [101]:

    
np.round( first_patient_pixels.shape * np.array( [first_patient[0].SliceThickness]+first_patient[0].PixelSpacing)/ [1,1,1]  )









    Out[101]:





array([ 335.,  306.,  306.])



In [103]:

    
np.round( first_patient_pixels.shape * np.array( [first_patient[0].SliceThickness]+first_patient[0].PixelSpacing)/ [1,1,1]  ) / first_patient_pixels.shape









    Out[103]:





array([ 2.5       ,  0.59765625,  0.59765625])



In [102]:

    
np.round( first_patient_pixels.shape * np.array( [first_patient[0].SliceThickness]+first_patient[0].PixelSpacing)/ [1,1,1]  ) / first_patient_pixels.shape









    Out[102]:





array([ 2.5       ,  0.59765625,  0.59765625])



In [86]:

    
print(type(pix_resampled));print(type(spacing));print(pix_resampled.shape);print(spacing.shape); print(spacing)









    



<type 'numpy.ndarray'>
<type 'numpy.ndarray'>
(335, 306, 306)
(3,)
[ 1.         0.9999996  0.9999996]



In [88]:

    
for image_slice in first_patient:
    print(image_slice.pixel_array.shape)









    



(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 512)



In [107]:

    
[1000,2000,3000]/np.array(first_patient_pixels.shape).astype("float32")









    Out[107]:





array([ 7.46268657,  3.90625   ,  5.859375  ])



In [79]:

    
def resample_given_dims(image, scan, new_shape):
    # Determine current pixel spacing
    spacing = np.array([scan[0].SliceThickness] + scan[0].PixelSpacing, dtype=np.float32)  # (\Delta z,\Delta x,\Delta y)
#    print(spacing)
    real_resize_factor=new_shape/np.array(image.shape).astype("float32")  
#    print(real_resize_factor)
    new_spacing = spacing/real_resize_factor # (\Delta z',\Delta x',\Delta y')
    
#    real_resize_factor_zoom=np.round(real_resize_factor)  # original
    real_resize_factor_zoom = real_resize_factor
    print(real_resize_factor_zoom)
    image = scipy.ndimage.interpolation.zoom(image, real_resize_factor_zoom, mode='nearest')
    
    return image, new_spacing



In [80]:

    
# pix_resampled, spacing = resample(first_patient_pixels, first_patient, [1,1,1])
pix_resampled_given,spacing_given=resample_given_dims( first_patient_pixels, first_patient, [167,128,128])









    



[ 1.24626866  0.25        0.25      ]



In [81]:

    
pd.DataFrame(pix_resampled_given[0]).describe()









    Out[81]:






  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      118
      119
      120
      121
      122
      123
      124
      125
      126
      127
    
  
  
    
      count
      128.000000
      128.000000
      128.000000
      128.000000
      128.000000
      128.000000
      128.000000
      128.000000
      128.000000
      128.000000
      ...
      128.000000
      128.000000
      128.000000
      128.000000
      128.000000
      128.000000
      128.000000
      128.000000
      128.000000
      128.000000
    
    
      mean
      -965.992188
      -865.835938
      -816.781250
      -766.789062
      -728.289062
      -700.179688
      -672.375000
      -641.742188
      -615.875000
      -584.078125
      ...
      -598.710938
      -624.515625
      -652.859375
      -673.984375
      -706.140625
      -731.085938
      -773.171875
      -818.945312
      -871.093750
      -965.960938
    
    
      std
      225.862947
      347.119594
      379.754499
      408.096921
      424.471235
      436.535383
      446.371901
      458.378780
      462.784894
      480.708853
      ...
      482.882147
      470.250930
      457.787606
      447.755100
      436.335487
      425.987265
      405.612458
      380.930219
      342.545921
      224.568261
    
    
      min
      -1024.000000
      -1028.000000
      -1029.000000
      -1025.000000
      -1042.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1039.000000
      ...
      -1032.000000
      -1030.000000
      -1028.000000
      -1026.000000
      -1025.000000
      -1043.000000
      -1025.000000
      -1029.000000
      -1028.000000
      -1024.000000
    
    
      25%
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      ...
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
    
    
      50%
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1023.000000
      -979.000000
      ...
      -990.500000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
      -1024.000000
    
    
      75%
      -1024.000000
      -1024.000000
      -949.000000
      -164.250000
      -151.750000
      -123.500000
      -128.500000
      -100.750000
      -95.000000
      -79.000000
      ...
      -82.500000
      -101.500000
      -107.500000
      -112.750000
      -121.000000
      -125.250000
      -167.750000
      -977.250000
      -1024.000000
      -1024.000000
    
    
      max
      2.000000
      53.000000
      -16.000000
      -9.000000
      10.000000
      -33.000000
      16.000000
      -4.000000
      15.000000
      148.000000
      ...
      107.000000
      129.000000
      57.000000
      0.000000
      -21.000000
      -9.000000
      -19.000000
      7.000000
      50.000000
      -55.000000
    
  

8 rows × 128 columns



In [74]:

    
spacing_given









    Out[74]:





array([ 2.00598802,  2.39062405,  2.39062405])



In [ ]:

    
#def resample(image, scan, new_spacing=[1,1,1]):

3D plotting the scan

Use marching cubes to create an approximate mesh for our 3D object, and plot this with matplotlib. Quite slow and ugly, but the best we can do.



In [47]:

    
def plot_3d(image,threshold=-300):
    
    # Position the scan upright,
    # so the head of the patient would be at the top facing the camera
    p = image.transpose(2,1,0)
    
    verts, faces = measure.marching_cubes(p, threshold)
    
    fig = plt.figure(figsize=(10,10))
    ax  = fig.add_subplot(111, projection='3d')
    
    # Fancy indexing: `verts[faces]` to generate a collection of triangles
    mesh = Poly3DCollection(verts[faces], alpha=0.70)
    face_color = [0.45, 0.45, 0.75]
    mesh.set_facecolor(face_color)
    ax.add_collection3d(mesh)
    
    ax.set_xlim(0, p.shape[0])
    ax.set_ylim(0, p.shape[1])
    ax.set_zlim(0, p.shape[2])
    
    plt.show()

Our plot function tkaes a threshold argument which we can use to plot certain structures, such as all tissue or only the bones. 400 is a good threshold for showing the bones only (see Hounsfield unit table above).



In [48]:

    
plot_3d(pix_resampled,400)

Lung segmentation

In order to reduce the problem space, we can segment the lungs (and usually some tissue around it).

The steps:

Threshold the image (-320 HU is a good threshold, but it doesn't matter much for this approach)
Do connected components, determine label of air around person, fill this with 1s in the binary image
Optionally: For every axial slice in the scan, determine the largest solid connected component (the body+air around the person), and set others to 0. This fills the structures in the lungs in the mask.
Keep only the largest air pocket (the human body has other pockets of air here and there).



In [92]:

    
help(measure.label)









    



Help on function label in module skimage.measure._label:

label(input, neighbors=None, background=None, return_num=False, connectivity=None)
    Label connected regions of an integer array.
    
    Two pixels are connected when they are neighbors and have the same value.
    In 2D, they can be neighbors either in a 1- or 2-connected sense.
    The value refers to the maximum number of orthogonal hops to consider a
    pixel/voxel a neighbor::
    
      1-connectivity      2-connectivity     diagonal connection close-up
    
           [ ]           [ ]  [ ]  [ ]         [ ]
            |               \  |  /             |  <- hop 2
      [ ]--[x]--[ ]      [ ]--[x]--[ ]    [x]--[ ]
            |               /  |  \         hop 1
           [ ]           [ ]  [ ]  [ ]
    
    Parameters
    ----------
    input : ndarray of dtype int
        Image to label.
    neighbors : {4, 8}, int, optional
        Whether to use 4- or 8-"connectivity".
        In 3D, 4-"connectivity" means connected pixels have to share face,
        whereas with 8-"connectivity", they have to share only edge or vertex.
        **Deprecated, use ``connectivity`` instead.**
    background : int, optional
        Consider all pixels with this value as background pixels, and label
        them as 0. By default, 0-valued pixels are considered as background
        pixels.
    return_num : bool, optional
        Whether to return the number of assigned labels.
    connectivity : int, optional
        Maximum number of orthogonal hops to consider a pixel/voxel
        as a neighbor.
        Accepted values are ranging from  1 to input.ndim. If ``None``, a full
        connectivity of ``input.ndim`` is used.
    
    Returns
    -------
    labels : ndarray of dtype int
        Labeled array, where all connected regions are assigned the
        same integer value.
    num : int, optional
        Number of labels, which equals the maximum label index and is only
        returned if return_num is `True`.
    
    Examples
    --------
    >>> import numpy as np
    >>> x = np.eye(3).astype(int)
    >>> print(x)
    [[1 0 0]
     [0 1 0]
     [0 0 1]]
    >>> from skimage.measure import label
    >>> print(label(x, connectivity=1))
    [[1 0 0]
     [0 2 0]
     [0 0 3]]
    
    >>> print(label(x, connectivity=2))
    [[1 0 0]
     [0 1 0]
     [0 0 1]]
    
    >>> print(label(x, background=-1))
    [[1 2 2]
     [2 1 2]
     [2 2 1]]
    
    >>> x = np.array([[1, 0, 0],
    ...               [1, 1, 5],
    ...               [0, 0, 0]])
    
    >>> print(label(x))
    [[1 0 0]
     [1 1 2]
     [0 0 0]]



In [49]:

    
def largest_label_volume(im, bg=-1):
    vals, counts = np.unique(im, return_counts=True)
    
    counts = counts[vals != bg]
    vals = vals[vals != bg]
    
    if len(counts) > 0:
        return vals[np.argmax(counts)]
    else:
        return None
    
def segment_lung_mask(image, fill_lung_structures=True):
    
    # no actually binary, but 1 and 2.
    # 0 is treated as background, which we do not want
    binary_image = np.array(image > -320, dtype=np.int8)+1
    labels=measure.label(binary_image)
    
    # Pick the pixel in the very corner to determine which label is air.
    #   Improvement: Pick multiple background labels from around the patient
    #   More resitant to "trays" on which the patient lays cutting the air
    #   around the person in half
    background_label = labels[0,0,0]
    
    # Fill the air around the person
    binary_image[background_label == labels] = 2
    
    
    # Method of filling the lung structures (that is superior to something like 
    # morphological closing)
    if fill_lung_structures:
        # For every slice we determine the largest solid structure
        for i, axial_slice in enumerate(binary_image):
            axial_slice = axial_slice - 1
            labeling = measure.label(axial_slice)
            l_max = largest_label_volume(labeling, bg=0)
            
            if l_max is not None: # This slice contains some lung
                binary_image[i][labeling != l_max] = 1 
                
    binary_image -= 1 # Make the image actual binary
    binary_image = 1-binary_image # Invert it, lungs are now 1
    
    # Remove other air pockets insided body
    labels = measure.label(binary_image, background=0)
    l_max = largest_label_volume(labels, bg=0)
    if l_max is not None: # There are air pockets
        binary_image[labels != l_max] =0 
        
    return binary_image



In [ ]:

    
# look at bolek's comment
#  l_max = largest_label_volume(labels, bg=-1)



In [ ]:

    
# Gerome Pistre
def largest_label_volume(im, bg=-1):
    vals, counts = np.unique(im, return_counts=True)
    
    counts = counts[vals != bg]
    vals = vals[vals != bg]
    
    biggest=vals[np.argmax(counts)]
    return biggest



In [ ]:

    
def segment_lung_mask(image, fill_lung_structures=True):
    
    # no actually binary, but 1 and 2.
    # 0 is treated as background, which we do not want
    binary_image = np.array(image > -320, dtype=np.int8)+1
    labels=measure.label(binary_image)
    
    # Pick the pixel in the very corner to determine which label is air.
    #   Improvement: Pick multiple background labels from around the patient
    #   More resitant to "trays" on which the patient lays cutting the air
    #   around the person in half
    background_label = labels[0,0,0]
    
    # Fill the air around the person
    binary_image[background_label == labels] = 2
    
    
    # Method of filling the lung structures (that is superior to something like 
    # morphological closing)
    if fill_lung_structures:
        # For every slice we determine the largest solid structure
        for i, axial_slice in enumerate(binary_image):
            axial_slice = axial_slice - 1
            labeling = measure.label(axial_slice)
            l_max = largest_label_volume(labeling, bg=0)
            
            if l_max is not None: # This slice contains some lung
                binary_image[i][labeling != l_max] = 1 
                
    binary_image -= 1 # Make the image actual binary
    binary_image = 1-binary_image # Invert it, lungs are now 1
    
    # Remove other air pockets insided body
    labels = measure.label(binary_image, background=0)
    l_max = largest_label_volume(labels, bg=0)
    if l_max is not None: # There are air pockets
        binary_image[labels != l_max] =0 
        
    return binary_image



In [50]:

    
segmented_lungs = segment_lung_mask(pix_resampled, False)
segmented_lungs_fill = segment_lung_mask(pix_resampled, True)



In [51]:

    
plot_3d(segmented_lungs,0)



In [52]:

    
plot_3d(segmented_lungs_fill,0)



In [53]:

    
plot_3d(segmented_lungs_fill - segmented_lungs,0)



In [91]:

    
plot_3d(pix_resampled,0)









    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-91-8314a2fbf8b5> in <module>()
----> 1 plot_3d(pix_resampled,0)

<ipython-input-47-22dee27382f2> in plot_3d(image, threshold)
     20     ax.set_zlim(0, p.shape[2])
     21 
---> 22     plt.show()

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/matplotlib/pyplot.pyc in show(*args, **kw)
    251     """
    252     global _show
--> 253     return _show(*args, **kw)
    254 
    255 

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/ipykernel/pylab/backend_inline.pyc in show(close, block)
     34     try:
     35         for figure_manager in Gcf.get_all_fig_managers():
---> 36             display(figure_manager.canvas.figure)
     37     finally:
     38         show._to_draw = []

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/IPython/core/display.pyc in display(*objs, **kwargs)
    156             publish_display_data(data=obj, metadata=metadata)
    157         else:
--> 158             format_dict, md_dict = format(obj, include=include, exclude=exclude)
    159             if not format_dict:
    160                 # nothing to display (e.g. _ipython_display_ took over)

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/IPython/core/formatters.pyc in format(self, obj, include, exclude)
    175             md = None
    176             try:
--> 177                 data = formatter(obj)
    178             except:
    179                 # FIXME: log the exception

<decorator-gen-9> in __call__(self, obj)

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/IPython/core/formatters.pyc in catch_format_error(method, self, *args, **kwargs)
    220     """show traceback on failed format call"""
    221     try:
--> 222         r = method(self, *args, **kwargs)
    223     except NotImplementedError:
    224         # don't warn on NotImplementedErrors

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/IPython/core/formatters.pyc in __call__(self, obj)
    337                 pass
    338             else:
--> 339                 return printer(obj)
    340             # Finally look for special method names
    341             method = _safe_get_formatter_method(obj, self.print_method)

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/IPython/core/pylabtools.pyc in <lambda>(fig)
    224 
    225     if 'png' in formats:
--> 226         png_formatter.for_type(Figure, lambda fig: print_figure(fig, 'png', **kwargs))
    227     if 'retina' in formats or 'png2x' in formats:
    228         png_formatter.for_type(Figure, lambda fig: retina_figure(fig, **kwargs))

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/IPython/core/pylabtools.pyc in print_figure(fig, fmt, bbox_inches, **kwargs)
    115 
    116     bytes_io = BytesIO()
--> 117     fig.canvas.print_figure(bytes_io, **kw)
    118     data = bytes_io.getvalue()
    119     if fmt == 'svg':

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/matplotlib/backend_bases.pyc in print_figure(self, filename, dpi, facecolor, edgecolor, orientation, format, **kwargs)
   2242                 orientation=orientation,
   2243                 bbox_inches_restore=_bbox_inches_restore,
-> 2244                 **kwargs)
   2245         finally:
   2246             if bbox_inches and restore_bbox:

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/matplotlib/backends/backend_agg.pyc in print_png(self, filename_or_obj, *args, **kwargs)
    543 
    544     def print_png(self, filename_or_obj, *args, **kwargs):
--> 545         FigureCanvasAgg.draw(self)
    546         renderer = self.get_renderer()
    547         original_dpi = renderer.dpi

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/matplotlib/backends/backend_agg.pyc in draw(self)
    462 
    463         try:
--> 464             self.figure.draw(self.renderer)
    465         finally:
    466             RendererAgg.lock.release()

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/matplotlib/artist.pyc in draw_wrapper(artist, renderer, *args, **kwargs)
     61     def draw_wrapper(artist, renderer, *args, **kwargs):
     62         before(artist, renderer)
---> 63         draw(artist, renderer, *args, **kwargs)
     64         after(artist, renderer)
     65 

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/matplotlib/figure.pyc in draw(self, renderer)
   1141 
   1142             mimage._draw_list_compositing_images(
-> 1143                 renderer, self, dsu, self.suppressComposite)
   1144 
   1145             renderer.close_group('figure')

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/matplotlib/image.pyc in _draw_list_compositing_images(renderer, parent, dsu, suppress_composite)
    137     if not_composite or not has_images:
    138         for zorder, a in dsu:
--> 139             a.draw(renderer)
    140     else:
    141         # Composite any adjacent images together

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/mpl_toolkits/mplot3d/axes3d.pyc in draw(self, renderer)
    269         # Calculate projection of collections and zorder them
    270         zlist = [(col.do_3d_projection(renderer), col) \
--> 271                  for col in self.collections]
    272         zlist.sort(key=itemgetter(0), reverse=True)
    273         for i, (z, col) in enumerate(zlist):

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/mpl_toolkits/mplot3d/art3d.pyc in do_3d_projection(self, renderer)
    640             PolyCollection.set_verts_and_codes(self, segments_2d, codes)
    641         else:
--> 642             PolyCollection.set_verts(self, segments_2d)
    643 
    644         self._facecolors2d = [fc for z, s, fc, ec, idx in z_segments_2d]

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/matplotlib/collections.pyc in set_verts(self, verts, closed)
    939                         xy = np.ma.concatenate([xy, xy[0:1]])
    940                     else:
--> 941                         xy = np.asarray(xy)
    942                         xy = np.concatenate([xy, xy[0:1]])
    943                     codes = np.empty(xy.shape[0], dtype=mpath.Path.code_type)

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/numpy/core/numeric.pyc in asarray(a, dtype, order)
    480 
    481     """
--> 482     return array(a, dtype, copy=False, order=order)
    483 
    484 def asanyarray(a, dtype=None, order=None):

KeyboardInterrupt:



In [90]:

    
print(type(segmented_lungs));print(type(segmented_lungs_fill));
print(segmented_lungs.shape);print(segmented_lungs_fill.shape)









    



<type 'numpy.ndarray'>
<type 'numpy.ndarray'>
(335, 306, 306)
(335, 306, 306)

Normalization

Our values currently range from -1024 to around 2000. Anything above 400 is not interesting to us, as these are simply bones with different radiodensity. Here's some code you can use:



In [54]:

    
# MANUALLY change MIN_BOUND, MAX_BOUND
MIN_BOUND=-1000.0  
MAX_BOUND=400.0

def normalize(image):
    image = (image - MIN_BOUND)/(MAX_BOUND-MIN_BOUND)
    image[image>1]=1.
    image[image<0]=0.
    return image

Zero centering

As a final preprocessing step, it's advisory to 0 center your data so that your mean value is 0. To do this you simply subtract the mean pixel value from all pixels.

To determine this mean you simply average all images in the whole dataset. If that sounds like a lot of work, we found this to be around 0.25 in the LUNA16 competition.

Warning: Do not zero center with the mean per image (like is done in some kernels on here). The CT scanners are calibrated to return accurate HU measurements. There is no such thing as an image with lower contrast or brightness like in normal pictures.



In [55]:

    
PIXEL_MEAN = 0.25

def zero_center(image):
    image = image - PIXEL_MEAN
    return image



In [ ]:

    
# from rolanddog
# https://www.kaggle.com/gzuidhof/data-science-bowl-2017/full-preprocessing-tutorial
# zero-center, and preserve compressibility

MIN_BOUND = -1000.0
MAX_BOUND = 400.0
PIXEL_MEAN = 0.25
PIXEL_CORR = int((MAX_BOUND -MIN_BOUND)*PIXEL_MEAN) # in this case, 350

def zero_center_w_compress(image):
    image = image - PIXEL_CORR
    return image



In [ ]:

    
def normalize(image):
    image = (image - MIN_BOUND)/(MAX_BOUND-MIN_BOUND)
    image[image>(1-PIXEL_MEAN)]=1.
    image[image<(0-PIXEL_MEAN)]=0.
    return image



In [159]:

    
def normalize_to_bounds(image,MIN_BOUND,MAX_BOUND,PIXEL_MEAN=0.25):
    image = (image - MIN_BOUND)/(MAX_BOUND-MIN_BOUND)
    image[image>(1-PIXEL_MEAN)]=1.
    image[image<(0-PIXEL_MEAN)]=0.
    return image



In [44]:

    
def normalize(image,MIN_BOUND,MAX_BOUND):
    image = (image - MIN_BOUND)/(MAX_BOUND-MIN_BOUND)
    image[image>1]=1.
    image[image<0]=0.
    return image



In [160]:

    
def zero_center_w_compress_to_bounds(image,MIN_BOUND,MAX_BOUND,PIXEL_MEAN=0.25):
    PIXEL_CORR = int((MAX_BOUND -MIN_BOUND)*PIXEL_MEAN)
    image = image - PIXEL_CORR
    return image



In [45]:

    
def zero_center(image, PIXEL_MEAN):
    image = image - PIXEL_MEAN
    return image



In [65]:

    
import skimage



In [66]:

    
skimage.morphology.binary_dilation









    Out[66]:





<function skimage.morphology.binary.binary_dilation>

General advice from Guido Zuidhof: http://pubs.rsna.org/doi/pdf/10.1148/radiol.11091710



In [ ]:



In [104]:

    
len(first_patient)









    Out[104]:





134



In [105]:

    
%time patients_lst = [load_scan(INPUT_FOLDER+samplepatient) for samplepatient in patients]



In [112]:

    
pd.DataFrame( [len(patient) for patient in patients_lst] ).describe()



In [116]:

    
print( pd.DataFrame( [len(patient) for patient in patients_lst]).mean() )
print( pd.DataFrame( [len(patient) for patient in patients_lst]).median() )









    



0    180.2
dtype: float64
0    167.5
dtype: float64



In [117]:

    
INPUT_FOLDER+patients[0]









    Out[117]:





'./2017datascibowl/sample_images/00cba091fa4ad62cc3200a657aeb957e'



In [119]:

    
len( os.listdir(INPUT_FOLDER+patients[0]) )









    Out[119]:





134



In [120]:

    
len(patients_lst[0])









    Out[120]:





134



In [30]:

    
def slices_per_patient(input_folder_path):
    """ slices_per_patient
    @type input_folder_path : Python string 
    """
    patients = os.listdir(input_folder_path)
    patients.sort()
    
    patient_slices_lst = []
    
    for patient in patients:
        Nz = len( os.listdir(input_folder_path + patient))
        patient_slices_lst.append(Nz)
        
    return patient_slices_lst



In [122]:

    
slices_per_patient_sample = slices_per_patient( INPUT_FOLDER)



In [125]:

    
print(len(slices_per_patient_sample))
pd.DataFrame( slices_per_patient_sample).describe()



In [ ]:

    
# Load the scans in given folder path
def load_scan(path):
    """
    INPUTS/ARGUMENTS 
    ================
    @type path : Python string
    
    @type slices : Python list (for each file in a folder/directory of dicom.dataset.FileDataset  
    each is a slice of the single patient's lung
    """
    slices = [dicom.read_file(path + '/' + s) for s in os.listdir(path)]
    slices.sort(key = lambda x: float(x.ImagePositionPatient[2]))
    try:
        slice_thickness = np.abs(slices[0].ImagePositionPatient[2] - slices[1].ImagePositionPatient[2])
    except:
        slice_thickness = np.abs(slices[0].SliceLocation - slices[1].SliceLocation)
        
    for s in slices:
        s.SliceThickness = slice_thickness
        
    return slices



In [158]:

    
print(type(patients_resampled));print(len(patients_resampled));
print(type(patients_resampled[0]));print(patients_resampled[0].shape);print(patients_resampled[7].shape)









    



<type 'list'>
20
<type 'numpy.ndarray'>
(167, 128, 128)
(167, 128, 128)



In [93]:

    
os.listdir( os.getcwd() + "/2017datascibowl/sample_images/")









    Out[93]:





['0d2fcf787026fece4e57be167d079383',
 '0a0c32c9e08cc2ea76a71649de56be6d',
 '0de72529c30fe642bc60dcb75c87f6bd',
 '0b20184e0cd497028bdd155d9fb42dc9',
 '00cba091fa4ad62cc3200a657aeb957e',
 '0acbebb8d463b4b9ca88cf38431aac69',
 '0c0de3749d4fe175b7a5098b060982a1',
 '0ca943d821204ceb089510f836a367fd',
 '0d19f1c627df49eb223771c28548350e',
 '0ddeb08e9c97227853422bd71a2a695e',
 '0c98fcb55e3f36d0c2b6507f62f4c5f1',
 '0a38e7597ca26f9374f8ea2770ba870d',
 '0a099f2549429d29b32f349e95fb2244',
 '0c59313f52304e25d5a7dcf9877633b1',
 '0c9d8314f9c69840e25febabb1229fa4',
 '0c37613214faddf8701ca41e6d43f56e',
 '0d941a3ad6c889ac451caf89c46cb92a',
 '0d06d764d3c07572074d468b4cff954f',
 '0bd0e3056cbf23a1cb7f0f0b18446068',
 '0c60f4b87afcb3e2dfa65abbbf3ef2f9']



In [97]:

    
# Some constants
INPUT_FOLDER = './2017datascibowl/sample_images/'
patients=os.listdir(INPUT_FOLDER)
patients.sort()



In [98]:

    
first_patient = load_scan(INPUT_FOLDER + patients[0])



In [99]:

    
# from Python list of 2-dim. numpy array/matrix to 3-dim. numpy array/matrix
first_patient_pixels = get_pixels_hu(first_patient)



In [163]:

    
patients_resampled[0][0].shape









    Out[163]:





(128, 128)



In [165]:

    
patients_lst[0][0].shape









    



---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-165-fcda05cde827> in <module>()
----> 1 patients_lst[0][0].shape

/home/topolo/.local/lib/python2.7/site-packages/dicom/dataset.pyc in __getattr__(self, name)
    250         if tag is None:
    251             raise AttributeError("Dataset does not have attribute "
--> 252                                  "'{0:s}'.".format(name))
    253         tag = Tag(tag)
    254         if tag not in self:

AttributeError: Dataset does not have attribute 'shape'.



In [167]:

    
print(type(patients_resampled[0].shape))
def norm_zero_images(patient_image,MIN_BOUND,MAX_BOUND,PIXEL_MEAN=0.25):
    """ norm_zero_images - due normalization and zero center on a 3-dim. numpy array/matrix that 
    represents all (total) 2-dim. image slices for a single patient"""
    Nz = patient_image.shape[0]
    new_patient_image = []
    for z_idx in range(Nz):
        patient_normed=normalize_to_bounds(patient_image[z_idx],MIN_BOUND,MAX_BOUND,PIXEL_MEAN)
        patient_normed=zero_center_w_compress_to_bounds(patient_normed,MIN_BOUND,MAX_BOUND,PIXEL_MEAN)
        new_patient_image.append( patient_normed)
    new_patient_image=np.array(new_patient_image)
    return new_patient_image









    



<type 'tuple'>



In [176]:

    
def norm_zero_images(patient_image,MIN_BOUND,MAX_BOUND,PIXEL_MEAN=0.25):
    """ norm_zero_images - due normalization and zero center on a 3-dim. numpy array/matrix that 
    represents all (total) 2-dim. image slices for a single patient"""
    Nz = patient_image.shape[0]
    new_patient_image = []
    for z_idx in range(Nz):
        patient_normed=normalize_to_bounds(patient_image[z_idx],MIN_BOUND,MAX_BOUND,PIXEL_MEAN)
        new_patient_image.append( patient_normed)
    new_patient_image=np.array(new_patient_image)
    return new_patient_image



In [42]:

    
def norm_zero_images(patient_image,MIN_BOUND,MAX_BOUND,PIXEL_MEAN=0.25):
    """ norm_zero_images - due normalization and zero center on a 3-dim. numpy array/matrix that 
    represents all (total) 2-dim. image slices for a single patient"""
    Nz = patient_image.shape[0]
    new_patient_image = []
    for z_idx in range(Nz):
        patient_normed=normalize(patient_image[z_idx],MIN_BOUND,MAX_BOUND)
        patient_normed=zero_center(patient_normed,PIXEL_MEAN)
        new_patient_image.append( patient_normed)
    new_patient_image=np.array(new_patient_image)
    return new_patient_image



In [169]:

    
print(type(patients_resampled))









    



<type 'list'>



In [185]:

    
print(len( patients_resampled_normed ))
print(pd.DataFrame(patients_resampled_normed[0][0]).describe())
pd.DataFrame(patients_resampled_normed[4][6]).describe()









    



20
              0           1           2           3           4           5    \
count  128.000000  128.000000  128.000000  128.000000  128.000000  128.000000   
mean    -0.212516   -0.147667   -0.116010   -0.083693   -0.058500   -0.040557   
std      0.146754    0.225505    0.246671    0.265201    0.275713    0.283895   
min     -0.250000   -0.250000   -0.250000   -0.250000   -0.250000   -0.250000   
25%     -0.250000   -0.250000   -0.250000   -0.250000   -0.250000   -0.250000   
50%     -0.250000   -0.250000   -0.250000   -0.250000   -0.250000   -0.250000   
75%     -0.250000   -0.250000   -0.216000    0.307167    0.315500    0.334333   
max      0.418000    0.452000    0.406000    0.410667    0.423333    0.394667   

              6           7           8           9       ...             118  \
count  128.000000  128.000000  128.000000  128.000000     ...      128.000000   
mean    -0.022510   -0.002526    0.014297    0.035125     ...        0.025484   
std      0.290399    0.298349    0.301242    0.313253     ...        0.314878   
min     -0.250000   -0.250000   -0.250000   -0.250000     ...       -0.250000   
25%     -0.250000   -0.250000   -0.250000   -0.250000     ...       -0.250000   
50%     -0.250000   -0.250000   -0.250000   -0.236000     ...       -0.243667   
75%      0.331000    0.349500    0.353333    0.364000     ...        0.361667   
max      0.427333    0.414000    0.426667    0.515333     ...        0.488000   

              119         120         121         122         123         124  \
count  128.000000  128.000000  128.000000  128.000000  128.000000  128.000000   
mean     0.008688   -0.009792   -0.023198   -0.044302   -0.060365   -0.087927   
std      0.306360    0.298038    0.291064    0.283720    0.276820    0.263664   
min     -0.250000   -0.250000   -0.250000   -0.250000   -0.250000   -0.250000   
25%     -0.250000   -0.250000   -0.250000   -0.250000   -0.250000   -0.250000   
50%     -0.250000   -0.250000   -0.250000   -0.250000   -0.250000   -0.250000   
75%      0.349000    0.345000    0.341500    0.336000    0.333167    0.304833   
max      0.502667    0.454667    0.416667    0.402667    0.410667    0.404000   

              125         126         127  
count  128.000000  128.000000  128.000000  
mean    -0.117469   -0.151094   -0.212516  
std      0.247553    0.222542    0.145872  
min     -0.250000   -0.250000   -0.250000  
25%     -0.250000   -0.250000   -0.250000  
50%     -0.250000   -0.250000   -0.250000  
75%     -0.234833   -0.250000   -0.250000  
max      0.421333    0.450000    0.380000  

[8 rows x 128 columns]






    Out[185]:






  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      118
      119
      120
      121
      122
      123
      124
      125
      126
      127
    
  
  
    
      count
      128.000000
      128.000000
      128.000000
      128.000000
      128.000000
      128.000000
      128.000000
      128.000000
      128.000000
      128.000000
      ...
      128.000000
      128.000000
      128.000000
      128.000000
      128.000000
      128.000000
      128.000000
      128.000000
      128.000000
      128.000000
    
    
      mean
      -0.230599
      -0.230245
      -0.231547
      -0.231187
      -0.230729
      -0.230807
      -0.175568
      -0.144021
      -0.118594
      -0.097125
      ...
      0.000677
      -0.021240
      -0.039714
      -0.066349
      -0.096964
      -0.125469
      -0.170786
      -0.214474
      -0.233083
      -0.232932
    
    
      std
      0.053464
      0.062904
      0.060050
      0.059088
      0.064771
      0.050273
      0.182362
      0.213400
      0.237917
      0.251674
      ...
      0.300971
      0.289309
      0.275276
      0.259251
      0.248110
      0.229557
      0.171742
      0.076303
      0.062620
      0.052833
    
    
      min
      -0.250000
      -0.250000
      -0.250000
      -0.250000
      -0.250000
      -0.250000
      -0.250000
      -0.250000
      -0.250000
      -0.250000
      ...
      -0.250000
      -0.250000
      -0.250000
      -0.250000
      -0.250000
      -0.250000
      -0.250000
      -0.250000
      -0.250000
      -0.250000
    
    
      25%
      -0.250000
      -0.250000
      -0.250000
      -0.250000
      -0.250000
      -0.250000
      -0.250000
      -0.250000
      -0.250000
      -0.250000
      ...
      -0.249500
      -0.249333
      -0.250000
      -0.250000
      -0.250000
      -0.250000
      -0.250000
      -0.250000
      -0.250000
      -0.250000
    
    
      50%
      -0.248000
      -0.246667
      -0.248000
      -0.247333
      -0.247000
      -0.247667
      -0.243667
      -0.243000
      -0.243333
      -0.241667
      ...
      -0.220333
      -0.221000
      -0.224667
      -0.231000
      -0.238000
      -0.243667
      -0.247000
      -0.245000
      -0.249000
      -0.249000
    
    
      75%
      -0.234500
      -0.232333
      -0.235667
      -0.236167
      -0.236333
      -0.231833
      -0.209500
      -0.196500
      -0.187667
      -0.177833
      ...
      0.360833
      0.350167
      0.336333
      0.150333
      -0.075000
      -0.174167
      -0.208167
      -0.218667
      -0.237333
      -0.237333
    
    
      max
      0.214667
      0.423333
      0.372667
      0.354000
      0.448667
      0.194000
      0.483333
      0.444667
      0.447333
      0.461333
      ...
      0.508667
      0.451333
      0.463333
      0.450667
      0.445333
      0.444667
      0.424000
      0.403333
      0.427333
      0.165333
    
  

8 rows × 128 columns



In [186]:

    
os.listdir('./2017datascibowl/')









    Out[186]:





['stage1_sample_submission.csv',
 'stage1_sample_submission.csv.zip',
 'stage1_labels.csv',
 'stage1_labels.csv.zip',
 'sample_images']



In [192]:

    
yhat_ids = pd.read_csv('./2017datascibowl/stage1_labels.csv')



In [193]:

    
yhat_ids.head()









    Out[193]:






  
    
      
      id
      cancer
    
  
  
    
      0
      0015ceb851d7251b8f399e39779d1e7d
      1
    
    
      1
      0030a160d58723ff36d73f41b170ec21
      0
    
    
      2
      003f41c78e6acfa92430a057ac0b306e
      0
    
    
      3
      006b96310a37b36cccb2ab48d10b49a3
      1
    
    
      4
      008464bb8521d09a42985dd8add3d0d2
      1



In [201]:

    
yhat_ids_found = yhat_ids.loc[yhat_ids['id'].isin(patients)]



In [209]:

    
yhat_ids_found;



In [212]:

    
#yhat_ids_found['id']-pd.DataFrame(patients)



In [218]:

    
Set( yhat_ids_found['id']).symmetric_difference(patients)









    Out[218]:





Set(['0b20184e0cd497028bdd155d9fb42dc9'])



In [225]:

    
yhat_ids['id'].iloc[:70];



In [231]:

    
pd.DataFrame(patients)









    Out[231]:






  
    
      
      0
    
  
  
    
      0
      00cba091fa4ad62cc3200a657aeb957e
    
    
      1
      0a099f2549429d29b32f349e95fb2244
    
    
      2
      0a0c32c9e08cc2ea76a71649de56be6d
    
    
      3
      0a38e7597ca26f9374f8ea2770ba870d
    
    
      4
      0acbebb8d463b4b9ca88cf38431aac69
    
    
      5
      0b20184e0cd497028bdd155d9fb42dc9
    
    
      6
      0bd0e3056cbf23a1cb7f0f0b18446068
    
    
      7
      0c0de3749d4fe175b7a5098b060982a1
    
    
      8
      0c37613214faddf8701ca41e6d43f56e
    
    
      9
      0c59313f52304e25d5a7dcf9877633b1
    
    
      10
      0c60f4b87afcb3e2dfa65abbbf3ef2f9
    
    
      11
      0c98fcb55e3f36d0c2b6507f62f4c5f1
    
    
      12
      0c9d8314f9c69840e25febabb1229fa4
    
    
      13
      0ca943d821204ceb089510f836a367fd
    
    
      14
      0d06d764d3c07572074d468b4cff954f
    
    
      15
      0d19f1c627df49eb223771c28548350e
    
    
      16
      0d2fcf787026fece4e57be167d079383
    
    
      17
      0d941a3ad6c889ac451caf89c46cb92a
    
    
      18
      0ddeb08e9c97227853422bd71a2a695e
    
    
      19
      0de72529c30fe642bc60dcb75c87f6bd



In [245]:

    
# pd.concat([pd.DataFrame(patients), yhat_ids_found['id']],join="inner",ignore_index=True)
patients[0] in yhat_ids_found['id'].as_matrix()









    Out[245]:





True



In [256]:

    
m = len(patients)
found_indices =[]
for i in range(m):
    if patients[i] in yhat_ids_found['id'].as_matrix():
        found_indices.append(i)



In [257]:

    
patients_resampled_found = [patients[i] for i in found_indices]



In [263]:

    
for i in range(len(found_indices)):
    print(patients_resampled_found[i] in yhat_ids_found['id'].as_matrix())









    



True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True



In [264]:

    
y_found=[]
for i in range(len(found_indices)):
    if (patients_resampled_found[i] in yhat_ids_found['id'].as_matrix()):
        y_found.append( yhat_ids_found['cancer'] )



In [294]:

    
os.listdir(os.getcwd())









    Out[294]:





['2017datascibowl',
 'data_password.txt.zip',
 'data_password.txt',
 'stage1.7z',
 'LSTM_model201702271930.save',
 'cleaning_dueSigmaFin.pyc',
 'LSTM_model201702280608.save',
 '.ipynb_checkpoints',
 'dueSigmaFinancial_kaggle.py',
 'LSTM_model.save',
 'LSTM_model201703012346.save',
 'glass-classification.zip',
 'DatSciBow2017_FullPreprocessTutorial.ipynb',
 'sample_images.7z',
 'LSTM_model201702282350.save',
 'GRU_model201703022010.save',
 'DueSigmaFin_runs.ipynb',
 'ImagePreprocessing.ipynb',
 'dueSigmaFinancial_local.ipynb',
 'GRU_model201703012348.save',
 'GRU_model201703050709.save',
 'GRU_model201703021741.save',
 'kaggle.ipynb',
 'glass.csv',
 'train.h5.zip',
 '__init__.py',
 'train.h5',
 'stage1.torrent',
 'dueSigmaFinancial_local_GRUs.ipynb',
 'cleaning_dueSigmaFin.py']

Summary



In [126]:

    
# Some constants
INPUT_FOLDER = './2017datascibowl/sample_images/'
patients=os.listdir(INPUT_FOLDER)
patients.sort()



In [133]:

    
Nz_lst = slices_per_patient(INPUT_FOLDER)
Nz_tot= int( pd.DataFrame(Nz_lst).median() )



In [134]:

    
%time patients_lst = [load_scan(INPUT_FOLDER+samplepatient) for samplepatient in patients]









    Out[134]:





167



In [136]:

    
%time patients_pixels = [get_pixels_hu(patient) for patient in patients_lst]









    



CPU times: user 4.23 s, sys: 450 ms, total: 4.68 s
Wall time: 5.94 s



In [140]:

    
pixels_and_patients=zip(patients_pixels,patients_lst)



In [152]:

    
Nx=128
Ny=128
# pix_resampled_given,spacing_given=resample_given_dims( first_patient_pixels, first_patient, [167,128,128])
%time patients_resampled,spacings_new=map(list, \
                                          zip(*[resample_given_dims(patient_pixels,\
                                                                    patient,\
                                                                  [Nz_tot,Nx,Ny]) for patient_pixels,patient in pixels_and_patients]))









    



CPU times: user 1min 9s, sys: 4.83 s, total: 1min 14s
Wall time: 1min 14s



In [168]:

    
MIN_BOUND=-1000.0
MAX_BOUND=500.0
PIXEL_MEAN=0.25



In [182]:

    
patients_resampled_normed = [norm_zero_images(patient_image,\
                  MIN_BOUND,\
                  MAX_BOUND,\
                  PIXEL_MEAN) for patient_image in patients_resampled]



In [292]:

    
y_found









    Out[292]:





array([0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0])



In [290]:

    
print(len(patients_resampled_normed_found))

Get the y value (outcomes), by matching patient IDs



In [265]:

    
os.listdir('./2017datascibowl/');
y_ids = pd.read_csv('./2017datascibowl/stage1_labels.csv')



In [266]:

    
y_ids_found=y_ids.loc[yhat_ids['id'].isin(patients)]



In [267]:

    
m = len(patients)
found_indices =[]
for i in range(m):
    if patients[i] in y_ids_found['id'].as_matrix():
        found_indices.append(i)



In [268]:

    
patients_resampled_found = [patients[i] for i in found_indices]



In [282]:

    
y_found=[]
for i in range(len(found_indices)):
    if (patients_resampled_found[i] in yhat_ids_found['id'].as_matrix()):
        cancer_val = y_ids_found.loc[y_ids_found['id']==patients_resampled_found[i]]['cancer'].as_matrix()
        y_found.append( cancer_val )



In [288]:

    
y_found=np.array(y_found).flatten()



In [289]:

    
patients_resampled_normed_found=[patients_resampled_normed[idx] for idx in found_indices]



In [275]:

    
patients_resampled_found[2]









    Out[275]:





'0a0c32c9e08cc2ea76a71649de56be6d'



In [287]:

    
np.array(y_found).flatten()









    Out[287]:





array([0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0])

Process Whole



In [2]:

    
os.listdir('./2017datascibowl/')









    Out[2]:





['stage1_sample_submission.csv',
 'stage1_sample_submission.csv.zip',
 'stage1_labels.csv',
 'stage1_labels.csv.zip',
 'stage1',
 'sample_images']



In [4]:

    
stage1_labels_csv = pd.read_csv("./2017datascibowl/stage1_labels.csv")



In [7]:

    
stage1_labels_csv.head()









    Out[7]:






  
    
      
      id
      cancer
    
  
  
    
      0
      0015ceb851d7251b8f399e39779d1e7d
      1
    
    
      1
      0030a160d58723ff36d73f41b170ec21
      0
    
    
      2
      003f41c78e6acfa92430a057ac0b306e
      0
    
    
      3
      006b96310a37b36cccb2ab48d10b49a3
      1
    
    
      4
      008464bb8521d09a42985dd8add3d0d2
      1



In [8]:

    
stage1_labels_csv.describe()









    Out[8]:






  
    
      
      cancer
    
  
  
    
      count
      1397.000000
    
    
      mean
      0.259127
    
    
      std
      0.438312
    
    
      min
      0.000000
    
    
      25%
      0.000000
    
    
      50%
      0.000000
    
    
      75%
      1.000000
    
    
      max
      1.000000



In [12]:

    
stage1_labels_csv.as_matrix().shape









    Out[12]:





(1397, 2)



In [9]:

    
stage1_sample_submission_csv = pd.read_csv("./2017datascibowl/stage1_sample_submission.csv")



In [10]:

    
stage1_sample_submission_csv.head()









    Out[10]:






  
    
      
      id
      cancer
    
  
  
    
      0
      026470d51482c93efc18b9803159c960
      0.5
    
    
      1
      031b7ec4fe96a3b035a8196264a8c8c3
      0.5
    
    
      2
      03bd22ed5858039af223c04993e9eb22
      0.5
    
    
      3
      06a90409e4fcea3e634748b967993531
      0.5
    
    
      4
      07b1defcfae5873ee1f03c90255eb170
      0.5



In [11]:

    
stage1_sample_submission_csv.describe()



In [13]:

    
stage1_sample_submission_csv.as_matrix().shape









    Out[13]:





(198, 2)



In [14]:

    
stage1_labels_csv.cancer.median()









    Out[14]:





0.0



In [15]:

    
stage1_sample_submission_csv["cancer"]









    Out[15]:





0      0.5
1      0.5
2      0.5
3      0.5
4      0.5
5      0.5
6      0.5
7      0.5
8      0.5
9      0.5
10     0.5
11     0.5
12     0.5
13     0.5
14     0.5
15     0.5
16     0.5
17     0.5
18     0.5
19     0.5
20     0.5
21     0.5
22     0.5
23     0.5
24     0.5
25     0.5
26     0.5
27     0.5
28     0.5
29     0.5
      ... 
168    0.5
169    0.5
170    0.5
171    0.5
172    0.5
173    0.5
174    0.5
175    0.5
176    0.5
177    0.5
178    0.5
179    0.5
180    0.5
181    0.5
182    0.5
183    0.5
184    0.5
185    0.5
186    0.5
187    0.5
188    0.5
189    0.5
190    0.5
191    0.5
192    0.5
193    0.5
194    0.5
195    0.5
196    0.5
197    0.5
Name: cancer, dtype: float64

Stage 1 File I/O



In [17]:

    
patients_stage1 = os.listdir('./2017datascibowl/stage1')



In [18]:

    
print(len(patients_stage1))

Stage 1 labels



In [19]:

    
stage1_labels_csv["id"].describe()









    Out[19]:





count                                 1397
unique                                1397
top       733205c5d0bbf19f5c761e0c023bf9a0
freq                                     1
Name: id, dtype: object



In [21]:

    
stage1_labels_csv["id"].as_matrix()









    Out[21]:





array(['0015ceb851d7251b8f399e39779d1e7d',
       '0030a160d58723ff36d73f41b170ec21',
       '003f41c78e6acfa92430a057ac0b306e', ...,
       'fe5c37e82b412833b8ad0abb57978377',
       'ff5d8e90500cf324e7b04a2f07cf0399',
       'ffe02fe7d2223743f7fb455dfaff3842'], dtype=object)



In [23]:

    
len( set(patients_stage1).difference(set(stage1_labels_csv["id"].as_matrix())) )









    Out[23]:





198



In [24]:

    
set(stage1_labels_csv["id"].as_matrix()).difference( set(patients_stage1))









    Out[24]:





set()



In [26]:

    
len( set(patients_stage1).symmetric_difference(set(stage1_labels_csv["id"].as_matrix()))  )









    Out[26]:





198



In [28]:

    
len(patients_stage1)









    Out[28]:





1595



In [31]:

    
Nz_lst = slices_per_patient('./2017datascibowl/stage1/')
Nz_tot= int( pd.DataFrame(Nz_lst).median() )



In [35]:

    
patient_0 = load_scan('./2017datascibowl/stage1/'+patients_stage1[0])



In [38]:

    
patient_pixels_0 = get_pixels_hu(patient_0)



In [41]:

    
Nx=128
Ny=128
# pix_resampled_given,spacing_given=resample_given_dims( first_patient_pixels, first_patient, [167,128,128])
%time patient_resampled_0,spacing_new_0=resample_given_dims(patient_pixels_0,\
                                                                    patient_0,\
                                                                  [Nz_tot,Nx,Ny])









    



CPU times: user 2.64 s, sys: 0 ns, total: 2.64 s
Wall time: 2.53 s



In [47]:

    
MIN_BOUND=-1000.0
MAX_BOUND=500.0
PIXEL_MEAN=0.25

patient_resampled_norm_0=norm_zero_images(patient_resampled_0,\
                  MIN_BOUND,\
                  MAX_BOUND,\
                  PIXEL_MEAN)



In [51]:

    
patient_resampled_norm_0.flatten().shape









    Out[51]:





(2621440,)



In [50]:

    
spacing_new_0









    Out[50]:





array([ 1.75      ,  3.20312405,  3.20312405])



In [65]:

    
print( patient_0[0].ImagePositionPatient )
print( patient_0[0].ImageOrientationPatient )
print( patient_0[1].ImagePositionPatient )
print( patient_0[1].ImageOrientationPatient )
print( patient_0[2].ImagePositionPatient )
print( patient_0[2].ImageOrientationPatient )









    



['-202.899994', '-238.300003', '-294.119995']
['1.000000', '0.000000', '0.000000', '0.000000', '1.000000', '0.000000']
['-202.899994', '-238.300003', '-291.619995']
['1.000000', '0.000000', '0.000000', '0.000000', '1.000000', '0.000000']
['-202.899994', '-238.300003', '-289.119995']
['1.000000', '0.000000', '0.000000', '0.000000', '1.000000', '0.000000']



In [66]:

    
patient_0[0].ImageOrientationPatient









    Out[66]:





['1.000000', '0.000000', '0.000000', '0.000000', '1.000000', '0.000000']



In [54]:

    
patient_0[1]









    Out[54]:





(0008, 0005) Specific Character Set              CS: 'ISO_IR 100'
(0008, 0016) SOP Class UID                       UI: CT Image Storage
(0008, 0018) SOP Instance UID                    UI: 1.2.840.113654.2.55.300863971221291381526736132231562757912
(0008, 0060) Modality                            CS: 'CT'
(0008, 103e) Series Description                  LO: 'Axial'
(0010, 0010) Patient's Name                      PN: '29883252ce0ce3e052ce8cf4932623af'
(0010, 0020) Patient ID                          LO: '29883252ce0ce3e052ce8cf4932623af'
(0010, 0030) Patient's Birth Date                DA: '19000101'
(0018, 0050) Slice Thickness                     DS: '2.5'
(0018, 0060) KVP                                 DS: ''
(0020, 000d) Study Instance UID                  UI: 2.25.51329352781293882739991701446693132102457555138147446375828
(0020, 000e) Series Instance UID                 UI: 2.25.41601713747120404656977043488053459221091478746332764467723
(0020, 0011) Series Number                       IS: '2'
(0020, 0012) Acquisition Number                  IS: '1'
(0020, 0013) Instance Number                     IS: '111'
(0020, 0020) Patient Orientation                 CS: ''
(0020, 0032) Image Position (Patient)            DS: ['-202.899994', '-238.300003', '-291.619995']
(0020, 0037) Image Orientation (Patient)         DS: ['1.000000', '0.000000', '0.000000', '0.000000', '1.000000', '0.000000']
(0020, 0052) Frame of Reference UID              UI: 2.25.89782087814212394559238020497773895608588481827041446535689
(0020, 1040) Position Reference Indicator        LO: 'SN'
(0020, 1041) Slice Location                      DS: '-291.619995'
(0028, 0002) Samples per Pixel                   US: 1
(0028, 0004) Photometric Interpretation          CS: 'MONOCHROME2'
(0028, 0010) Rows                                US: 512
(0028, 0011) Columns                             US: 512
(0028, 0030) Pixel Spacing                       DS: ['0.800781', '0.800781']
(0028, 0100) Bits Allocated                      US: 16
(0028, 0101) Bits Stored                         US: 16
(0028, 0102) High Bit                            US: 15
(0028, 0103) Pixel Representation                US: 1
(0028, 0120) Pixel Padding Value                 US: 63536
(0028, 1050) Window Center                       DS: '40'
(0028, 1051) Window Width                        DS: '400'
(0028, 1052) Rescale Intercept                   DS: '-1024'
(0028, 1053) Rescale Slope                       DS: '1'
(7fe0, 0010) Pixel Data                          OW: Array of 524288 bytes



In [61]:

    
print(patient_0[0].ImagePositionPatient)
print(patient_0[0].SliceThickness)
print(patient_0[0].PixelSpacing)
dir(patient_0[0]);









    



['-202.899994', '-238.300003', '-294.119995']
2.5
['0.800781', '0.800781']



In [62]:

    
print(patient_0[0].ImageOrientationPatient)









    



['1.000000', '0.000000', '0.000000', '0.000000', '1.000000', '0.000000']



In [63]:

    
patient_0[0]









    Out[63]:





(0008, 0005) Specific Character Set              CS: 'ISO_IR 100'
(0008, 0016) SOP Class UID                       UI: CT Image Storage
(0008, 0018) SOP Instance UID                    UI: 1.2.840.113654.2.55.158257041710393617309832979737744229653
(0008, 0060) Modality                            CS: 'CT'
(0008, 103e) Series Description                  LO: 'Axial'
(0010, 0010) Patient's Name                      PN: '29883252ce0ce3e052ce8cf4932623af'
(0010, 0020) Patient ID                          LO: '29883252ce0ce3e052ce8cf4932623af'
(0010, 0030) Patient's Birth Date                DA: '19000101'
(0018, 0050) Slice Thickness                     DS: '2.5'
(0018, 0060) KVP                                 DS: ''
(0020, 000d) Study Instance UID                  UI: 2.25.51329352781293882739991701446693132102457555138147446375828
(0020, 000e) Series Instance UID                 UI: 2.25.41601713747120404656977043488053459221091478746332764467723
(0020, 0011) Series Number                       IS: '2'
(0020, 0012) Acquisition Number                  IS: '1'
(0020, 0013) Instance Number                     IS: '112'
(0020, 0020) Patient Orientation                 CS: ''
(0020, 0032) Image Position (Patient)            DS: ['-202.899994', '-238.300003', '-294.119995']
(0020, 0037) Image Orientation (Patient)         DS: ['1.000000', '0.000000', '0.000000', '0.000000', '1.000000', '0.000000']
(0020, 0052) Frame of Reference UID              UI: 2.25.89782087814212394559238020497773895608588481827041446535689
(0020, 1040) Position Reference Indicator        LO: 'SN'
(0020, 1041) Slice Location                      DS: '-294.119995'
(0028, 0002) Samples per Pixel                   US: 1
(0028, 0004) Photometric Interpretation          CS: 'MONOCHROME2'
(0028, 0010) Rows                                US: 512
(0028, 0011) Columns                             US: 512
(0028, 0030) Pixel Spacing                       DS: ['0.800781', '0.800781']
(0028, 0100) Bits Allocated                      US: 16
(0028, 0101) Bits Stored                         US: 16
(0028, 0102) High Bit                            US: 15
(0028, 0103) Pixel Representation                US: 1
(0028, 0120) Pixel Padding Value                 US: 63536
(0028, 1050) Window Center                       DS: '40'
(0028, 1051) Window Width                        DS: '400'
(0028, 1052) Rescale Intercept                   DS: '-1024'
(0028, 1053) Rescale Slope                       DS: '1'
(7fe0, 0010) Pixel Data                          OW: Array of 524288 bytes



In [68]:

    
np.array(patient_0[0].ImagePositionPatient[:2])









    Out[68]:





array([-202.899994, -238.300003])



In [67]:

    
np.array(patient_0[0].ImageOrientationPatient)









    Out[67]:





array([ 1.,  0.,  0.,  0.,  1.,  0.])



In [71]:

    
patient_resampled_norm_0.flatten()









    Out[71]:





array([-0.25, -0.25, -0.25, ..., -0.25, -0.25, -0.25])



In [70]:

    
np.concatenate([patient_resampled_norm_0.flatten(),
                np.array(patient_0[0].ImagePositionPatient[:2]),
                np.array( patient_0[0].ImageOrientationPatient) ] ).shape









    Out[70]:





(2621448,)



In [54]:

    
def process_patient(patientname,    
                                MIN_BOUND=-1000.0,
                                MAX_BOUND=500.0,
                                PIXEL_MEAN=0.25,
                                INPUT_FOLDER='./2017datascibowl/stage1/',
                                N_x=128,N_y=128,):
    """ process_patient - process single patient """
    patient = load_scan(INPUT_FOLDER +patientname)
    
    Nz_lst = slices_per_patient( INPUT_FOLDER )
    print( "Nz_lst: %d" % len(Nz_lst))
    Nz_tot= int( pd.DataFrame(Nz_lst).median() )
    print( "\n Nz_tot : %d" % Nz_tot)
    patient_pixels = get_pixels_hu(patient)
    print( patient_pixels.shape)
    
    patient_resampled,spacing_new=resample_given_dims(patient_pixels,patient,[Nz_tot,N_x,N_y]) 
    print(patient_resampled.shape)
    patient_resampled_norm=norm_zero_images(patient_resampled,MIN_BOUND,MAX_BOUND,PIXEL_MEAN)
    
    patient_feature_vec = np.concatenate([patient_resampled_norm.flatten(),
                                            np.array(patient[0].ImagePositionPatient[:2]),
                                            np.array( patient[0].ImageOrientationPatient) ] )
    return patient_feature_vec



In [41]:

    
def save_feat_vec(patient_feature_vec,patientname,sub_name="stage1_feat"):
#def save_feat_vec(patient_feature_vec,patientname): original
    #    f=file( "./2017datascibowl/stage1_feat/"+patientname + "feat_vec" ,"wb")
    f=file( "./2017datascibowl/"+sub_name+"/"+patientname + "feat_vec" ,"wb")
    np.save(f,patient_feature_vec)
    f.close()



In [55]:

    
patient0_feature_vec = process_patient(patients_stage1[0])









    



Nz_lst: 1595

 Nz_tot : 160
(112, 512, 512)
(112, 0, 0)



In [43]:

    
save_feat_vec( patient0_feature_vec, patients_stage1[0])



In [90]:

    
os.getcwd()









    Out[90]:





'/home/topolo/PropD/MLgrabbag/kaggle'



In [44]:

    
patient0_feature_vec









    Out[44]:





array([-202.899994, -238.300003,    1.      ,    0.      ,    0.      ,
          0.      ,    1.      ,    0.      ])



In [93]:

    
test_patient0_feat_vec=np.load("./2017datascibowl/stage1_feat/"+patients_stage1[0]+"feat_vec")



In [94]:

    
test_patient0_feat_vec









    Out[94]:





array([-0.25, -0.25, -0.25, ...,  0.  ,  1.  ,  0.  ])



In [95]:

    
test_patient0_feat_vec.shape









    Out[95]:





(2621448,)



In [96]:

    
for patient_name in patients_stage1:
    patient_feature_vec = process_patient(patient_name)
    save_feat_vec( patient_feature_vec, patient_name)









    



/home/topolo/Public/anaconda2/lib/python2.7/site-packages/scipy/ndimage/interpolation.py:568: UserWarning: From scipy 0.13.0, the output shape of zoom() is calculated with round() instead of int() - for these inputs the size of the returned array has changed.
  "the returned array has changed.", UserWarning)



In [97]:

    
len(patients_stage1)









    Out[97]:





1595



In [98]:

    
patients_stage1_feat = os.listdir('./2017datascibowl/stage1_feat')
print(len(patients_stage1_feat))

Stage 1 Low-Res Preprocess



In [2]:

    
# Load the scans in given folder path
def load_scan(path):
    """
    INPUTS/ARGUMENTS 
    ================
    @type path : Python string
    
    @type slices : Python list (for each file in a folder/directory of dicom.dataset.FileDataset  
    each is a slice of the single patient's lung
    """
    slices = [dicom.read_file(path + '/' + s) for s in os.listdir(path)]
    slices.sort(key = lambda x: float(x.ImagePositionPatient[2]))
    try:
        slice_thickness = np.abs(slices[0].ImagePositionPatient[2] - slices[1].ImagePositionPatient[2])
    except:
        slice_thickness = np.abs(slices[0].SliceLocation - slices[1].SliceLocation)
        
    for s in slices:
        s.SliceThickness = slice_thickness
        
    return slices



In [3]:

    
def slices_per_patient(input_folder_path):
    """ slices_per_patient
    @type input_folder_path : Python string 
    """
    patients = os.listdir(input_folder_path)
    patients.sort()
    
    patient_slices_lst = []
    
    for patient in patients:
        Nz = len( os.listdir(input_folder_path + patient))
        patient_slices_lst.append(Nz)
        
    return patient_slices_lst



In [4]:

    
def get_pixels_hu(slices):
    """
    INPUTS/ARGUMENTS
    ================
    @type slices : Python list of dicom.dataset.FileDataset, 
    @param slices : each dicom.dataset.FileDataset representing an image "slice" of a single patient's lung
    """
    image = np.stack([s.pixel_array for s in slices]) # np.array image.shape (134,512,512)
    # Convert to int16 (from sometimes int16),
    # should be possible as values should always be low enough (<32k)
    image = image.astype(np.int16)
    
    # Set outside-of-scan pixels to 0
    # The intercept is usually -1024, so air is approximately 0
    # suggested by Henry Wolf to avoid -2048 values
    outside_of_image_val = image.min()
    image[image == outside_of_image_val] = 0 
    
    image[image == -2000] = 0 
    
    # Convert to Hounsfield units (HU)
    for slice_number in range(len(slices)):
        
        intercept = slices[slice_number].RescaleIntercept
        slope = slices[slice_number].RescaleSlope
        
        if slope != 1:
            image[slice_number] = slope * image[slice_number].astype(np.float64)
            image[slice_number] = image[slice_number].astype(np.int16)
            
        image[slice_number] += np.int16(intercept)
        
    return np.array(image, dtype=np.int16)



In [5]:

    
def resample_given_dims(image, scan, new_shape):
    # Determine current pixel spacing
    spacing = np.array([scan[0].SliceThickness] + scan[0].PixelSpacing, dtype=np.float32)  # (\Delta z,\Delta x,\Delta y)
    
    real_resize_factor=new_shape/np.array(image.shape).astype("float32")  
    
    new_spacing = spacing/real_resize_factor # (\Delta z',\Delta x',\Delta y')
        
    image = scipy.ndimage.interpolation.zoom(image, real_resize_factor, mode='nearest')
    
    return image, new_spacing



In [6]:

    
def norm_zero_images(patient_image,MIN_BOUND,MAX_BOUND,PIXEL_MEAN=0.25):
    """ norm_zero_images - due normalization and zero center on a 3-dim. numpy array/matrix that 
    represents all (total) 2-dim. image slices for a single patient"""
    Nz = patient_image.shape[0]
    new_patient_image = []
    for z_idx in range(Nz):
        patient_normed=normalize(patient_image[z_idx],MIN_BOUND,MAX_BOUND)
        patient_normed=zero_center(patient_normed,PIXEL_MEAN)
        new_patient_image.append( patient_normed)
    new_patient_image=np.array(new_patient_image)
    return new_patient_image



In [7]:

    
def normalize(image,MIN_BOUND,MAX_BOUND):
    image = (image - MIN_BOUND)/(MAX_BOUND-MIN_BOUND)
    image[image>1]=1.
    image[image<0]=0.
    return image



In [8]:

    
def zero_center(image, PIXEL_MEAN):
    image = image - PIXEL_MEAN
    return image



In [9]:

    
def process_patient(patientname,    
                                MIN_BOUND=-1000.0,
                                MAX_BOUND=500.0,
                                PIXEL_MEAN=0.25,
                                INPUT_FOLDER='./2017datascibowl/stage1/',
                                N_x=128,N_y=128,):
    """ process_patient - process single patient """
    patient = load_scan(INPUT_FOLDER +patientname)
    
    Nz_lst = slices_per_patient( INPUT_FOLDER )
    Nz_tot= int( pd.DataFrame(Nz_lst).median() )
    patient_pixels = get_pixels_hu(patient)
    
    patient_resampled,spacing_new=resample_given_dims(patient_pixels,patient,[Nz_tot,N_x,N_y]) 
    patient_resampled_norm=norm_zero_images(patient_resampled,MIN_BOUND,MAX_BOUND,PIXEL_MEAN)
    
    patient_feature_vec = np.concatenate([patient_resampled_norm.flatten(),
                                            np.array(patient[0].ImagePositionPatient[:2]),
                                            np.array( patient[0].ImageOrientationPatient) ] )
    return patient_feature_vec



In [10]:

    
def save_feat_vec(patient_feature_vec,patientname,sub_name="stage1_feat"):
#def save_feat_vec(patient_feature_vec,patientname): original
    #    f=file( "./2017datascibowl/stage1_feat/"+patientname + "feat_vec" ,"wb")
    f=file( "./2017datascibowl/"+sub_name+"/"+patientname + "feat_vec" ,"wb")
    np.save(f,patient_feature_vec)
    f.close()



In [11]:

    
import os, sys
os.getcwd()
os.listdir( os.getcwd() )









    Out[11]:





['2017datascibowl',
 'data_password.txt',
 'LSTM_model201702271930.save',
 'cleaning_dueSigmaFin.pyc',
 'LSTM_model201702280608.save',
 '.ipynb_checkpoints',
 'dueSigmaFinancial_kaggle.py',
 'LSTM_model.save',
 'LSTM_model201703012346.save',
 'DatSciBow2017_FullPreprocessTutorial.ipynb',
 'LSTM_model201702282350.save',
 'GRU_model201703022010.save',
 'DueSigmaFin_runs.ipynb',
 'ImagePreprocessing.ipynb',
 'dueSigmaFinancial_local.ipynb',
 'GRU_model201703012348.save',
 'GRU_model201703050709.save',
 'GRU_model201703021741.save',
 'kaggle.ipynb',
 'glass.csv',
 'DatSciBow2017_SVM.ipynb',
 '__init__.py',
 'train.h5',
 'dueSigmaFinancial_local_GRUs.ipynb',
 'cleaning_dueSigmaFin.py']



In [12]:

    
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import dicom
import os
import scipy.ndimage
import matplotlib.pyplot as plt



In [13]:

    
patients_stage1 = os.listdir('./2017datascibowl/stage1')
print(len(patients_stage1))



In [14]:

    
N_x=32
N_y=32



In [15]:

    
%time 
for patient_name in patients_stage1:
    patient_feature_vec = process_patient(patient_name,N_x=N_x,N_y=N_y)
    save_feat_vec( patient_feature_vec, patient_name,sub_name="stage1_feat_lowres32")









    



CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 11.2 µs






    



/home/topolo/Public/anaconda2/lib/python2.7/site-packages/scipy/ndimage/interpolation.py:568: UserWarning: From scipy 0.13.0, the output shape of zoom() is calculated with round() instead of int() - for these inputs the size of the returned array has changed.
  "the returned array has changed.", UserWarning)



In [117]:

    
%time 
for patient_name in patients_stage1:
    patient_feature_vec = process_patient(patient_name,N_x=N_x,N_y=N_y)
    save_feat_vec( patient_feature_vec, patient_name,sub_name="stage1_feat_lowres32")









    



CPU times: user 12 µs, sys: 1 µs, total: 13 µs
Wall time: 13.8 µs






    



/home/topolo/Public/anaconda2/lib/python2.7/site-packages/scipy/ndimage/interpolation.py:568: UserWarning: From scipy 0.13.0, the output shape of zoom() is calculated with round() instead of int() - for these inputs the size of the returned array has changed.
  "the returned array has changed.", UserWarning)



In [16]:

    
import time



In [17]:

    
N_x=64
N_y=64
t1 = time.time()
for patient_name in patients_stage1:
    patient_feature_vec = process_patient(patient_name,N_x=N_x,N_y=N_y)
    save_feat_vec( patient_feature_vec, patient_name,sub_name="stage1_feat_lowres"+str(N_x))
t2=time.time()
print((t2-t1)*1000*1000.) # in seconds









    



/home/topolo/Public/anaconda2/lib/python2.7/site-packages/scipy/ndimage/interpolation.py:568: UserWarning: From scipy 0.13.0, the output shape of zoom() is calculated with round() instead of int() - for these inputs the size of the returned array has changed.
  "the returned array has changed.", UserWarning)






    



8353555423.97



In [115]:

    
patient_feature_vec_test = process_patient(patients_stage1[0],N_x=N_x,N_y=N_y)



In [116]:

    
patient_feature_vec_test.shape









    Out[116]:





(40968,)



In [ ]:

	0	1	2	3	4	5	6	7	8	9	...	502	503	504	505	506	507	508	509	510	511
count	512.000000	512.000000	512.000000	512.000000	512.000000	512.000000	512.000000	512.000000	512.000000	512.000000	...	512.000000	512.000000	512.000000	512.000000	512.000000	512.000000	512.000000	512.000000	512.000000	512.000000
mean	-1817.480469	-1687.929688	-1600.953125	-1523.867188	-1467.464844	-1413.097656	-1359.531250	-1316.011719	-1271.882812	-1227.421875	...	-1234.949219	-1278.269531	-1321.933594	-1366.171875	-1419.917969	-1472.910156	-1527.789062	-1602.140625	-1686.277344	-1816.578125
std	707.955168	893.832547	991.304186	1065.634957	1114.995762	1155.741711	1191.352015	1217.971285	1243.673746	1267.897664	...	1257.301007	1234.392217	1209.006865	1180.214144	1143.906186	1104.989222	1057.894768	988.719528	897.840990	711.176932
min	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	...	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000
25%	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	...	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000
50%	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	...	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000
75%	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	58.500000	178.000000	...	101.500000	28.500000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000	-2000.000000
max	1062.000000	1090.000000	1068.000000	1076.000000	1076.000000	1084.000000	1118.000000	1110.000000	1118.000000	1070.000000	...	1070.000000	1040.000000	1046.000000	1108.000000	1032.000000	1076.000000	1092.000000	1070.000000	1018.000000	1038.000000

	0	1	2	3	4	5	6	7	8	9	...	118	119	120	121	122	123	124	125	126	127
count	128.000000	128.000000	128.000000	128.000000	128.000000	128.000000	128.000000	128.000000	128.000000	128.000000	...	128.000000	128.000000	128.000000	128.000000	128.000000	128.000000	128.000000	128.000000	128.000000	128.000000
mean	-965.992188	-865.835938	-816.781250	-766.789062	-728.289062	-700.179688	-672.375000	-641.742188	-615.875000	-584.078125	...	-598.710938	-624.515625	-652.859375	-673.984375	-706.140625	-731.085938	-773.171875	-818.945312	-871.093750	-965.960938
std	225.862947	347.119594	379.754499	408.096921	424.471235	436.535383	446.371901	458.378780	462.784894	480.708853	...	482.882147	470.250930	457.787606	447.755100	436.335487	425.987265	405.612458	380.930219	342.545921	224.568261
min	-1024.000000	-1028.000000	-1029.000000	-1025.000000	-1042.000000	-1024.000000	-1024.000000	-1024.000000	-1024.000000	-1039.000000	...	-1032.000000	-1030.000000	-1028.000000	-1026.000000	-1025.000000	-1043.000000	-1025.000000	-1029.000000	-1028.000000	-1024.000000
25%	-1024.000000	-1024.000000	-1024.000000	-1024.000000	-1024.000000	-1024.000000	-1024.000000	-1024.000000	-1024.000000	-1024.000000	...	-1024.000000	-1024.000000	-1024.000000	-1024.000000	-1024.000000	-1024.000000	-1024.000000	-1024.000000	-1024.000000	-1024.000000
50%	-1024.000000	-1024.000000	-1024.000000	-1024.000000	-1024.000000	-1024.000000	-1024.000000	-1024.000000	-1023.000000	-979.000000	...	-990.500000	-1024.000000	-1024.000000	-1024.000000	-1024.000000	-1024.000000	-1024.000000	-1024.000000	-1024.000000	-1024.000000
75%	-1024.000000	-1024.000000	-949.000000	-164.250000	-151.750000	-123.500000	-128.500000	-100.750000	-95.000000	-79.000000	...	-82.500000	-101.500000	-107.500000	-112.750000	-121.000000	-125.250000	-167.750000	-977.250000	-1024.000000	-1024.000000
max	2.000000	53.000000	-16.000000	-9.000000	10.000000	-33.000000	16.000000	-4.000000	15.000000	148.000000	...	107.000000	129.000000	57.000000	0.000000	-21.000000	-9.000000	-19.000000	7.000000	50.000000	-55.000000

	0
count	20.00000
mean	180.20000
std	75.23619
min	110.00000
25%	131.75000
50%	167.50000
75%	197.75000
max	435.00000

	id	cancer
0	0015ceb851d7251b8f399e39779d1e7d	1
1	0030a160d58723ff36d73f41b170ec21	0
2	003f41c78e6acfa92430a057ac0b306e	0
3	006b96310a37b36cccb2ab48d10b49a3	1
4	008464bb8521d09a42985dd8add3d0d2	1

	0
0	00cba091fa4ad62cc3200a657aeb957e
1	0a099f2549429d29b32f349e95fb2244
2	0a0c32c9e08cc2ea76a71649de56be6d
3	0a38e7597ca26f9374f8ea2770ba870d
4	0acbebb8d463b4b9ca88cf38431aac69
5	0b20184e0cd497028bdd155d9fb42dc9
6	0bd0e3056cbf23a1cb7f0f0b18446068
7	0c0de3749d4fe175b7a5098b060982a1
8	0c37613214faddf8701ca41e6d43f56e
9	0c59313f52304e25d5a7dcf9877633b1
10	0c60f4b87afcb3e2dfa65abbbf3ef2f9
11	0c98fcb55e3f36d0c2b6507f62f4c5f1
12	0c9d8314f9c69840e25febabb1229fa4
13	0ca943d821204ceb089510f836a367fd
14	0d06d764d3c07572074d468b4cff954f
15	0d19f1c627df49eb223771c28548350e
16	0d2fcf787026fece4e57be167d079383
17	0d941a3ad6c889ac451caf89c46cb92a
18	0ddeb08e9c97227853422bd71a2a695e
19	0de72529c30fe642bc60dcb75c87f6bd

	cancer
count	1397.000000
mean	0.259127
std	0.438312
min	0.000000
25%	0.000000
50%	0.000000
75%	1.000000
max	1.000000

	id	cancer
0	026470d51482c93efc18b9803159c960	0.5
1	031b7ec4fe96a3b035a8196264a8c8c3	0.5
2	03bd22ed5858039af223c04993e9eb22	0.5
3	06a90409e4fcea3e634748b967993531	0.5
4	07b1defcfae5873ee1f03c90255eb170	0.5