End-to-End Biomedical Image Analytics using Machine Learning and Artificial Intelligence for Lung Nodule Classification in SAS Viya

This notebook demonstrates how to build an end-to-end solutions in SAS Viya that can assist with a real-world biomedical image analytics problem, specifically lung nodule classification that is based on 3-D CT images of patient torsos and radiologist annotations of nodule locations. Lung nodules are lumps of dead tissue that commonly occur in humans, less than 5% of which are malignant. Radiologists are responsible for determining whether a nodule visually observed in a patient image is potentially cancerous so that a definitive test such as biopsy is performed for that patient. This demo focuses on nodule shape, one of many factors that radiologists account for in their classification. The basis for a shape-based classification is the irregular protrusions (called spiculations) that commonly exist on the surfaces of malignant nodules. Benign nodules, on the other hand, have smooth and spherical surfaces more often than not. Two solutions, one based on machine learning, and the other on artificial intelligence, are presented.

Imports


In [1]:
import sys
stdout = sys.stdout
stderr = sys.stderr
import cv2
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import struct
import swat
import swat.datamsghandlers as dmh
import time
from IPython.display import display
from mayavi import mlab
from sklearn import model_selection, metrics
from tabulate import tabulate
from IPython.display import HTML
from random import randint
sys.stdout = stdout
sys.stderr = stderr

%matplotlib inline

Server and Data Locations

CAS Server


In [5]:
cashost = 'localhost'
casport = 11943

Data Locations (Please see slide 6 below for information on data sources)


In [3]:
lcdemo_path = '<path to the root dir of images and annotations>'  # Lung cancer medical images and radiologist annotations
caslib_name = 'lcdemo'
lcdemo_subdir = ''
slides_dir = '<relative path to slides images>'  # Slides

Helper Functions


In [4]:
def reverse(a, axis=0):
    """Reverses an numpy array along a given axis."""
    idx = [slice(None)] * len(a.shape)
    idx[axis] = slice(None, None, -1)
    return a[idx]

def display_slide(binaries, paths, width, height, n):
    """Displays nth slide."""
    slide_path = lcdemo_path + slides_dir + ('Slide%s.PNG' % n)
    pos = np.where(paths == slide_path)[0][0]
    slide_image = bytearray(binaries[pos])
    slide_image = np.reshape(slide_image, (height, width, 4))[:, :, 0:3]
    slide_image = reverse(slide_image, 2)
    plt.figure(figsize = (15, 30))
    plt.imshow(slide_image)
    plt.show()

def display_image_pair(image1, image2):
    """Displays a pair of images."""
    fig = plt.figure(figsize=(10, 10. / image1.shape[1] * image1.shape[0]))
    fig.add_subplot(121)
    plt.imshow(np.squeeze(image1))
    plt.gray()
    plt.xticks([0, image1.shape[1]], fontsize=15)
    plt.yticks([0, image1.shape[0]], fontsize=15)
    fig.add_subplot(122)
    plt.imshow(np.squeeze(image2))
    plt.gray()
    plt.xticks([0, image2.shape[1]], fontsize=15)
    plt.yticks([0, image2.shape[0]], fontsize=15)
    plt.show()

def display_image_pair_from_table(table, channels, where1, where2):
    """Fetch two images from a CAS table according to provided where-clauses and display them."""
    # Fetch image binaries, and related data
    fetched_images = table.query(where1)[['_resolution_', '_image_']]\
                                .sort_values('_path_').to_frame()
    image_resolutions = fetched_images['_resolution_']
    image_binaries = fetched_images['_image_']
    res = np.fromstring(image_resolutions[0], np.int64)
    image1 = bytearray(image_binaries[0][0:channels * res[0] * res[1]])
    image1 = np.reshape(image1, (res[1], res[0], channels))
    
    fetched_images = table.query(where2)[['_resolution_', '_image_']]\
                                .sort_values('_path_').to_frame()
    image_resolutions = fetched_images['_resolution_']
    image_binaries = fetched_images['_image_']
    res = np.fromstring(image_resolutions[0], np.int64)
    image2 = bytearray(image_binaries[0][0:channels * res[0] * res[1]])
    image2 = np.reshape(image2, (res[1], res[0], channels))
    display_image_pair(image1, image2)

def get_information(directory):
    """Get some information about files in a directory."""
    file_list = []
    for i in os.listdir(directory):
        a = os.stat(os.path.join(directory,i))
        file_list.append([i,time.ctime(a.st_mtime),time.ctime(a.st_ctime)])
    return file_list

def find_optimal_cutoff(target, predicted, plabel):
    """Find optimal cut-off based on ROC analyses."""
    fpr, tpr, threshold = metrics.roc_curve(target, predicted, pos_label=plabel)
    i = np.arange(len(tpr)) 
    roc = pd.DataFrame({'tf' : pd.Series(-tpr - (1 - fpr), index=i), 'threshold' : pd.Series(threshold, index=i)})
    roc_t = roc.ix[(roc.tf).argsort()[:1]]
    return list(roc_t['threshold']), fpr, tpr, threshold

def sort_label_ids_by_position(ids, paths, lbls, poss, ors, label):
    """Sort labels and ids of images according to their position."""
    inds = np.array(lbls == label)
    ids = np.array(ids)[inds]
    paths = np.array(paths)[inds]
    poss = np.array(poss)[inds]
    ors = np.array(ors)[inds]
    poss = np.array([struct.unpack('=3d', poss[i]) for i in range(poss.size)])
    ors = np.array([struct.unpack('=9d', ors[i]) for i in range(ors.size)])
    zdir = ors[0].reshape(3, 3)[:, 2]
    pros = np.array([np.dot(poss[i], zdir) for i in range(len(poss))])
    sinds = np.argsort(pros)[::-1]
    return ids[sinds], paths[sinds]

def get_image_array(image_binaries, dimensions, resolutions, formats, n):
    """Get a 3D image from CAS table."""
    dimension = dimensions[n]
    resolution = np.array(struct.unpack('=%sq' % dimension, resolutions[n][0:dimension * 8]))
    resolution = resolution[::-1]
    myformat = formats[n]
    num_cells = np.prod(resolution)
    if myformat == '16U' or myformat == '16S':
        image_array = np.array(struct.unpack('=%sh' % num_cells, image_binaries[n][0:2 * num_cells]))
        image_array = np.reshape(image_array, resolution)
    elif myformat == '8U' or myformat == '8S':
        image_array = np.array(bytearray(image_binaries[n][0:num_cells]))
        image_array = np.reshape(image_array, resolution)
    else:
        image_array = np.array(bytearray(image_binaries[n]))
        image_array = np.reshape(image_array, (resolution[0], resolution[1], 3))
        image_array = reverse(image_array, 2)
    return image_array

def mapping(val):
    """A simple mapping from int to int."""
    if (val == 0):
        return 2
    elif (val == 2):
        return 0
    else:
        return val

def display_slice_3d(images, dims, ress, fmts, poss, oris, scas, perm, image_index, slice_index, rf):
    """Display an image slice in 3D."""
    image = get_image_array(images, dims, ress, fmts, image_index)
    geo_perm = np.zeros(3, dtype=np.int)
    for i in range(3):
        geo_perm[mapping(i)] = mapping(perm[i])
    image = np.transpose(image, perm)
    image = image[slice_index, :, :]
    nr, nc = image.shape[:2]
    dimension = dims[image_index]
    pos = np.array(struct.unpack('=%sd' % dimension, poss[image_index]))
    sca = np.array(struct.unpack('=%sd' % dimension, scas[image_index][0:8 * dimension]))
    ori = np.array(struct.unpack('=%sd' % (dimension*dimension), oris[image_index][0:8 * dimension * dimension]))
    xx, yy = np.meshgrid(np.linspace(0, nc, nc), np.linspace(0, nr, nr))
    zz = np.zeros((nr, nc))
    lc = np.vstack((np.reshape(xx, (1, nc*nr)), np.reshape(yy, (1, nc*nr)), np.reshape(zz, (1, nc*nr))))
    ori = np.reshape(ori, (3, 3))
    ori = ori[:, geo_perm]
    sca = sca[geo_perm]
    pos = pos + slice_index * sca[2] * ori[:, 2]
    pos = np.reshape(pos, (3, 1))
    sca = np.diag(sca)
    gc = np.matmul(ori, np.matmul(sca, lc))
    gc = gc + np.matmul(pos, np.ones((1, nc*nr)))
    mlab.mesh(np.reshape(gc[0, :], (nr, nc)), np.reshape(gc[1, :], (nr, nc)), np.reshape(gc[2, :], (nr, nc)), 
              scalars = image, colormap='gray')
    if (rf):
        for i in range(3):
            clr=((i == 0) * 1, (i == 1) * 1, (i == 2) * 1)
            mlab.quiver3d(pos[0], pos[1], pos[2], ori[0, i], ori[1, i], ori[2, i], 
                          line_width=5, scale_factor=50*sca[i, i], color=clr, mode='arrow')

def load_slides(cas_session, slides_table):
    """Load the slides."""
    casout = s.CASTable(slides_table, replace=True)
    s.image.loadimages(path=slides_dir, caslib=caslib_name, casout=casout, decode=True)
    fetched_slides = casout[['_resolution_', '_image_', '_path_']].to_frame()
    slide_binaries = fetched_slides['_image_']
    slide_resolutions = fetched_slides['_resolution_']
    slide_paths = fetched_slides['_path_']
    res = np.fromstring(slide_resolutions[0], np.int64)
    slide_width = res[0]
    slide_height = res[1]
    return slide_binaries, slide_paths, slide_width, slide_height

Connect to CAS Server


In [6]:
s = swat.CAS(cashost, casport)

s.loadactionset('image')
s.loadactionset('biomedimage')
s.loadactionset('deeplearn')

s.addcaslib(name=caslib_name, path=lcdemo_path, subdirectories=True, datasource='path')
s.setsessopt(caslib=caslib_name, logflushtime=0)

if s.about()['About']['System']['OS Family'] == 'WIN':
    slash='\\'
else:
    slash='/'


NOTE: Added action set 'image'.
NOTE: Added action set 'biomedimage'.
NOTE: Added action set 'deeplearn'.
NOTE: 'lcdemo' is now the active caslib.
NOTE: Cloud Analytic Services added the caslib 'lcdemo'.
NOTE: 'lcdemo' is now the active caslib.
NOTE: Grid node action status report: 1 nodes, 23 total actions executed.

(Prerun) Load Slides


In [7]:
slide_binaries, slide_paths, slide_width, slide_height = load_slides(s, 'slides')
display_slide(slide_binaries, slide_paths, slide_width, slide_height, 1)


NOTE: Loaded 18 images from \\bbnfs01.unx.sas.com\ifs\fvcc\BMIData\DemoData\LungNoduleClassification\sgf2018\ into Cloud Analytic Services table slides.

In [8]:
display_slide(slide_binaries, slide_paths, slide_width, slide_height, 2)



In [9]:
display_slide(slide_binaries, slide_paths, slide_width, slide_height, 3)



In [10]:
display_slide(slide_binaries, slide_paths, slide_width, slide_height, 4)



In [11]:
display_slide(slide_binaries, slide_paths, slide_width, slide_height, 5)



In [12]:
display_slide(slide_binaries, slide_paths, slide_width, slide_height, 6)



In [13]:
display_slide(slide_binaries, slide_paths, slide_width, slide_height, 7)


(Prerun) Load All Patient 3D Images & Metadata (DICOM)


In [14]:
orig_medical = s.CASTable('origMedical', replace=True)

s.image.loadimages(path='TrainDataOriginal',
                   caslib=caslib_name,
                   casout=orig_medical,
                   addcolumns=dict(general=['width', 'height', 'depth', 'channeltype', 'position', 'orientation', 'spacing'],
                                   dicomAttributes=dict(keywords=['PatientAge', 'PatientSex', 'AcquisitionDate'])),
                   recurse=True,
                   series=dict(dicom=True),
                   labellevels=1,
                   decode=True)


NOTE: Loaded 10 images from \\bbnfs01.unx.sas.com\ifs\fvcc\BMIData\DemoData\LungNoduleClassification\TrainDataOriginal into Cloud Analytic Services table origMedical.
Out[14]:

elapsed 197s · user 25s · sys 18s · mem 1.78e+03MB

Fetch and Display Example Patient Image with Malignant Lung Nodule

(Prerun) Fetch


In [15]:
pid = "CT-Training-LC009"
example_rows = orig_medical.query("_label_='"+pid+"'").to_frame()

display(example_rows[['_PatientAge_', '_PatientSex_', '_AcquisitionDate_', '_width_', '_height_', '_depth_']])

medical_dimensions = example_rows['_dimension_']
medical_formats = example_rows['_channelType_']
medical_binaries = example_rows['_image_']
medical_resolutions = example_rows['_resolution_']
medical_channel_types = example_rows['_channelType_']
medical_image_array_exp = get_image_array(medical_binaries, medical_dimensions, medical_resolutions, medical_formats, 0)


Selected Rows from Table ORIGMEDICAL
_PatientAge_ _PatientSex_ _AcquisitionDate_ _width_ _height_ _depth_
0 063Y F 20070103 512 512 281

Display


In [16]:
sf = mlab.pipeline.scalar_field(medical_image_array_exp)
mlab.pipeline.image_plane_widget(sf, plane_orientation='x_axes', slice_index=50, colormap='gray')
mlab.pipeline.image_plane_widget(sf, plane_orientation='y_axes', slice_index=100, colormap='gray')
mlab.pipeline.image_plane_widget(sf, plane_orientation='z_axes', slice_index=100, colormap='gray')
mlab.show()

Load & Prepare Radiologist Annotations of All Patients

Load, Fetch, & display


In [17]:
train_labels = s.CASTable('trainlabels', replace=True)

s.load_path('%s%s' % (lcdemo_subdir, 'trainAnnotations.csv'),
            importoptions=dict(filetype='csv', getnames=True),
            casout=train_labels)

fetched_labels = train_labels.sort_values('PatientID').to_frame()
n_patients = len(fetched_labels)
display(fetched_labels)

# Extract the Annotations for Later Use
patient_ids = fetched_labels['PatientID']
X = fetched_labels['NoduleX']
Y = fetched_labels['NoduleY']
W = fetched_labels['NoduleW']
H = fetched_labels['NoduleH']
Z_orig = fetched_labels['NoduleZ'].copy()
Z = fetched_labels['NoduleZ'].copy()
diagnosis = fetched_labels['Diagnosis']

# Adjust Z-coordinates of Nodules for Image Depth
fetched_rows = orig_medical[['_depth_', '_label_']].sort_values('_label_').to_frame()

medical_depths = fetched_rows["_depth_"]
for psn in range(n_patients):
    Z[psn] = medical_depths[psn]-Z_orig[psn]


Selected Rows from Table TRAINLABELS
PatientID NoduleX NoduleY NoduleZ NoduleW NoduleH Diagnosis
0 CT-Training-BE001 396.0 288.0 169.0 12.0 12.0 benign
1 CT-Training-BE007 371.0 190.0 194.0 29.0 32.0 benign
2 CT-Training-LC002 132.0 352.0 70.0 14.0 14.0 malignant
3 CT-Training-LC003 365.0 314.0 70.0 19.0 19.0 malignant
4 CT-Training-LC009 129.0 279.0 63.0 39.0 43.0 malignant
5 LUNGx-CT002 311.0 328.0 205.0 37.0 37.0 benign
6 LUNGx-CT003 359.0 359.0 146.0 31.0 31.0 malignant
7 LUNGx-CT009 165.0 200.0 164.0 19.0 19.0 benign
8 LUNGx-CT019 114.0 345.0 131.0 36.0 36.0 malignant
9 LUNGx-CT024 97.0 274.0 197.0 20.0 20.0 benign

In [18]:
display_slide(slide_binaries, slide_paths, slide_width, slide_height, 8)



In [19]:
display_slide(slide_binaries, slide_paths, slide_width, slide_height, 9)


(Prerun) Extract & Save 3D Patches Containing Nodules


In [20]:
s.setsessopt(messageLevel='NONE')
delta_z = 5
nodule_region = s.CASTable('noduleRegion', replace=True)

print('Saving 3D nodule patch (' + str(2*delta_z+1) + ' slices) of ...', flush=True)

for psn in range(n_patients):
    print(patient_ids[psn])
    
    # Extract the nodule region from the patient image into a temporary CAS table
    wclause = "_label_='" + patient_ids[psn] + "'"
    res=s.biomedimage.processbiomedimages(images=dict(table=orig_medical.query(wclause)),
                                          steps=[dict(stepparameters=dict(steptype='crop', 
                                                                          cropparameters=dict(croptype='basic', 
                                                                                              imagesize=[W[psn], 
                                                                                                         H[psn], 
                                                                                                         2*delta_z+1],
                                                                                              pixelIndex=[X[psn], 
                                                                                                          Y[psn], 
                                                                                                          Z[psn]-delta_z]))),
                                                 dict(stepparameters=dict(steptype='rescale', 
                                                                          rescaleparameters=dict(rescaletype='channeltype_8u')))],
                                          decode=True,
                                          copyVars=["_label_", "_path_", "_type_"],
                                          addColumns=["position", "orientation", "spacing"],
                                          casOut=nodule_region)

    # Save the nodule region as 3D images with metadata
    res=s.image.saveimages(images=dict(table=nodule_region, path='_path_'),
                           caslib=caslib_name, 
                           subdirectory=lcdemo_subdir + 'TrainDataNoduleRegions/' + diagnosis[psn],
                           type='nii',
                           labellevels=1)
    
print('All done', flush=True)


Saving 3D nodule patch (11 slices) of ...
CT-Training-BE001
CT-Training-BE007
CT-Training-LC002
CT-Training-LC003
CT-Training-LC009
LUNGx-CT002
LUNGx-CT003
LUNGx-CT009
LUNGx-CT019
LUNGx-CT024
All done

(Prerun) Load All 3D Nodule Patches. Display Examples in 2D.


In [21]:
nodules_3d = s.CASTable(name='nodules3D', replace=True)

r = s.image.loadimages(casout=nodules_3d,
                       path='TrainDataNoduleRegions',
                       caslib=caslib_name,
                       recurse=True,
                       addcolumns=["position", "orientation", "spacing", "channeltype"],
                       labellevels=2,
                       decode=True)

display_image_pair_from_table(nodules_3d, 1, 
                              "_label_='benign" + slash + "CT-Training-BE007'", 
                              "_label_='malignant" + slash + "CT-Training-LC009'")



In [22]:
display_slide(slide_binaries, slide_paths, slide_width, slide_height, 10)



In [23]:
display_slide(slide_binaries, slide_paths, slide_width, slide_height, 11)


Segment Nodules from the 3D Patches


In [24]:
cvars = ['_diagnosis_', '_pid_']
cvarspgm = "_diagnosis_=scan(_label_, 1, '" + slash + "'); _pid_=scan(_label_, 2, '" + slash + "')"
nodules_3d.append_computed_columns(cvars, cvarspgm)

masks_3d = s.CASTable(name='masks3D', replace=True)

res=s.biomedimage.segmentbiomedimages(images=dict(table=nodules_3d),
                                      method=dict(name='kmeans', kmeansparameters=dict(kmeanstype='basic',
                                                                                       regions=2)),
                                      decode = True,
                                      copyvars=["_diagnosis_", "_pid_", "_path_", "_id_"],
                                      addcolumns=["position", "orientation", "spacing"],
                                      casOut=masks_3d)

# Display
display_image_pair_from_table(masks_3d, 1, "_pid_='CT-Training-BE007'", "_pid_='CT-Training-LC009'")


Visualize the Segmentations in 3D

(Prerun) Build & Fetch Surfaces of 2 Nodules for Visualization


In [25]:
s.setsessopt(messageLevel='NOTE')

nodule_vertices = s.CASTable(name='noduleVertices', replace=True)
nodule_faces = s.CASTable(name='noduleFaces', replace=True)

surf_summary = s.biomedimage.buildsurface(images=dict(table=masks_3d.query("_pid_='CT-Training-BE007' or _pid_='CT-Training-LC009'")),
                                          intensities=[1],
                                          smoothing=dict(iterations=3),
                                          outputvertices=nodule_vertices,
                                          outputfaces=nodule_faces)
display(surf_summary)

surf_ids = surf_summary.Surfaces['Surface Identifier']
print('Fetching the surfaces ...', flush=True)
fetched_vertices = nodule_vertices.query('_surfaceId_=' + str(surf_ids[0]) + ' or ' + '_surfaceId_=' + str(surf_ids[1]))\
                                  .sort_values('_id_').to_frame()
fetched_faces = nodule_faces.to_frame()

if surf_summary.Surfaces['Number of Faces'][0] > surf_summary.Surfaces['Number of Faces'][1]:
    ben_surf_id=1
    mal_surf_id=0
else:
    ben_surf_id=0
    mal_surf_id=1


NOTE: Processed 2 images from Cloud Analytic Services table MASKS3D.
§ Surfaces
Surface Identifier Image Identifier Low Threshold High Threshold Number of Vertices Number of Faces
0 10 5 1.0 1.0 7087 12766
1 29 7 1.0 1.0 2479 4382

elapsed 2.65s · user 2.45s · sys 0.0312s · mem 9.96MB

Fetching the surfaces ...

Display routine


In [26]:
def display_nodulesurf_with_slice(fetched_vertices, fetched_faces, medical_dimensions, medical_formats,
                                  medical_binaries, medical_resolutions, medical_channelTypes, medical_spacings,
                                  medical_positions, medical_orientations, surf_id, pid, Z, patient_ids, rgb):
    print('Visualizing the surfaces ...', flush=True)

    vertices = fetched_vertices[fetched_vertices._surfaceId_ == surf_ids[surf_id]]
    faces = fetched_faces[fetched_faces._surfaceId_ == surf_ids[surf_id]]
    x = vertices.ix[:, ["_x_"]]
    y = vertices.ix[:, ["_y_"]]
    z = vertices.ix[:, ["_z_"]]
    flist = faces.ix[:, ["_v1_", "_v2_", "_v3_"]]
    op = 0.75
    mlab.triangular_mesh(x, y, z, flist, color=tuple([c/255 for c in rgb]), opacity=op)
    
    # Display the slices in the patient coordinate system
    print('Displaying the slices in the patient coordinate system ...', flush=True)
    display_slice_3d(medical_binaries, medical_dimensions, medical_resolutions, medical_formats,
                     medical_positions, medical_orientations, medical_spacings, (0, 1, 2), 0, int(Z[patient_ids==pid]), 1)
    mlab.show()

Visualize a Benign Nodule With Original Image Slice

(Prerun) Fetch the Slice


In [27]:
# Fetch an example slice for the patient
pid_benign = 'CT-Training-BE007'
print('Fetching the image of ' + pid_benign + ' ...', flush=True)
fetched_rows_benign = orig_medical.query("_label_='" + pid_benign + "'").to_frame()
medical_dimensions_benign = fetched_rows_benign["_dimension_"]
medical_formats_benign = fetched_rows_benign["_channelType_"]
medical_binaries_benign = fetched_rows_benign["_image_"]
medical_resolutions_benign = fetched_rows_benign["_resolution_"]
medical_channelTypes_benign = fetched_rows_benign["_channelType_"]
medical_spacings_benign = fetched_rows_benign["_spacing_"]
medical_positions_benign = fetched_rows_benign["_position_"]
medical_orientations_benign = fetched_rows_benign["_orientation_"]


Fetching the image of CT-Training-BE007 ...

Display


In [28]:
display_nodulesurf_with_slice(fetched_vertices, fetched_faces, medical_dimensions_benign, medical_formats_benign,
                              medical_binaries_benign, medical_resolutions_benign, medical_channelTypes_benign, 
                              medical_spacings_benign, medical_positions_benign, medical_orientations_benign, 
                              ben_surf_id, pid_benign, Z, patient_ids, (0, 255, 0))


Visualizing the surfaces ...
Displaying the slices in the patient coordinate system ...

Visualize a Malignant Nodule Along with Original Slice

(Prerun) Fetch


In [29]:
# Fetch an example slice for the patient
pid_malignant = 'CT-Training-LC009'
print('Fetching the image of ' + pid_malignant + ' ...', flush=True)
fetched_rows_malignant = orig_medical.query("_label_='" + pid_malignant + "'").to_frame()
medical_dimensions_malignant = fetched_rows_malignant["_dimension_"]
medical_formats_malignant = fetched_rows_malignant["_channelType_"]
medical_binaries_malignant = fetched_rows_malignant["_image_"]
medical_resolutions_malignant = fetched_rows_malignant["_resolution_"]
medical_channelTypes_malignant = fetched_rows_malignant["_channelType_"]
medical_spacings_malignant = fetched_rows_malignant["_spacing_"]
medical_positions_malignant = fetched_rows_malignant["_position_"]
medical_orientations_malignant = fetched_rows_malignant["_orientation_"]


Fetching the image of CT-Training-LC009 ...

Display


In [30]:
display_nodulesurf_with_slice(fetched_vertices, fetched_faces, medical_dimensions_malignant, medical_formats_malignant,
                              medical_binaries_malignant, medical_resolutions_malignant, medical_channelTypes_malignant, 
                              medical_spacings_malignant, medical_positions_malignant, medical_orientations_malignant, 
                              mal_surf_id, pid_malignant, Z, patient_ids, (255, 0, 0))


Visualizing the surfaces ...
Displaying the slices in the patient coordinate system ...

Export 3D Nodule Masks as 2D Slices


In [31]:
cvars = ['_label_']
cvarspgm = "length _label_ varchar(*); _label_=_pid_;"
masks_3d.append_computed_columns(cvars, cvarspgm)

masks = s.CASTable(name='masks', replace=True)

res=s.biomedimage.processbiomedimages(images=dict(table=masks_3d),
                                      steps=[dict(stepparameters=dict(steptype='export'))],
                                      decode = True,
                                      copyvars={"_label_"},
                                      casout = masks)


NOTE: Processed 10 images from Cloud Analytic Services table MASKS3D.

Resample All Patches to 32x32


In [32]:
common_w = 32
common_h = 32
cvars = ['_path_']
cvarspgm = "length _path_ varchar(*); _path_=put(_bioMedId_*1000+_sliceIndex_, 5.);"
masks_in = s.CASTable(name='masks')
masks_in.append_computed_columns(cvars, cvarspgm)

masks_scaled = s.CASTable(name='masksScaled', replace=True)

s.image.processimages(imagetable=masks_in, 
                      casout=masks_scaled,
                      imagefunctions=[dict(functionoptions=dict(functiontype="resize", width=common_w, height=common_h)),
                                      dict(functionoptions=dict(functiontype="threshold", type="binary", value=0))],
                      decode=True)

# Display
display_image_pair_from_table(masks_scaled, 1, "_label_='CT-Training-BE007'", "_label_='CT-Training-LC009'")


NOTE: Table MASKS contains decompressed images.
NOTE: 110 out of 110 images were processed successfully and saved as decompressed images to the Cloud Analytic Services table masksScaled.

In [33]:
display_slide(slide_binaries, slide_paths, slide_width, slide_height, 12)


Morphological Operations


In [34]:
masks_scaled_in = s.CASTable('masksScaled')
masks_final = s.CASTable(name='masksFinal', replace=True)

s.image.processimages(imagetable=masks_scaled_in, 
                      casout=masks_final,
                      imagefunctions=[dict(functionoptions=dict(functiontype="morphology", method="erode",
                                                                kernelwidth=3, Kernelheight=3)),
                                      dict(functionOptions=dict(functionType="morphology", method="erode",
                                                                kernelwidth=3, Kernelheight=3)),
                                      dict(functionOptions=dict(functionType="morphology", method="dilate",
                                                                kernelwidth=3, Kernelheight=3)),
                                      dict(functionOptions=dict(functionType="morphology", method="dilate",
                                                                kernelwidth=3, Kernelheight=3)),],
                      decode=True)

# Display
display_image_pair_from_table(masks_scaled, 1, "_label_='CT-Training-BE007'", "_label_='CT-Training-LC009'")
display_image_pair_from_table(masks_final, 1, "_label_='CT-Training-BE007'", "_label_='CT-Training-LC009'")


NOTE: Table MASKSSCALED contains decompressed images.
NOTE: 110 out of 110 images were processed successfully and saved as decompressed images to the Cloud Analytic Services table masksFinal.

Convert Before & After Images to Wide Format


In [35]:
# Convert before & after images to color
masks_scaled_color = s.CASTable(name='masksScaledColor', replace=True)
masks_scaled_color_in = s.CASTable(name='masksScaledColor')
s.image.processimages(imagetable=masks_scaled_in, 
                      casout=masks_scaled_color,
                      imagefunctions=[dict(functionoptions=dict(functiontype="convert_color", type="gray2color"))],
                      decode=True)

masks_final_in = s.CASTable('masksFinal')
masks_final_color = s.CASTable(name='masksFinalColor', replace=True)
masks_final_color_in = s.CASTable(name='masksFinalColor')
s.image.processImages(imagetable=masks_final_in, 
                      casout=masks_final_color,
                      imagefunctions=[dict(functionoptions=dict(functiontype="convert_color", type="gray2color"))],
                      decode=True)

# Convert before & after images to wide format
masks_scaled_flat = s.CASTable('masksScaledFlat', replace=True)
s.image.flattenimagetable(imagetable=masks_scaled_color_in,
                          casout=masks_scaled_flat,
                          width=common_w,
                          height=common_h)
masks_final_flat = s.CASTable('masksFinalFlat', replace=True)
s.image.flattenimagetable(imagetable=masks_final_color_in,
                          casout=masks_final_flat,
                          width=common_w,
                          height=common_h)


# Fetch sample rows & display
fetched_flat = masks_scaled_flat.head(3)
display(HTML("<h3>BEFORE morphological operations</h3>"))
midc = 'c1100'
display(fetched_flat[['_path_', 'c1', 'c2', 'c3', midc, 'c3072', '_label_']])
fetched_flat = masks_final_flat.head(3)
display(HTML("<h3>AFTER morphological operations</h3>"))
display(fetched_flat[['_path_', 'c1', 'c2', 'c3', midc, 'c3072', '_label_']])


NOTE: Table MASKSSCALED contains decompressed images.
NOTE: 110 out of 110 images were processed successfully and saved as decompressed images to the Cloud Analytic Services table masksScaledColor.
NOTE: Table MASKSFINAL contains decompressed images.
NOTE: 110 out of 110 images were processed successfully and saved as decompressed images to the Cloud Analytic Services table masksFinalColor.
NOTE: Table MASKSSCALEDCOLOR contains decompressed images.
NOTE: 110 out of 110 images were processed successfully and saved to the Cloud Analytic Services table masksScaledFlat.
NOTE: Table MASKSFINALCOLOR contains decompressed images.
NOTE: 110 out of 110 images were processed successfully and saved to the Cloud Analytic Services table masksFinalFlat.

BEFORE morphological operations

Selected Rows from Table MASKSSCALEDFLAT
_path_ c1 c2 c3 c1100 c3072 _label_
0 1000 0.0 0.0 0.0 0.0 0.0 CT-Training-LC003
1 1001 0.0 0.0 0.0 0.0 0.0 CT-Training-LC003
2 1002 0.0 0.0 0.0 0.0 0.0 CT-Training-LC003

AFTER morphological operations

Selected Rows from Table MASKSFINALFLAT
_path_ c1 c2 c3 c1100 c3072 _label_
0 1000 0.0 0.0 0.0 0.0 0.0 CT-Training-LC003
1 1001 0.0 0.0 0.0 0.0 0.0 CT-Training-LC003
2 1002 0.0 0.0 0.0 0.0 0.0 CT-Training-LC003

Add Up the White Pixels in All Images & Fetch the Results


In [36]:
# Create the program for sum of all columns
pgm = "nz = c1";
for num in range(2, common_w*common_h*3 + 1):
    pgm += "+c" + str(num)
    

# Fetch the tables with #white pixels in each of the slices computed
masks_scaled_flat.append_computed_columns("nz", pgm)
scaled_sum = masks_scaled_flat[['_path_', '_label_', 'nz']].to_frame()
masks_final_flat.append_computed_columns("nz", pgm)
final_sum = masks_final_flat[['_path_', 'nz']].to_frame()

# Fetch sample rows to the client side & display
display(HTML("<h3>BEFORE morphological operations</h3>"))
display(scaled_sum[['_path_', 'nz', '_label_']][0:3])
display(HTML("<h3>AFTER morphological operations</h3>"))
display(final_sum[['_path_', 'nz']][0:3])


BEFORE morphological operations

Selected Rows from Table MASKSSCALEDFLAT
_path_ nz _label_
0 1000 10710.0 CT-Training-LC003
1 1001 39015.0 CT-Training-LC003
2 1002 91800.0 CT-Training-LC003

AFTER morphological operations

Selected Rows from Table MASKSFINALFLAT
_path_ nz
0 1000 0.0
1 1001 0.0
2 1002 70380.0

Compute Degree of Spiculation: Relative Amount of Pixels "Lost" During Morphological Operations


In [37]:
# Join the 2 tables on file path
joined_sum = pd.merge(scaled_sum, final_sum, on='_path_', suffixes=['s', 'f'])
good = joined_sum['nzs']!=0
joined_sum = pd.merge(joined_sum, fetched_labels[['PatientID', 'Diagnosis']], left_on='_label_', right_on='PatientID')
good = joined_sum['nzs']!=0
fetched_spiculations = (joined_sum['nzs'][good]-joined_sum['nzf'][good])/joined_sum['nzs'][good]
fetched_diagnosis = joined_sum['Diagnosis'][good]
plot_data = np.column_stack((fetched_spiculations, fetched_diagnosis.T))
print(plot_data[0:10])
n_images = len(plot_data)


[[1.0 'malignant']
 [1.0 'malignant']
 [0.23333333333333334 'malignant']
 [0.19649122807017544 'malignant']
 [0.14136125654450263 'malignant']
 [0.10182767624020887 'malignant']
 [0.0633245382585752 'malignant']
 [0.1242603550295858 'malignant']
 [0.12444444444444444 'malignant']
 [0.40425531914893614 'malignant']]

Plot the Distribution of Degree of Spiculation for Each Class


In [38]:
# Plot histograms for spiculation for each label
fig, ax = plt.subplots(figsize=(15, 6))
bar_width = 0.7
opacity = 0.5
bins = np.linspace(0, 1, 40)
inds = fetched_diagnosis == 'benign'
plt.hist(fetched_spiculations[inds], bins, alpha=opacity, label='Benign', color='g')
inds = fetched_diagnosis == 'malignant'
plt.hist(fetched_spiculations[inds], bins, alpha=opacity, label='Malignant', color='r')
matplotlib.rcParams.update({'font.size': 22})
plt.xlabel('Degree of spiculation')
plt.ylabel('Frequency')
plt.legend()
plt.show()


Plot ROC Curve for the Metric & Calculate Optimal Threshold


In [39]:
optt, fpr, tpr, thresholds = find_optimal_cutoff(fetched_diagnosis, fetched_spiculations, 'malignant')
fig, ax = plt.subplots(figsize=(15, 6))
plt.plot(fpr, tpr, color='#800080', linewidth=3.0)
plt.plot([fpr[i] for i in range(len(thresholds)) if thresholds[i] == optt[0]], 
         [tpr[i] for i in range(len(thresholds)) if thresholds[i] == optt[0]],
         marker='o', color='#FF8C00', markersize=10)
matplotlib.rcParams.update({'font.size': 22})
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.show()
display(HTML("<h3>Optimal spiculation cut off: %s" % optt[0] + "<h3>"))


Optimal spiculation cut off: 0.0945945945946

Assess the Metric Accuracy With k-Fold Cross Validation


In [40]:
n_folds = 10
kf = model_selection.KFold(n_splits=n_folds)

acc=[]
for train_index, test_index in kf.split(fetched_spiculations):
    ot, f, t, ts = find_optimal_cutoff(fetched_diagnosis[train_index], fetched_spiculations[train_index], 'malignant')
    acc.extend([sum(1 for i in test_index if (fetched_spiculations[i] >  ot and fetched_diagnosis[i] == 'malignant') or
                                             (fetched_spiculations[i] <= ot and fetched_diagnosis[i] == 'benign')) / len(test_index) * 100])

display(HTML("<h3>Classification accuracy based on %s -fold cross-validation: %s" % (n_folds, np.mean(acc)) + "<h3>"))


Classification accuracy based on 10 -fold cross-validation: 84.5454545455


In [41]:
display_slide(slide_binaries, slide_paths, slide_width, slide_height, 13)



In [42]:
display_slide(slide_binaries, slide_paths, slide_width, slide_height, 14)



In [43]:
display_slide(slide_binaries, slide_paths, slide_width, slide_height, 15)


Resample 3D Nodule Patches and Export Them as 2D Slices


In [44]:
cvars = ['_diagnosis_', '_pid_']
cvarspgm = "_diagnosis_=SCAN(_label_, 1, '" + slash + "'); _pid_=SCAN(_label_, 2, '" + slash + "')"
nodules_3d.append_computed_columns(cvars, cvarspgm)

nodules_2d = s.CASTable(name='nodules2D', replace=True)

res = s.biomedimage.processbiomedimages(images=dict(table=nodules_3d),
                                        steps=[dict(stepparameters=dict(steptype='resample', 
                                                                        resampleparameters=dict(resampletype='size', 
                                                                                                width=common_w, 
                                                                                                height=common_h))),
                                               dict(stepparameters=dict(steptype='export'))],
                                        decode=True,
                                        copyVars=["_diagnosis_", "_pid_", "_path_", "_id_"],
                                        casOut=nodules_2d)

display_image_pair_from_table(nodules_2d, 1, "_pid_='CT-Training-BE007'", "_pid_='CT-Training-LC009'")


NOTE: Processed 10 images from Cloud Analytic Services table NODULES3D.

Shuffle Images & Create Training & Testing Sets


In [45]:
cvars = ['_label_', 'shuffle_id']
cvarspgm = "length _label_ varchar(*); _label_=_diagnosis_; call streaminit(-1); shuffle_id=rand(\"UNIFORM\");"
pThreshold = '0.5'
nodules_2d.append_computed_columns(cvars, cvarspgm)

trainTbl0 = nodules_2d.query('shuffle_id>'+pThreshold).groupby('shuffle_id')
train = s.CASTable('train', replace=True)
r = s.partition(table=trainTbl0, casout=train)

validTbl0 = nodules_2d.query('shuffle_id<'+pThreshold).groupby('shuffle_id')
test = s.CASTable('test', replace=True)
r = s.partition(table=trainTbl0, casout=test)

Agument Training Data


In [46]:
train_in = s.CASTable('train')
train_aug = s.CASTable('trainAug', replace=True)
s.image.augmentImages(imageTable=train_in,
                      cropList=[{'useWholeImage': True,
                                 'mutations': {
                                 'verticalFlip': True,
                                 'horizontalFlip': True,
                                 'sharpen': True,
                                 'darken': True,
                                 'lighten': True,
                                 'colorJittering': True,
                                 'colorShifting': True,
                                 'rotateRight': True,
                                 'rotateLeft': True,
                                 'pyramidUp': True,
                                 'pyramidDown': True}}],
                      copyvars=['_pid_'],
                      casout=train_aug)

n_ids = 100
fetched_ids = train_aug[['_id_']].to_frame(to=n_ids+1)['_id_']
display_image_pair_from_table(train_aug, 1, "_id_="+str(fetched_ids[randint(0, n_ids)]), 
                                            "_id_="+str(fetched_ids[randint(0, n_ids)]))
display_image_pair_from_table(train_aug, 1, "_id_="+str(fetched_ids[randint(0, n_ids)]), 
                                            "_id_="+str(fetched_ids[randint(0, n_ids)]))


NOTE: Table TRAIN contains decompressed images.
NOTE: 648 images were produced successfully out of 54 input images and saved as decompressed images to the Cloud Analytic Services table trainAug.

In [47]:
display_slide(slide_binaries, slide_paths, slide_width, slide_height, 16)


Build the CNN, the MicroNet


In [48]:
# Create the model tinyNet
micro_net = s.CASTable('microNet', replace=True)
micro_net_in = s.CASTable('microNet')
s.deepLearn.buildModel(model=micro_net, type='CNN')

# Input layer
s.deepLearn.addLayer(model=micro_net_in, name='images', 
                     layer=dict(type='input', nchannels=1, width=32, height=32))

# First Convolutional Layer
s.deepLearn.addLayer(model=micro_net_in, name='conv1',
                     layer=dict(type='convolution', nFilters=1, width=3, height=3, stride=1,
                     init='NORMAL', std=0.1, truncationfactor=2, act='RELU'),
                     srcLayers=['images'])

# First pooling layer
s.deepLearn.addLayer(model=micro_net_in, name='pool1',
                     layer=dict(type='pooling', width=3, height=3, stride=3, pool='max'),
                     srcLayers=['conv1'])
                      
# Second Convolutional Layer
s.deepLearn.addLayer(model=micro_net_in, name='conv2',
                     layer=dict(type='convolution', nFilters=2, width=3, height=3, stride=1,
                     init='NORMAL', std=0.1, truncationfactor=2, act='RELU'),
                     srcLayers=['pool1'])

# Second pooling layer
s.deepLearn.addLayer(model=micro_net_in, name='pool2',
                     layer=dict(type='pooling', width=2, height=2, stride=2, pool='max'),
                     srcLayers=['conv2'])
                      
# Fully connected layer
s.deepLearn.addLayer(model=micro_net_in, name='fc1', 
                     layer=dict(type='fullconnect', n=2, act='relu', init='NORMAL', std=0.1, truncationfactor=2),
                     srcLayers=['pool2'])

# Softmax output layer
r = s.deepLearn.addLayer(model=micro_net_in, name='outlayer', 
                         layer=dict(type='output', act='softmax'),
                         srcLayers=['fc1'])
r


Out[48]:
§ OutputCasTables
casLib Name Rows Columns casTable
0 lcdemo micronet 101 5 CASTable('micronet', caslib='lcdemo')

elapsed 0.00629s · mem 1.05MB

(Prerun) Train MicroNet


In [49]:
# Train the model (this took over an hour with the 10 workers)
s.setsessopt(messageLevel='ERROR')
train_r = s.deepLearn.dltrain(model=micro_net_in,
                              table=train_aug,
                              seed=99,
                              nthreads=8,
                              input=['_image_','_label_'],
                              target='_label_',
                              nominal=['_label_'],
                              modelweights=dict(name='weights', replace=True),
                              learningOpts=dict(miniBatchSize=1, maxEpochs=20, learningRate=0.001, aSyncFreq=1, algorithm='ADAM'))
s.setsessopt(messageLevel='NOTE')
display(HTML("<h3>Number of parameters in MicroNet: %s" %  str(train_r.ModelInfo['Value'][10] + "<h3>")))


Number of parameters in MicroNet: 182

Plot Training Loss/Fit


In [50]:
train_r.OptIterHistory[['Loss', 'FitError']].plot(figsize = (12, 5));


Test MicroNet


In [51]:
# Score against validation data
weights = s.CASTable('weights', replace=True)
scored = s.CASTable('scored', replace=True)
r = s.dlscore(model=micro_net_in,
              initWeights=weights,
              table=test,
              copyVars=['_label_', "_image_"],
              layerOut=dict(name='layerOut', replace=True),
              casout=scored)
display(HTML("<h3>Misclassification error: %s" %  str(r.ScoreInfo['Value'][2] + "<h3>")))


Misclassification error: 9.090909

Confusion Matrix


In [52]:
# build a confusion matrix 
cmr = s.crosstab(table=scored, row='_label_', col='_dl_predname_')
cmr


Out[52]:
§ Crosstab
_label_ Col1 Col2
0 benign 32.0 2.0
1 malignant 3.0 18.0

mem 0.825MB

False Positive Example


In [53]:
wc = "_label_='benign' and _dl_predname_='malignant'"
scored_images = scored.query(wc).to_frame()
if (len(scored_images)):
    image = np.array(bytearray(scored_images['_image_'][0][0:32*32]))
    image = np.reshape(image, [32, 32])
    plt.imshow(image)
    plt.show()
else:
    display(HTML("<h3> No false positives! <h3>"))


False Negative Example


In [54]:
wc = "_label_='malignant' and _dl_predname_='benign'"
scored_images = scored.query(wc).to_frame()
if (len(scored_images)):
    image = np.array(bytearray(scored_images['_image_'][0][0:32*32]))
    image = np.reshape(image, [32, 32])
    plt.imshow(image)
    plt.show()
else:
    display(HTML("<h3> No false negatives! <h3>"))



In [55]:
display_slide(slide_binaries, slide_paths, slide_width, slide_height, 17)



In [56]:
display_slide(slide_binaries, slide_paths, slide_width, slide_height, 18)



In [ ]: