In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
import SimpleITK as sitk
from PIL import Image
from scipy.misc import imread
%matplotlib inline
from IPython.display import clear_output
pd.options.mode.chained_assignment = None
In [2]:
annotations = pd.read_csv('../src/data/annotations.csv')
candidates = pd.read_csv('../src/data/candidates.csv')
In [3]:
annotations.head()
Out[3]:
In [4]:
candidates['class'].sum()
Out[4]:
In [5]:
len(annotations)
Out[5]:
In [6]:
candidates.info()
In [7]:
print len(candidates[candidates['class'] == 1])
print len(candidates[candidates['class'] == 0])
In [8]:
import multiprocessing
num_cores = multiprocessing.cpu_count()
print num_cores
The best way to move forward will be to undersample the negative class and then augment the positive class heavily to balance out the samples.
Get an initial subsample of the negative class and keep all of the positives, so that we have an 80/20 class distribution.
Create a training set in which we augment the minority class heavily by rotating, to get a 50/50 class distribution.
In [9]:
class CTScan(object):
    """Reads a .mhd CT scan and extracts a 2D patch around a candidate's world coordinates."""
    def __init__(self, filename = None, coords = None):
        self.filename = filename
        self.coords = coords
        self.ds = None
        self.image = None

    def reset_coords(self, coords):
        self.coords = coords

    def read_mhd_image(self):
        path = glob.glob('../data/raw/*/' + self.filename + '.mhd')
        self.ds = sitk.ReadImage(path[0])
        self.image = sitk.GetArrayFromImage(self.ds)

    def get_resolution(self):
        return self.ds.GetSpacing()

    def get_origin(self):
        return self.ds.GetOrigin()

    def get_ds(self):
        return self.ds

    def get_voxel_coords(self):
        # Convert world (mm) coordinates to voxel indices using origin and spacing.
        origin = self.get_origin()
        resolution = self.get_resolution()
        voxel_coords = [np.absolute(self.coords[j] - origin[j]) / resolution[j]
                        for j in range(len(self.coords))]
        return tuple(voxel_coords)

    def get_image(self):
        return self.image

    def get_subimage(self, width):
        self.read_mhd_image()
        x, y, z = self.get_voxel_coords()
        # Voxel coordinates come back as floats; cast to int before slicing.
        x, y, z = int(x), int(y), int(z)
        subImage = self.image[z, y - width/2:y + width/2, x - width/2:x + width/2]
        return subImage

    def normalizePlanes(self, npzarray):
        maxHU = 400.
        minHU = -1000.
        npzarray = (npzarray - minHU) / (maxHU - minHU)
        npzarray[npzarray > 1] = 1.
        npzarray[npzarray < 0] = 0.
        return npzarray

    def save_image(self, filename, width):
        image = self.get_subimage(width)
        image = self.normalizePlanes(image)
        Image.fromarray(image * 255).convert('L').save(filename)
In [10]:
positives = candidates[candidates['class']==1].index
negatives = candidates[candidates['class']==0].index
In [11]:
scan = CTScan(np.asarray(candidates.iloc[negatives[600]])[0],
              np.asarray(candidates.iloc[negatives[600]])[1:-1])
scan.read_mhd_image()
x, y, z = scan.get_voxel_coords()
image = scan.get_image()
dx, dy, dz = scan.get_resolution()
x0, y0, z0 = scan.get_origin()
In [13]:
filename = '1.3.6.1.4.1.14519.5.2.1.6279.6001.100398138793540579077826395208'
coords = (70.19, -140.93, 877.68)#[877.68, -140.93, 70.19]
scan = CTScan(filename, coords)
scan.read_mhd_image()
x, y, z = scan.get_voxel_coords()
image = scan.get_image()
dx, dy, dz = scan.get_resolution()
x0, y0, z0 = scan.get_origin()
In [15]:
positives
Out[15]:
In [16]:
np.random.seed(42)
negIndexes = np.random.choice(negatives, len(positives)*5, replace = False)
In [17]:
candidatesDf = candidates.iloc[list(positives)+list(negIndexes)]
In [18]:
from sklearn.cross_validation import train_test_split
X = candidatesDf.iloc[:,:-1]
y = candidatesDf.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)
In [19]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.20, random_state = 42)
In [20]:
len(X_train)
Out[20]:
In [21]:
X_train.to_pickle('traindata')
X_test.to_pickle('testdata')
X_val.to_pickle('valdata')
In [22]:
def normalizePlanes(npzarray):
    maxHU = 400.
    minHU = -1000.
    npzarray = (npzarray - minHU) / (maxHU - minHU)
    npzarray[npzarray > 1] = 1.
    npzarray[npzarray < 0] = 0.
    return npzarray
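normalizePlanes windows the Hounsfield units to [-1000, 400] and rescales them to [0, 1], clamping anything outside that range. A small worked example (sketch):
In [ ]:
normalizePlanes(np.array([-1200., -1000., -300., 400., 600.]))
# -> array([ 0. ,  0. ,  0.5,  1. ,  1. ])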
In [23]:
print 'number of positive cases are ' + str(y_train.sum())
print 'total set size is ' + str(len(y_train))
print 'percentage of positive cases are ' + str(y_train.sum()*1.0/len(y_train))
In [24]:
tempDf = X_train[y_train == 1]
tempDf = tempDf.set_index(X_train[y_train == 1].index + 1000000)
X_train_new = X_train.append(tempDf)
tempDf = tempDf.set_index(X_train[y_train == 1].index + 2000000)
X_train_new = X_train_new.append(tempDf)
ytemp = y_train.reindex(X_train[y_train == 1].index + 1000000)
ytemp.loc[:] = 1
y_train_new = y_train.append(ytemp)
ytemp = y_train.reindex(X_train[y_train == 1].index + 2000000)
ytemp.loc[:] = 1
y_train_new = y_train_new.append(ytemp)
print len(X_train_new), len(y_train_new)
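A quick sanity check of the class balance after duplicating the positive rows (a minimal sketch):
In [ ]:
# Fraction of positives in the augmented training labels.
print 'positive fraction is ' + str(y_train_new.mean())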
In [35]:
X_train_new.index
Out[35]:
In [26]:
from scipy.misc import imresize
from PIL import ImageEnhance
class PreProcessing(object):
    def __init__(self, image = None):
        self.image = image

    def subtract_mean(self):
        self.image = (self.image/255.0 - 0.25)*255
        return self.image

    def downsample_data(self):
        self.image = imresize(self.image, size=(40, 40), interp='bilinear', mode='L')
        return self.image

    def enhance_contrast(self):
        # ImageEnhance.Contrast returns an enhancer, not an image;
        # apply it to a PIL copy and convert back (factor of 2.0 is illustrative).
        enhancer = ImageEnhance.Contrast(Image.fromarray(np.uint8(self.image)))
        self.image = np.asarray(enhancer.enhance(2.0))
        return self.image
In [36]:
dirName = '../src/data/train/'
plt.figure(figsize = (10,10))
inp = imread(dirName + 'image_'+ str(30517) + '.jpg')
plt.subplot(221)
plt.imshow(inp)
plt.grid(False)
Pp = PreProcessing(inp)
inp2 = Pp.subtract_mean()
plt.subplot(222)
plt.imshow(inp2)
plt.grid(False)
#inp4 = Pp.enhance_contrast()
#plt.subplot(224)
#plt.imshow(inp4)
#plt.grid(False)
inp3 = Pp.downsample_data()
plt.subplot(223)
plt.imshow(inp3)
plt.grid(False)
#inp4 = Pp.enhance_contrast()
#plt.subplot(224)
#plt.imshow(inp4)
#plt.grid(False)
In [ ]:
dirName
In [37]:
import tflearn
In [38]:
y_train_new.values.astype(int)
Out[38]:
In [44]:
train_filenames =\
X_train_new.index.to_series().apply(lambda x:\
'../src/data/train/image_'+str(x)+'.jpg')
train_filenames.values.astype(str)
Out[44]:
In [45]:
dataset_file = 'traindatalabels.txt'
train_filenames =\
X_train_new.index.to_series().apply(lambda x:\
'../src/data/train/image_'+str(x)+'.jpg')
filenames = train_filenames.values.astype(str)
labels = y_train_new.values.astype(int)
traindata = np.zeros(filenames.size,\
dtype=[('var1', 'S36'), ('var2', int)])
traindata['var1'] = filenames
traindata['var2'] = labels
np.savetxt(dataset_file, traindata, fmt="%10s %d")
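Each line of traindatalabels.txt now holds an image path followed by its integer label, which is the plain-text listing format that build_hdf5_image_dataset expects with mode='file'. A quick check of the first entry (sketch):
In [ ]:
with open(dataset_file) as f:
    print f.readline().strip()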
In [48]:
# Build a HDF5 dataset (only required once)
from tflearn.data_utils import build_hdf5_image_dataset
build_hdf5_image_dataset(dataset_file, image_shape=(50, 50), mode='file', output_path='traindataset.h5', categorical_labels=True, normalize=True)
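The validation and test HDF5 files loaded below are built the same way. A minimal sketch, assuming label files 'valdatalabels.txt' and 'testdatalabels.txt' (hypothetical names) have been written for the validation and test splits in the same image-path/label format:
In [ ]:
build_hdf5_image_dataset('valdatalabels.txt', image_shape=(50, 50), mode='file',
                         output_path='../src/data/valdataset.h5',
                         categorical_labels=True, normalize=True)
build_hdf5_image_dataset('testdatalabels.txt', image_shape=(50, 50), mode='file',
                         output_path='../src/data/testdataset.h5',
                         categorical_labels=True, normalize=True)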
In [63]:
# Load HDF5 dataset
import h5py
h5f = h5py.File('traindataset.h5', 'r')
X_train_images = h5f['X']
Y_train_labels = h5f['Y']
h5f2 = h5py.File('../src/data/valdataset.h5', 'r')
X_val_images = h5f2['X']
Y_val_labels = h5f2['Y']
In [50]:
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.estimator import regression
from tflearn.data_preprocessing import ImagePreprocessing
from tflearn.data_augmentation import ImageAugmentation
In [51]:
# Make sure the data is normalized
img_prep = ImagePreprocessing()
img_prep.add_featurewise_zero_center()
img_prep.add_featurewise_stdnorm()
In [52]:
# Create extra synthetic training data by flipping, rotating and blurring the
# images on our data set.
img_aug = ImageAugmentation()
img_aug.add_random_flip_leftright()
img_aug.add_random_rotation(max_angle=25.)
img_aug.add_random_blur(sigma_max=3.)
In [53]:
# Input is a 50x50 image with 1 color channels (grayscale)
network = input_data(shape=[None, 50, 50, 1],
                     data_preprocessing=img_prep,
                     data_augmentation=img_aug)
In [54]:
# Step 1: Convolution
network = conv_2d(network, 50, 3, activation='relu')
In [55]:
# Step 2: Max pooling
network = max_pool_2d(network, 2)
In [56]:
# Step 3: Convolution again
network = conv_2d(network, 64, 3, activation='relu')
In [57]:
# Step 4: Convolution yet again
network = conv_2d(network, 64, 3, activation='relu')
In [58]:
# Step 5: Max pooling again
network = max_pool_2d(network, 2)
In [59]:
# Step 6: Fully-connected 512 node neural network
network = fully_connected(network, 512, activation='relu')
In [60]:
# Step 7: Dropout - throw away some data randomly during training to prevent over-fitting
network = dropout(network, 0.5)
In [61]:
# Step 8: Fully-connected neural network with two outputs (0=isn't a nodule, 1=is a nodule) to make the final prediction
network = fully_connected(network, 2, activation='softmax')
In [62]:
# Tell tflearn how we want to train the network
network = regression(network, optimizer='adam',
                     loss='categorical_crossentropy',
                     learning_rate=0.001)
In [65]:
# Wrap the network in a model object
model = tflearn.DNN(network, tensorboard_verbose=0, checkpoint_path='nodule-classifier.tfl.ckpt')
# Train it! We'll do 100 training passes and monitor it as it goes.
model.fit(X_train_images, Y_train_labels, n_epoch=100, shuffle=True,
          validation_set=(X_val_images, Y_val_labels),
          show_metric=True, batch_size=96,
          snapshot_epoch=True,
          run_id='nodule-classifier')
# Save model when training is complete to a file
model.save("nodule-classifier.tfl")
print("Network trained and saved as nodule-classifier.tfl!")
In [68]:
h5f2 = h5py.File('../src/data/testdataset.h5', 'r')
X_test_images = h5f2['X']
Y_test_labels = h5f2['Y']
In [69]:
model.predict(X_test_images)
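model.predict returns the softmax probabilities for the two classes. A minimal evaluation sketch, assuming the labels in the HDF5 file are one-hot encoded (as categorical_labels=True produces):
In [ ]:
predictions = np.argmax(model.predict(X_test_images), axis=1)
truth = np.argmax(Y_test_labels, axis=1)
print 'test accuracy: ' + str(np.mean(predictions == truth))
# tflearn's built-in evaluation gives the same metric:
print model.evaluate(X_test_images, Y_test_labels)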
In [ ]: