Lung cancer is the leading cause of cancer-related death worldwide. Screening high-risk individuals for lung cancer with low-dose CT scans is now being implemented in the United States, and other countries are expected to follow soon. In CT lung cancer screening, many millions of CT scans will have to be analyzed, which is an enormous burden for radiologists. There is therefore a lot of interest in developing computer algorithms to optimize screening.
The upcoming high-profile Coding4Cancer challenge invites coders to create the best computer algorithm that can identify a person as having lung cancer based on one or multiple low-dose CT images.
To solve the Coding4Cancer challenge and detect lung cancer at an early stage, pulmonary nodules, the early manifestation of lung cancers, have to be located. Many computer-aided detection (CAD) systems have already been proposed for this task. The LUNA16 challenge will focus on a large-scale evaluation of automatic nodule detection algorithms on the publicly available LIDC/IDRI dataset.
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#import seaborn as sns
import os
import glob
import SimpleITK as sitk
from PIL import Image
from scipy.misc import imread  # removed in SciPy >= 1.2; this notebook assumes an older SciPy
%matplotlib inline
from IPython.display import clear_output
pd.options.mode.chained_assignment = None
In [3]:
annotations = pd.read_csv('../../input/luna16/annotations.csv')
candidates = pd.read_csv('../../input/luna16/candidates.csv')
In [6]:
annotations.head()
Out[6]:
In [7]:
candidates['class'].sum()
Out[7]:
In [8]:
len(annotations)
Out[8]:
In [9]:
candidates.info()
In [12]:
print(len(candidates[candidates['class'] == 1]))
print(len(candidates[candidates['class'] == 0]))
In [8]:
import multiprocessing
num_cores = multiprocessing.cpu_count()
print(num_cores)
The best way to move forward will be to undersample the negative class and then augment the positive class heavily to balance out the samples.
Get an initial subsample of the negative class and keep all of the positives, so that we have an 80/20 class distribution.
Create a training set in which we augment the minority class heavily by rotating, to move toward a 50/50 class distribution.
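As a rough sanity check of those ratios, here is a small sketch; the positive count is a stand-in, not read from the candidates file:
P = 1351                             # stand-in for len(positives), computed later
N_sub = 5 * P                        # negatives kept after 5:1 undersampling
print(P / float(P + N_sub))          # ~0.17 positive, close to the 80/20 target
P_aug = 3 * P                        # each positive duplicated twice below
print(P_aug / float(P_aug + N_sub))  # ~0.375; training-time rotations close the rest of the gap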
In [9]:
class CTScan(object):
    def __init__(self, filename=None, coords=None):
        self.filename = filename
        self.coords = coords
        self.ds = None
        self.image = None

    def reset_coords(self, coords):
        self.coords = coords

    def read_mhd_image(self):
        path = glob.glob('../data/raw/*/' + self.filename + '.mhd')
        self.ds = sitk.ReadImage(path[0])
        self.image = sitk.GetArrayFromImage(self.ds)

    def get_resolution(self):
        return self.ds.GetSpacing()

    def get_origin(self):
        return self.ds.GetOrigin()

    def get_ds(self):
        return self.ds

    def get_voxel_coords(self):
        # World coordinates (mm) -> voxel indices, via origin and spacing.
        origin = self.get_origin()
        resolution = self.get_resolution()
        voxel_coords = [np.absolute(self.coords[j] - origin[j]) / resolution[j]
                        for j in range(len(self.coords))]
        return tuple(voxel_coords)

    def get_image(self):
        return self.image

    def get_subimage(self, width):
        # Extract a width x width axial patch centred on the candidate.
        self.read_mhd_image()
        x, y, z = self.get_voxel_coords()
        x, y, z = int(x), int(y), int(z)
        subImage = self.image[z, y - width // 2:y + width // 2,
                              x - width // 2:x + width // 2]
        return subImage

    def normalizePlanes(self, npzarray):
        # Clip HU values to [-1000, 400] and rescale to [0, 1].
        maxHU = 400.
        minHU = -1000.
        npzarray = (npzarray - minHU) / (maxHU - minHU)
        npzarray[npzarray > 1] = 1.
        npzarray[npzarray < 0] = 0.
        return npzarray

    def save_image(self, filename, width):
        # Save the normalized patch as an 8-bit grayscale image file.
        image = self.get_subimage(width)
        image = self.normalizePlanes(image)
        Image.fromarray(image * 255).convert('L').save(filename)
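The candidate coordinates in the CSV are world coordinates in millimetres, while the image array is indexed by voxel; get_voxel_coords maps between the two using the scan origin and voxel spacing. A minimal illustration of that mapping, with made-up origin, spacing, and world point:
# Made-up numbers purely to illustrate the world-to-voxel mapping:
origin = (-195.0, -195.0, -340.0)    # scan origin in mm (hypothetical)
spacing = (0.70, 0.70, 2.5)          # voxel size in mm (hypothetical)
world = (-55.0, -75.6, -90.0)        # candidate position in mm (hypothetical)
voxel = tuple(abs(w - o) / s for w, o, s in zip(world, origin, spacing))
print(voxel)                         # -> (200.0, ~170.6, 100.0)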
In [10]:
positives = candidates[candidates['class']==1].index
negatives = candidates[candidates['class']==0].index
In [11]:
scan = CTScan(np.asarray(candidates.iloc[negatives[600]])[0],
              np.asarray(candidates.iloc[negatives[600]])[1:-1])
scan.read_mhd_image()
x, y, z = scan.get_voxel_coords()
image = scan.get_image()
dx, dy, dz = scan.get_resolution()
x0, y0, z0 = scan.get_origin()
In [13]:
filename = '1.3.6.1.4.1.14519.5.2.1.6279.6001.100398138793540579077826395208'
coords = (70.19, -140.93, 877.68)#[877.68, -140.93, 70.19]
scan = CTScan(filename, coords)
scan.read_mhd_image()
x, y, z = scan.get_voxel_coords()
image = scan.get_image()
dx, dy, dz = scan.get_resolution()
x0, y0, z0 = scan.get_origin()
In [15]:
positives
Out[15]:
In [16]:
np.random.seed(42)
negIndexes = np.random.choice(negatives, len(positives)*5, replace = False)
In [17]:
candidatesDf = candidates.iloc[list(positives)+list(negIndexes)]
In [18]:
# sklearn.cross_validation was renamed sklearn.model_selection in scikit-learn 0.18
from sklearn.model_selection import train_test_split
X = candidatesDf.iloc[:,:-1]
y = candidatesDf.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)
In [19]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.20, random_state = 42)
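Splitting 20% off twice leaves roughly 64% of the candidate set for training, 16% for validation, and 20% for testing:
# 0.8 * 0.8 = 0.64 train, 0.8 * 0.2 = 0.16 val, 0.2 test
n = float(len(candidatesDf))
print(len(X_train) / n, len(X_val) / n, len(X_test) / n)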
In [20]:
len(X_train)
Out[20]:
In [21]:
X_train.to_pickle('traindata')
X_test.to_pickle('testdata')
X_val.to_pickle('valdata')
In [22]:
def normalizePlanes(npzarray):
    # Clip HU values to [-1000, 400] and rescale to [0, 1].
    maxHU = 400.
    minHU = -1000.
    npzarray = (npzarray - minHU) / (maxHU - minHU)
    npzarray[npzarray > 1] = 1.
    npzarray[npzarray < 0] = 0.
    return npzarray
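A quick check of the normalization on representative HU values (air at -1000, water at 0, the 400 ceiling, and a value above it):
print(normalizePlanes(np.array([-1000., 0., 400., 1000.])))
# -> [0.0, ~0.714, 1.0, 1.0]: everything outside [-1000, 400] HU is clipped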
In [23]:
print('number of positive cases: ' + str(y_train.sum()))
print('total training set size: ' + str(len(y_train)))
print('fraction of positive cases: ' + str(y_train.sum() * 1.0 / len(y_train)))
In [24]:
# Duplicate each positive training row twice under shifted indices, so the
# minority class appears three times in the new training set.
tempDf = X_train[y_train == 1]
tempDf = tempDf.set_index(X_train[y_train == 1].index + 1000000)
X_train_new = X_train.append(tempDf)
tempDf = tempDf.set_index(X_train[y_train == 1].index + 2000000)
X_train_new = X_train_new.append(tempDf)

ytemp = y_train.reindex(X_train[y_train == 1].index + 1000000)
ytemp.loc[:] = 1
y_train_new = y_train.append(ytemp)
ytemp = y_train.reindex(X_train[y_train == 1].index + 2000000)
ytemp.loc[:] = 1
y_train_new = y_train_new.append(ytemp)
print(len(X_train_new), len(y_train_new))
In [35]:
X_train_new.index
Out[35]:
In [26]:
from scipy.misc import imresize  # removed in SciPy >= 1.2, like imread above
from PIL import ImageEnhance

class PreProcessing(object):
    def __init__(self, image=None):
        self.image = image

    def subtract_mean(self):
        # Shift intensities down by a fixed mean of 0.25 (in [0, 1] units).
        self.image = (self.image / 255.0 - 0.25) * 255
        return self.image

    def downsample_data(self):
        # Resize the patch to 40x40 with bilinear interpolation.
        self.image = imresize(self.image, size=(40, 40), interp='bilinear', mode='L')
        return self.image

    def enhance_contrast(self):
        # ImageEnhance.Contrast expects a PIL image and returns an enhancer
        # object; enhance() must be called to get an image back. The factor
        # 2.0 here is an arbitrary illustrative choice.
        enhancer = ImageEnhance.Contrast(Image.fromarray(self.image))
        self.image = np.asarray(enhancer.enhance(2.0))
        return self.image
In [36]:
dirName = '../src/data/train/'
plt.figure(figsize = (10,10))
inp = imread(dirName + 'image_'+ str(30517) + '.jpg')
plt.subplot(221)
plt.imshow(inp)
plt.grid(False)
Pp = PreProcessing(inp)
inp2 = Pp.subtract_mean()
plt.subplot(222)
plt.imshow(inp2)
plt.grid(False)
inp3 = Pp.downsample_data()
plt.subplot(223)
plt.imshow(inp3)
plt.grid(False)
#inp4 = Pp.enhance_contrast()
#plt.subplot(224)
#plt.imshow(inp4)
#plt.grid(False)
In [ ]:
dirName
In [37]:
import tflearn
In [38]:
y_train_new.values.astype(int)
Out[38]:
In [44]:
train_filenames = X_train_new.index.to_series().apply(
    lambda x: '../src/data/train/image_' + str(x) + '.jpg')
train_filenames.values.astype(str)
Out[44]:
In [45]:
dataset_file = 'traindatalabels.txt'
train_filenames = X_train_new.index.to_series().apply(
    lambda x: '../src/data/train/image_' + str(x) + '.jpg')
filenames = train_filenames.values.astype(str)
labels = y_train_new.values.astype(int)
# Write "filename label" pairs in the plain-text format tflearn expects.
traindata = np.zeros(filenames.size,
                     dtype=[('var1', 'S36'), ('var2', int)])
traindata['var1'] = filenames
traindata['var2'] = labels
np.savetxt(dataset_file, traindata, fmt="%10s %d")
In [48]:
# Build a HDF5 dataset (only required once)
from tflearn.data_utils import build_hdf5_image_dataset
build_hdf5_image_dataset(dataset_file, image_shape=(50, 50), mode='file', output_path='traindataset.h5', categorical_labels=True, normalize=True)
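The validation and test HDF5 files loaded below are assumed to have been built the same way; the valdatalabels.txt and testdatalabels.txt label files here are hypothetical names following the pattern used for the training set:
# Hypothetical companion label files, built exactly like traindatalabels.txt:
build_hdf5_image_dataset('valdatalabels.txt', image_shape=(50, 50), mode='file',
                         output_path='../src/data/valdataset.h5',
                         categorical_labels=True, normalize=True)
build_hdf5_image_dataset('testdatalabels.txt', image_shape=(50, 50), mode='file',
                         output_path='../src/data/testdataset.h5',
                         categorical_labels=True, normalize=True)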
In [63]:
# Load HDF5 dataset
import h5py
h5f = h5py.File('traindataset.h5', 'r')
X_train_images = h5f['X']
Y_train_labels = h5f['Y']
h5f2 = h5py.File('../src/data/valdataset.h5', 'r')
X_val_images = h5f2['X']
Y_val_labels = h5f2['Y']
In [50]:
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.estimator import regression
from tflearn.data_preprocessing import ImagePreprocessing
from tflearn.data_augmentation import ImageAugmentation
In [51]:
# Make sure the data is normalized
img_prep = ImagePreprocessing()
img_prep.add_featurewise_zero_center()
img_prep.add_featurewise_stdnorm()
In [52]:
# Create extra synthetic training data by flipping, rotating and blurring the
# images on our data set.
img_aug = ImageAugmentation()
img_aug.add_random_flip_leftright()
img_aug.add_random_rotation(max_angle=25.)
img_aug.add_random_blur(sigma_max=3.)
In [53]:
# Input is a 50x50 image with 1 color channel (grayscale)
network = input_data(shape=[None, 50, 50, 1],
data_preprocessing=img_prep,
data_augmentation=img_aug)
In [54]:
# Step 1: Convolution
network = conv_2d(network, 50, 3, activation='relu')
In [55]:
# Step 2: Max pooling
network = max_pool_2d(network, 2)
In [56]:
# Step 3: Convolution again
network = conv_2d(network, 64, 3, activation='relu')
In [57]:
# Step 4: Convolution yet again
network = conv_2d(network, 64, 3, activation='relu')
In [58]:
# Step 5: Max pooling again
network = max_pool_2d(network, 2)
In [59]:
# Step 6: Fully-connected 512 node neural network
network = fully_connected(network, 512, activation='relu')
In [60]:
# Step 7: Dropout - throw away some data randomly during training to prevent over-fitting
network = dropout(network, 0.5)
In [61]:
# Step 8: Fully-connected neural network with two outputs (0=isn't a nodule, 1=is a nodule) to make the final prediction
network = fully_connected(network, 2, activation='softmax')
In [62]:
# Tell tflearn how we want to train the network
network = regression(network, optimizer='adam',
loss='categorical_crossentropy',
learning_rate=0.001)
In [65]:
# Wrap the network in a model object
model = tflearn.DNN(network, tensorboard_verbose=0, checkpoint_path='nodule-classifier.tfl.ckpt')
# Train it! We'll do 100 training passes and monitor it as it goes.
model.fit(X_train_images, Y_train_labels, n_epoch=100, shuffle=True, validation_set=(X_val_images, Y_val_labels),
show_metric=True, batch_size=96,
snapshot_epoch=True,
run_id='nodule-classifier')
# Save model when training is complete to a file
model.save("nodule-classifier.tfl")
print("Network trained and saved as nodule-classifier.tfl!")
In [68]:
h5f2 = h5py.File('../src/data/testdataset.h5', 'r')
X_test_images = h5f2['X']
Y_test_labels = h5f2['Y']
In [69]:
model.predict(X_test_images)
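model.predict returns per-class softmax probabilities; one quick way to turn them into an accuracy number, assuming the one-hot labels built above:
predictions = np.asarray(model.predict(X_test_images))
pred_classes = np.argmax(predictions, axis=1)   # column 1 = "is a nodule"
true_classes = np.argmax(np.asarray(Y_test_labels), axis=1)
print('test accuracy:', np.mean(pred_classes == true_classes))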