In [1]:
import datetime
from PIL import Image  # the bare "import Image" form only works with very old PIL installs
import gc
import numpy as np # pip install numpy --upgrade
import os
import random
from scipy import misc
import string
import time
import sys
import sklearn.metrics as skm
# Set some Theano config before initializing
os.environ["THEANO_FLAGS"] = "mode=FAST_RUN,device=cpu,floatX=float32,allow_gc=False,openmp=True"
import theano
# MatPlotLib - Setup for Jupyter notebook output
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt
# Our modules
import dwdii_bc_model_helper as bc
import bc_models as models
# And Keras so we can emit the version
import keras
random.seed(20275)
np.set_printoptions(precision=2)
In [2]:
# Print some upfront version and config settings
print "Python v" + sys.version
print "Numpy v: " + np.__version__
print "keras v: " + keras.__version__
print "device:", theano.config.device
print "floatX:", theano.config.floatX
print "mode:", theano.config.mode
print "openmp:", theano.config.openmp
print "allow_gc:", theano.config.allow_gc
In [3]:
imagePath = "/root/bc_data/ddsm-png.25"
#imagePath = "/root/bc_data/Data_Thresholded/DDSM"
trainImagePath = imagePath
trainDataPath = "../../data/ddsm_train.csv"
# 2-class experiment?
# If not, the default is the 3-class experiment
normalVsAbnormal = True
categories = bc.bcNumerics()
thesePathos = None #["benign", "malignant"]
if normalVsAbnormal:
    categories = bc.bcNormVsAbnormNumerics()
#
# Simulated training data
#
#trainImagePath = "/root/bc_data/simulated_images"
#trainDataPath = "/root/bc_data/simulated_images/simulated_images.csv"
#trainImagePath = "/root/bc_data/simulated_images_new"
#trainDataPath = "/root/bc_data/simulated_images_new/simulated_images.csv"
# Test data is always from ddsm_test.csv
testDataPath = "../../data/ddsm_test.csv"
imgResize = (150, 150)
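The category helpers return dictionaries mapping class names to integer codes. For readers without the helper module at hand, a sketch of the assumed shape of these mappings (the literals below are illustrative assumptions; the real values live in dwdii_bc_model_helper):
In [ ]:
# Illustrative sketch only -- the actual mappings come from bc.bcNumerics()
# and bc.bcNormVsAbnormNumerics().
exampleThreeClass = {"normal": 0, "benign": 1, "malignant": 2}
exampleTwoClass = {"normal": 0, "abnormal": 1}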
In [4]:
os.listdir('../../data')
Out[4]:
In [5]:
metaData, meta2, mCounts = bc.load_training_metadata(trainDataPath, balanceViaRemoval=True, verbose=True,
                                                     normalVsAbnormal=normalVsAbnormal)
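The balanceViaRemoval=True flag suggests the metadata loader balances the class distribution by dropping rows from over-represented classes. A minimal sketch of that idea, assuming rows of (imageName, label) tuples (the helper's actual logic may differ):
In [ ]:
# Minimal sketch of "balance via removal" (illustrative only; the real logic
# lives in bc.load_training_metadata). Assumes rows of (imageName, label) tuples.
import collections
def exampleBalanceViaRemoval(rows):
    byLabel = collections.defaultdict(list)
    for row in rows:
        byLabel[row[1]].append(row)
    # Downsample every class to the size of the smallest one
    smallest = min(len(v) for v in byLabel.values())
    balanced = []
    for label in byLabel:
        balanced.extend(random.sample(byLabel[label], smallest))
    return balanced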
In [6]:
# Load the training data for model experimentation
maxData = len(metaData)
X_data, Y_data = bc.load_data(trainDataPath, trainImagePath,
                              categories=categories,
                              maxData=maxData,
                              verboseFreq=50,
                              imgResize=imgResize,
                              thesePathos=thesePathos,
                              normalVsAbnormal=normalVsAbnormal)
print X_data.shape
print Y_data.shape
In [7]:
# Load the held-out test data
maxData = len(metaData)
X_test, Y_test = bc.load_data(testDataPath, imagePath,
                              categories=categories,
                              maxData=maxData,
                              verboseFreq=50,
                              imgResize=imgResize,
                              thesePathos=thesePathos,
                              normalVsAbnormal=normalVsAbnormal)
print X_test.shape
print Y_test.shape
In this section, we apply transformations to the existing images to increase the amount of training data, and add a bit of noise, in the hope of improving the overall training results.
In [8]:
imgDataGenCount = 3
transformCount = imgDataGenCount
newImgs = np.zeros([X_data.shape[0] * transformCount, X_data.shape[1], X_data.shape[2]])
newYs = np.zeros([Y_data.shape[0] * transformCount, Y_data.shape[1]], dtype=np.int8)
print newImgs.shape
print newYs.shape
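Each source image will contribute imgDataGenCount transformed copies, hence the preallocation above with X_data.shape[0] * transformCount rows.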
In [9]:
img = X_data[0]
img.shape
Out[9]:
In [10]:
ndx = 0
for i in range(X_data.shape[0]):
    img = X_data[i]
    for n in range(imgDataGenCount):
        imgX = models.imageDataGenTransform(img, Y_data[i])
        imgX = imgX.reshape(150, 150)
        #print imgX.shape
        newImgs[ndx] = imgX
        newYs[ndx] = Y_data[i]
        #misc.imsave("testX.png", imgX)
        ndx += 1
    #break
print("Done", str(datetime.datetime.now()))
In [11]:
X_data2 = np.concatenate((X_data, newImgs))
Y_data2 = np.concatenate((Y_data, newYs))
print X_data2.shape
print Y_data2.shape
In [12]:
performedTransforms = True
if performedTransforms:
    X_train = X_data2
    Y_train = Y_data2
else:
    X_train = X_data
    Y_train = Y_data
In [13]:
print X_train.shape
print X_test.shape
print Y_train.shape
print Y_test.shape
In [14]:
import collections
def yDist(y):
    bcCounts = collections.defaultdict(int)
    for a in range(0, y.shape[0]):
        bcCounts[y[a][0]] += 1
    return bcCounts
print "Y_train Dist: " + str(yDist(Y_train))
print "Y_test Dist: " + str(yDist(Y_test))
In [15]:
# Load the bc array for our count in the model definition
print categories
print len(categories)
In [16]:
# Force a fresh import of bc_models so code changes are picked up without restarting the kernel
del sys.modules['bc_models']
In [17]:
# Construct the model using our help function
import bc_models as models
model = models.bc_model_v03(len(categories), verbose=True,
                            input_shape=(1, X_train.shape[1], X_train.shape[2]))
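For readers without bc_models at hand, a shallow channels-first CNN along these general lines is consistent with the shapes used in this notebook. This is an illustrative assumption only, not bc_model_v03's actual architecture; the layer signatures and the optimizer choice assume a Keras 1-style API:
In [ ]:
# Illustrative sketch only -- the real model comes from bc_models.bc_model_v03.
from keras.models import Sequential
from keras.layers import Convolution2D, MaxPooling2D, Flatten, Dense, Dropout

def exampleShallowCnn(nb_classes, input_shape=(1, 150, 150)):
    m = Sequential()
    m.add(Convolution2D(32, 3, 3, activation='relu', input_shape=input_shape))
    m.add(MaxPooling2D(pool_size=(2, 2)))
    m.add(Convolution2D(64, 3, 3, activation='relu'))
    m.add(MaxPooling2D(pool_size=(2, 2)))
    m.add(Flatten())
    m.add(Dense(128, activation='relu'))
    m.add(Dropout(0.5))
    m.add(Dense(nb_classes, activation='softmax'))
    # Optimizer is an assumption; the loss matches the one named later in this notebook
    m.compile(loss='categorical_crossentropy', optimizer='adadelta', metrics=['accuracy'])
    return m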
In [18]:
loadWeights = False
weightsFileName = "dwdii-bc-v03-150-normVsAbnorm-13970-20170510.hdf5"
if loadWeights:
    model.load_weights('weights/' + weightsFileName)
In [19]:
# Reshape to channels-first (samples, 1, height, width), the input layout the CNN expects with the Theano backend
testX = X_test.reshape(X_test.shape[0], 1, X_test.shape[1], X_test.shape[2])
trainX = X_train.reshape(X_train.shape[0], 1, X_train.shape[1], X_train.shape[2])
In [26]:
print "Training start: " + str(datetime.datetime.now())
m, losses, acc = models.run_network([trainX, testX, Y_train, Y_test], model, batch=50, epochs=20, verbosity=1)
In [21]:
models.plot_losses(losses, acc)
In [22]:
fileLossesPng = '../../figures/plot_losses-' + weightsFileName + '.png'
plt.savefig(fileLossesPng)
In [45]:
model.save_weights('weights/' + weightsFileName, overwrite=True)
Initial results based on "normal" being masked as "benign":
Revised with "normal", "benign" and "malignant" labeled separately:
After creating fixed "train", "test" and "validate" data sets, using "train" and "test" as well as including the DDSM Benign cases:
bc_model_v01 - categorical_crossentropy
Using the "Data_Thresholded" images
Using the "simulated_images" images
Using the "simulated_images_new" images
In [ ]:
resultsValAcc = {}
#resultsValAcc["1"] = 0.6800
#resultsValAcc["2"] = 0.7260
#resultsValAcc["3"] = 0.6616
#resultsValAcc["03-27-2017"] = 0.6116
#resultsValAcc["04-02-2017"] = 0.4805
#resultsValAcc["04-03-2017"] = 0.5065
#resultsValAcc["04-05-2017"] = 0.5243
resultsValAcc[924] = 0.5628
resultsValAcc[2737] = 0.6326
resultsValAcc[5474] = 0.6138
import dwdii_test as dwdii
#cmp = matplotlib.colors.Colormap("Blues")
dwdii.barChart(resultsValAcc, filename="../../figures/shallowCnn_thresholded_2class_results_valacc.png", title="Shallow CNN - DDSM Data Thresholded 2 Class Test Accuracy", yAxisLabel="Accuracy %")
In [27]:
predictOutput = model.predict(testX, batch_size=32, verbose=1)
In [28]:
predClass = np.array(predictOutput[0]).argmax()
numBC = bc.reverseDict(categories)
numBC[predClass]
Out[28]:
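bc.reverseDict presumably just inverts the categories mapping, swapping keys and values, so an integer class code can be translated back to its class name.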
In [29]:
numBC[Y_test[0][0]]
Out[29]:
In [30]:
predClasses = []
for i in range(len(predictOutput)):
    arPred = np.array(predictOutput[i])
    predictionProb = arPred.max()
    predictionNdx = arPred.argmax()
    predClassName = numBC[predictionNdx]
    predClasses.append(predictionNdx)
    #print "{0}: {1} ({2})".format(i, predClassName, predictionProb)
In [31]:
# Use sklearn's helper method to generate the confusion matrix
cnf_matrix = skm.confusion_matrix(Y_test, predClasses)
cnf_matrix
Out[31]:
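As a quick, optional sanity check, overall and per-class accuracy can be read directly off the confusion matrix (trace over total, and diagonal over row sums):
In [ ]:
# Optional sanity check derived from the confusion matrix
overallAcc = np.trace(cnf_matrix) / float(cnf_matrix.sum())
perClassAcc = np.diag(cnf_matrix) / cnf_matrix.sum(axis=1).astype(float)
print "Overall accuracy: %.4f" % overallAcc
print "Per-class accuracy:", perClassAcc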
In [37]:
# Order class names by their integer code so they line up with the confusion matrix axes
class_names = [numBC[i] for i in sorted(numBC.keys())]
print class_names[1:3]
print class_names
np.set_printoptions(precision=2)
In [38]:
# Plot non-normalized confusion matrix
fileCfMatrix = '../../figures/confusion_matrix-' + weightsFileName + '.png'
plt.figure()
bc.plot_confusion_matrix(cnf_matrix, classes=class_names,
                         title='Confusion matrix, without normalization, \n' + weightsFileName)
plt.savefig(fileCfMatrix)
In [39]:
# Load the image we just saved
from IPython.display import Image
Image(filename=fileCfMatrix)
Out[39]:
In [43]:
# Plot normalized confusion matrix
fileCfMatrixNorm = '../../figures/confusion_matrix_norm-' + weightsFileName + '.png'
plt.figure()
bc.plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                         title='Normalized confusion matrix \n' + weightsFileName)
plt.savefig(fileCfMatrixNorm)
In [44]:
# Load the image we just saved
from IPython.display import Image
Image(filename=fileCfMatrixNorm)
Out[44]:
In [ ]: