This notebook looks at the mistakes made by the chosen model. Specifically, it computes the log loss for each class on the validation set and produces a Hinton diagram of the confusion matrix over all classes. Then it finds the worst-performing class and plots example images from that class after processing.
The aim is to write this so that the chosen model file can simply be swapped in, letting us try it on different models.
As in the holdout set testing notebook, we start by creating a set of predictions:
In [3]:
# copying imports from holdout notebook...
import pylearn2.utils
import pylearn2.config
import theano
import neukrill_net.dense_dataset
import neukrill_net.utils
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import holoviews as hl
%load_ext holoviews.ipython
import sklearn.metrics
In [4]:
cd ..
In [5]:
settings = neukrill_net.utils.Settings("settings.json")
run_settings = neukrill_net.utils.load_run_settings(
    "run_settings/alexnet_based_norm_global_8aug.json", settings, force=True)
In [6]:
# loading the model
model = pylearn2.utils.serial.load(run_settings['pickle abspath'])
In [7]:
# loading the data
dataset = neukrill_net.dense_dataset.DensePNGDataset(settings_path=run_settings['settings_path'],
                                                     run_settings=run_settings['run_settings_path'],
                                                     train_or_predict='train',
                                                     training_set_mode='validation', force=True)
In [8]:
# find a batch size (at least 500) that divides the validation set evenly
batch_size = 500
while dataset.X.shape[0]%batch_size != 0:
    batch_size += 1
n_batches = int(dataset.X.shape[0]/batch_size)
# set this batch size
model.set_batch_size(batch_size)
# compile Theano function mapping a batch of inputs to class probabilities
X = model.get_input_space().make_batch_theano()
Y = model.fprop(X)
f = theano.function([X],Y)
In [9]:
%%time
# run the network forward over the validation set in batches
y = np.zeros((dataset.X.shape[0],len(settings.classes)))
for i in xrange(n_batches):
    print("Batch {0} of {1}".format(i+1,n_batches))
    x_arg = dataset.X[i*batch_size:(i+1)*batch_size,:]
    if X.ndim > 2:
        # convolutional input spaces expect the topological (image) view
        x_arg = dataset.get_topological_view(x_arg)
    y[i*batch_size:(i+1)*batch_size,:] = (f(x_arg.astype(X.dtype).T))
In [10]:
# if predictions were made on augmented data, average the augmentation_factor
# consecutive predictions belonging to each original image
af = run_settings.get("augmentation_factor",1)
if af > 1:
    y_collapsed = np.zeros((int(dataset.X.shape[0]/af), len(settings.classes)))
    for i,(low,high) in enumerate(zip(range(0,dataset.y.shape[0],af),
                                      range(af,dataset.y.shape[0]+af,af))):
        y_collapsed[i,:] = np.mean(y[low:high,:], axis=0)
    y = y_collapsed
    labels = dataset.y[range(0,dataset.y.shape[0],af)]
else:
    labels = dataset.y
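Since the augmented copies of each image occupy contiguous blocks of af rows, the collapsing loop above is equivalent to a reshape and a mean. A minimal vectorised sketch, assuming that contiguous layout:
# vectorised equivalent of the collapse above (sketch)
y_collapsed = y.reshape(-1, af, y.shape[1]).mean(axis=1)
labels = dataset.y[::af]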
In [12]:
# split predictions by class and calculate a per-class log loss, comparing
# against a benchmark that always predicts the class prior
results = {}
uniform = {}
for c in range(121):
    class_mask = labels.ravel() == c
    l = np.zeros(y.shape[0])
    l[np.where(class_mask)] = 1
    p = np.zeros((y.shape[0],2))
    # benchmark: predict the empirical frequency of this class for every example
    # (cast to float to avoid Python 2 integer division truncating to zero)
    p[:,1] = float(sum(class_mask))/len(class_mask)
    p[:,0] = 1-p[:,1]
    uniform[c] = sklearn.metrics.log_loss(l,p)
    # network: the probability it assigns to this class
    p[:,1] = y[:,c]
    p[:,0] = 1-p[:,1]
    results[c] = sklearn.metrics.log_loss(l,p)
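Since the benchmark for each class is a constant prediction of that class's prevalence $q_c$, its log loss is just the binary entropy of the prevalence:

$$ -\left( q_c \log q_c + (1 - q_c) \log (1 - q_c) \right) $$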
Comparing the per-class score of the network against the prior benchmark:
In [13]:
plt.plot(range(121),[results[c] for c in range(121)], c="red", label="Network")
plt.plot(range(121),[uniform[c] for c in range(121)], c="blue", label="Prior")
plt.yscale("log")
l=plt.legend()
It may be better to take the improvement over the benchmark as a score:
In [14]:
differences = np.array([uniform[c]-results[c] for c in range(121)])
plt.plot(range(121),differences, c="green")
plt.title("Difference between network and uniform benchmark across classes")
plt.yscale("log")
Finding out which class is the worst by this measure:
In [15]:
worst = np.where(differences == max(differences))[0][0]
print("Worst performance on: {0}".format(settings.classes[worst]))
What's it being confused with?
In [16]:
inds = []
confdict = {}
# look at every validation example from the worst class and record the cases
# where the network's top prediction is some other class
for i,w in enumerate(labels.ravel() == worst):
    if w:
        predicted = np.argmax(y[i,:])
        if predicted != worst:
            inds.append(i)
            print("{0} confused with: {1}".format(settings.classes[worst],
                                                  settings.classes[predicted]))
            confdict[i] = str(settings.classes[predicted][0].upper()
                              +settings.classes[predicted][1:])
inds = np.array(inds)
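The loop above could also be vectorised (losing the printed messages); a minimal sketch recovering the same indices:
# top predicted class per example, masked to mispredicted members of the worst class
preds = y.argmax(axis=1)
inds = np.where((labels.ravel() == worst) & (preds != worst))[0]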
In [17]:
inds
Out[17]:
In [18]:
# indices are into the collapsed predictions, so multiply by the
# augmentation factor (8 here) to index the corresponding rows of dataset.X
example_images = dataset.get_topological_view(dataset.X[inds*8,:]).reshape(-1,48,48)
In [19]:
confdict
Out[19]:
In [20]:
%opts Image style(cmap='gray')
channels = hl.Image(example_images[0,:,:],label=confdict[inds[0]])
for i,j in enumerate(inds[1:]):
    # i+1 because the first image is already plotted above
    channels = channels + hl.Image(example_images[i+1],label=confdict[j])
print("These are all trichodesmium_puffs, labels indicate what they were confused with.")
channels
Out[20]:
In [21]:
# most probable class for each of the confused examples
confusion = []
for r in y[inds,:]:
    confusion.append(np.argmax(r))
In [22]:
# plot four random examples of each class that the worst class is confused with
channels = None
for c in list(set(confusion)):
    np.random.seed(42)
    cinds = np.where(labels == c)[0]
    cinds.setflags(write=True)
    np.random.shuffle(cinds)
    cinds = cinds[:4]
    c_examples = dataset.get_topological_view(
        dataset.X[cinds*8,:]).reshape(4,48,48)
    cs = str(settings.classes[c])
    cs = cs[:1].upper() + cs[1:]
    for i in range(4):
        if channels is None:
            channels = hl.Image(c_examples[i,:,:], label=cs)
        else:
            channels = channels + hl.Image(c_examples[i,:,:], label=cs)
channels
Out[22]:
In [23]:
N = y.shape[0]
In [24]:
# per-example contribution to the mean log loss: minus the log of the
# probability assigned to the true class, scaled by 1/N
logloss = lambda x: -(1./N)*np.log(x[0][x[1]])[0]
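This is one term of the multiclass log loss,

$$ -\frac{1}{N} \sum_{i=1}^{N} \log p_{i,y_i} $$

where $p_{i,y_i}$ is the probability the network assigns to the true class $y_i$ of example $i$, so summing it over all examples should reproduce sklearn's figure.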
Checking I've written this right:
In [25]:
sum(map(logloss,zip(y,labels)))
Out[25]:
In [26]:
sklearn.metrics.log_loss(labels,y)
Out[26]:
Now we can iterate over all the datapoints and find the worst:
In [27]:
ll = []
for i,(p,l) in enumerate(zip(y,labels)):
    ll.append((i,logloss((p,l))))
In [28]:
ll.sort(key=lambda x: x[-1])
In [29]:
# index of the datapoint with the worst (largest) log loss
worst = ll[-1][0]
ll[-1]
Out[29]:
Now we want to plot this datapoint alongside examples of the class it's being confused with. First, find that class and get some indices for its examples:
In [30]:
real = dataset.get_topological_view(dataset.X[worst*8:(worst*8)+1,:]).squeeze()
realname = settings.classes[labels[worst][0]]
# capitalise for holoviews...
realname = str(realname[0].upper() + realname[1:])
In [31]:
# the class the network actually predicted for this worst datapoint
confused = np.where(y[worst] == max(y[worst]))[0][0]
In [32]:
np.where(labels == confused)[0]
Out[32]:
In [33]:
settings.classes[confused]
Out[33]:
In [34]:
cexamples = dataset.get_topological_view(dataset.X[np.where(labels == confused)[0]*8,:]).squeeze()
In [35]:
channels = hl.Image(real,group=realname)
for c in cexamples:
    cs = str(settings.classes[confused])
    cs = cs[:1].upper() + cs[1:]
    channels += hl.Image(c, group=cs)
channels
Out[35]:
I guess the question is: how is the network supposed to tell the difference in this case? Would more resolution help?
Plotting the above with raw images:
In [36]:
rawX,_ = neukrill_net.utils.load_rawdata(dataset.settings.image_fnames,classes=settings.classes)
In [37]:
cexamples = [rawX[i] for i in np.where(labels == confused)[0]]
real = rawX[worst]
In [116]:
channels = hl.Image(real,group=realname)
for c in cexamples:
    cs = str(settings.classes[confused])
    cs = cs[:1].upper() + cs[1:]
    channels += hl.Image(c, group=cs)
channels
Out[116]:
Yeah, it looks like our resizing is losing some of the fine-grained detail. Although the protist_fuzzy_olive that was the worst has changed relatively little; it was already fuzzy.
In [90]:
# have to discretize y... not comfortable with this
discy = []
for p in y:
    discy.append(np.where(p == max(p))[0][0])
discy = np.array(discy)
In [91]:
discy
Out[91]:
In [92]:
labels.ravel()
Out[92]:
In [107]:
confmat = sklearn.metrics.confusion_matrix(labels.ravel(),discy)
In [108]:
confmat
Out[108]:
In [109]:
np.sum(confmat,axis=1)
Out[109]:
In [110]:
# normalise each row to sum to one, so entries approximate P(predicted | true)
confmat = confmat.astype(np.float)
for i,r in enumerate(np.sum(confmat, axis=1)):
    confmat[i] = confmat[i]*(1/r)
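Equivalently, as a vectorised one-liner (a sketch, assuming no row of the confusion matrix sums to zero):
confmat = confmat.astype(np.float) / confmat.sum(axis=1, keepdims=True)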
In [111]:
confmat
Out[111]:
In [59]:
import neukrill_net.hinton_diagram
In [113]:
neukrill_net.hinton_diagram.hinton(confmat)
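neukrill_net.hinton_diagram is part of this repository's code. For reference, a minimal stand-in along the lines of the standard matplotlib Hinton recipe would look something like this (a sketch, not the repository's implementation):
import numpy as np
import matplotlib.pyplot as plt

def hinton_sketch(matrix, max_weight=None, ax=None):
    """Draw a Hinton diagram: square area encodes magnitude, colour encodes sign."""
    ax = ax if ax is not None else plt.gca()
    if max_weight is None:
        # round the largest magnitude up to the next power of two
        max_weight = 2 ** np.ceil(np.log2(np.abs(matrix).max()))
    ax.patch.set_facecolor('gray')
    ax.set_aspect('equal', 'box')
    ax.xaxis.set_major_locator(plt.NullLocator())
    ax.yaxis.set_major_locator(plt.NullLocator())
    for (x, y), w in np.ndenumerate(matrix):
        colour = 'white' if w > 0 else 'black'
        size = np.sqrt(np.abs(w) / max_weight)
        ax.add_patch(plt.Rectangle([x - size / 2, y - size / 2], size, size,
                                   facecolor=colour, edgecolor=colour))
    ax.autoscale_view()
    ax.invert_yaxis()

hinton_sketch(confmat)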
Where are the big false positives? We can make a new array containing just those, with the diagonal zeroed out:
In [114]:
# removing diagonal elements (note: must copy, since slicing with [:]
# returns a view and would zero the diagonal of confmat too)
diags = np.where(np.eye(len(settings.classes)) == 1)
ndconfmat = confmat.copy()
ndconfmat[diags[0],diags[1]] = 0
neukrill_net.hinton_diagram.hinton(ndconfmat)
In [73]:
# which elements are the biggest?
for m in np.sort(ndconfmat.ravel())[-10:]:
    print("{0} at {1}".format(m,np.where(ndconfmat==m)))
So confusion is greatest between classes 10 and 11? We could plot examples of those two classes:
In [48]:
c10inds = np.where(labels==10)[0]
c11inds = np.where(labels==11)[0]
c10name = str(settings.classes[10][0].upper()+settings.classes[10][1:])
c11name = str(settings.classes[11][0].upper()+settings.classes[11][1:])
In [49]:
example = dataset.get_topological_view(
    dataset.X[c10inds[-2]*8:(c10inds[-2]*8)+1,:]).squeeze()
channels = hl.Image(example, label=c10name)
for i,j in zip(c10inds[:10],c11inds[:10]):
    # because I want them interleaved
    for k in ((i, c10name),(j, c11name)):
        example = dataset.get_topological_view(
            dataset.X[k[0]*8:(k[0]*8)+1,:]).squeeze()
        channels += hl.Image(example, group=k[1])
channels
Out[49]: