Debugging MNIST Digit Classification - Fully Connected Network


In [1]:
from __future__ import division, print_function
from keras.models import Sequential, load_model
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.layers.core import Dense, Dropout
from keras.utils import np_utils
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import os
%matplotlib inline


Using TensorFlow backend.

In [2]:
DATA_DIR = "../../data"
TRAIN_FILE = os.path.join(DATA_DIR, "mnist_train.csv")
TEST_FILE = os.path.join(DATA_DIR, "mnist_test.csv")

BATCH_SIZE = 128
NUM_CLASSES = 10
NUM_EPOCHS = 2

Prepare Data


In [3]:
def parse_file(filename):
    xdata, ydata = [], []
    # text mode so line.strip().split(",") works on both Python 2 and 3
    fin = open(filename, "r")
    i = 0
    for line in fin:
        if i % 10000 == 0:
            print("{:s}: {:d} lines read".format(
                os.path.basename(filename), i))
        cols = line.strip().split(",")
        # first column is the label; the remaining 784 columns are pixel
        # values, scaled from [0, 255] down to [0, 1]
        ydata.append(int(cols[0]))
        xdata.append([float(x) / 255. for x in cols[1:]])
        i += 1
    fin.close()
    print("{:s}: {:d} lines read".format(os.path.basename(filename), i))
    Y = np_utils.to_categorical(np.array(ydata), num_classes=NUM_CLASSES)
    X = np.array(xdata)
    return X, Y

Xtrain, Ytrain = parse_file(TRAIN_FILE)
Xtest, Ytest = parse_file(TEST_FILE)
print(Xtrain.shape, Ytrain.shape, Xtest.shape, Ytest.shape)


mnist_train.csv: 0 lines read
mnist_train.csv: 10000 lines read
mnist_train.csv: 20000 lines read
mnist_train.csv: 30000 lines read
mnist_train.csv: 40000 lines read
mnist_train.csv: 50000 lines read
mnist_train.csv: 60000 lines read
mnist_test.csv: 0 lines read
mnist_test.csv: 10000 lines read
(60000, 784) (60000, 10) (10000, 784) (10000, 10)
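
As a quick sanity check on the parsed data, a single digit can be rendered with the matplotlib import above (a minimal sketch; the sample index is arbitrary):

plt.imshow(Xtrain[0].reshape(28, 28), cmap="gray")
plt.title("label: {:d}".format(np.argmax(Ytrain[0])))
plt.show()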

Define Network

Model is similar to the one defined in the Keras example mnist_mlp.py, except that the second Dense layer here has 256 units instead of 512.


In [4]:
model = Sequential()
model.add(Dense(512, activation="relu", input_shape=(784,)))
model.add(Dropout(0.2))
model.add(Dense(256, activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(10, activation="softmax"))

In [5]:
model.compile(optimizer="adam", loss="categorical_crossentropy", 
              metrics=["accuracy"])

Network Summary


In [6]:
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_1 (Dense)              (None, 512)               401920    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               131328    
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 10)                2570      
=================================================================
Total params: 535,818
Trainable params: 535,818
Non-trainable params: 0
_________________________________________________________________
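
As a sanity check on the summary, each Dense layer contributes (fan_in + 1) * fan_out parameters, the +1 accounting for the bias vector:

assert (784 + 1) * 512 == 401920  # dense_1
assert (512 + 1) * 256 == 131328  # dense_2
assert (256 + 1) * 10 == 2570     # dense_3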

In [7]:
for layer in model.layers:
    print(layer.name, layer.input.shape, layer.output.shape)


dense_1 (?, 784) (?, 512)
dropout_1 (?, 512) (?, 512)
dense_2 (?, 512) (?, 256)
dropout_2 (?, 256) (?, 256)
dense_3 (?, 256) (?, 10)

Weights Callback


In [8]:
from keras import backend as K
from keras.callbacks import Callback

def calc_stats(W):
    # L2 norm, mean, and standard deviation of a flattened weight array
    return np.linalg.norm(W, 2), np.mean(W), np.std(W)

class MyDebugWeights(Callback):
    
    def __init__(self):
        super(MyDebugWeights, self).__init__()
        self.weights = []
        self.tf_session = K.get_session()
            
    def on_epoch_end(self, epoch, logs=None):
        # snapshot statistics for every weight tensor after each epoch
        for layer in self.model.layers:
            name = layer.name
            for i, w in enumerate(layer.weights):
                w_value = w.eval(session=self.tf_session)
                w_norm, w_mean, w_std = calc_stats(np.reshape(w_value, -1))
                self.weights.append((epoch, "{:s}/W_{:d}".format(name, i), 
                                     w_norm, w_mean, w_std))
    
    def on_train_end(self, logs=None):
        # dump the collected (epoch, weight, norm, mean, std) records
        for e, k, n, m, s in self.weights:
            print("{:3d} {:20s} {:7.3f} {:7.3f} {:7.3f}".format(e, k, n, m, s))
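
Note that the norm reported by calc_stats grows roughly with the square root of the number of elements, so weight norms are best compared within the same layer across epochs rather than across layers of different sizes. A minimal illustration on synthetic weights:

W = np.random.normal(0, 0.05, (784, 512))
print(calc_stats(W.reshape(-1)))  # norm ~ 0.05 * sqrt(784 * 512) ~ 31.7; mean ~ 0; std ~ 0.05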

Train Network, Collect Weights


In [22]:
my_debug_weights = MyDebugWeights()
history = model.fit(Xtrain, Ytrain, batch_size=BATCH_SIZE, 
                    epochs=NUM_EPOCHS,
                    validation_split=0.1,
                    callbacks=[my_debug_weights])

# Outputs from five successive runs of this cell are preserved below as
# comments; note how the weight norms grow steadily from run to run.

# Train on 54000 samples, validate on 6000 samples
# Epoch 1/2
# 54000/54000 [==============================] - 4s - loss: 0.2830 - acc: 0.9146 - val_loss: 0.0979 - val_acc: 0.9718
# Epoch 2/2
# 54000/54000 [==============================] - 3s - loss: 0.1118 - acc: 0.9663 - val_loss: 0.0758 - val_acc: 0.9773
#   0 dense_1/W_0           28.236  -0.002   0.045
#   0 dense_1/W_1            0.283   0.003   0.012
#   0 dense_2/W_0           20.631   0.002   0.057
#   0 dense_2/W_1            0.205   0.008   0.010
#   0 dense_3/W_0            4.962  -0.005   0.098
#   0 dense_3/W_1            0.023  -0.001   0.007
#   1 dense_1/W_0           30.455  -0.003   0.048
#   1 dense_1/W_1            0.358   0.003   0.016
#   1 dense_2/W_0           21.989   0.002   0.061
#   1 dense_2/W_1            0.273   0.010   0.014
#   1 dense_3/W_0            5.282  -0.008   0.104
#   1 dense_3/W_1            0.040  -0.002   0.013

# Train on 54000 samples, validate on 6000 samples
# Epoch 1/2
# 54000/54000 [==============================] - 3s - loss: 0.0796 - acc: 0.9753 - val_loss: 0.0658 - val_acc: 0.9820
# Epoch 2/2
# 54000/54000 [==============================] - 3s - loss: 0.0607 - acc: 0.9803 - val_loss: 0.0745 - val_acc: 0.9803
#   0 dense_1/W_0           32.546  -0.004   0.051
#   0 dense_1/W_1            0.430   0.002   0.019
#   0 dense_2/W_0           23.191   0.001   0.064
#   0 dense_2/W_1            0.338   0.011   0.018
#   0 dense_3/W_0            5.535  -0.009   0.109
#   0 dense_3/W_1            0.060  -0.003   0.019
#   1 dense_1/W_0           34.445  -0.004   0.054
#   1 dense_1/W_1            0.490   0.001   0.022
#   1 dense_2/W_0           24.277   0.001   0.067
#   1 dense_2/W_1            0.420   0.012   0.023
#   1 dense_3/W_0            5.758  -0.011   0.113
#   1 dense_3/W_1            0.081  -0.004   0.025

# Train on 54000 samples, validate on 6000 samples
# Epoch 1/2
# 54000/54000 [==============================] - 3s - loss: 0.0473 - acc: 0.9854 - val_loss: 0.0686 - val_acc: 0.9807
# Epoch 2/2
# 54000/54000 [==============================] - 3s - loss: 0.0414 - acc: 0.9864 - val_loss: 0.0668 - val_acc: 0.9808
#   0 dense_1/W_0           36.141  -0.005   0.057
#   0 dense_1/W_1            0.551   0.000   0.024
#   0 dense_2/W_0           25.319   0.001   0.070
#   0 dense_2/W_1            0.497   0.014   0.028
#   0 dense_3/W_0            5.970  -0.012   0.117
#   0 dense_3/W_1            0.098  -0.004   0.031
#   1 dense_1/W_0           37.838  -0.005   0.060
#   1 dense_1/W_1            0.600  -0.001   0.026
#   1 dense_2/W_0           26.329   0.000   0.073
#   1 dense_2/W_1            0.567   0.014   0.033
#   1 dense_3/W_0            6.136  -0.014   0.121
#   1 dense_3/W_1            0.117  -0.005   0.037

# Train on 54000 samples, validate on 6000 samples
# Epoch 1/2
# 54000/54000 [==============================] - 3s - loss: 0.0354 - acc: 0.9876 - val_loss: 0.0619 - val_acc: 0.9837
# Epoch 2/2
# 54000/54000 [==============================] - 3s - loss: 0.0336 - acc: 0.9890 - val_loss: 0.0634 - val_acc: 0.9840
#   0 dense_1/W_0           39.366  -0.005   0.062
#   0 dense_1/W_1            0.638  -0.002   0.028
#   0 dense_2/W_0           27.335  -0.000   0.076
#   0 dense_2/W_1            0.643   0.014   0.038
#   0 dense_3/W_0            6.311  -0.015   0.124
#   0 dense_3/W_1            0.135  -0.005   0.043
#   1 dense_1/W_0           40.956  -0.006   0.064
#   1 dense_1/W_1            0.702  -0.003   0.031
#   1 dense_2/W_0           28.255  -0.001   0.078
#   1 dense_2/W_1            0.713   0.015   0.042
#   1 dense_3/W_0            6.442  -0.016   0.126
#   1 dense_3/W_1            0.150  -0.006   0.047

# Train on 54000 samples, validate on 6000 samples
# Epoch 1/2
# 54000/54000 [==============================] - 3s - loss: 0.0280 - acc: 0.9906 - val_loss: 0.0732 - val_acc: 0.9818
# Epoch 2/2
# 54000/54000 [==============================] - 3s - loss: 0.0265 - acc: 0.9909 - val_loss: 0.0727 - val_acc: 0.9812
#   0 dense_1/W_0           42.340  -0.006   0.067
#   0 dense_1/W_1            0.733  -0.005   0.032
#   0 dense_2/W_0           29.111  -0.001   0.080
#   0 dense_2/W_1            0.776   0.015   0.046
#   0 dense_3/W_0            6.574  -0.017   0.129
#   0 dense_3/W_1            0.161  -0.006   0.051
#   1 dense_1/W_0           43.770  -0.007   0.069
#   1 dense_1/W_1            0.781  -0.005   0.034
#   1 dense_2/W_0           30.020  -0.002   0.083
#   1 dense_2/W_1            0.874   0.016   0.052
#   1 dense_3/W_0            6.704  -0.018   0.131
#   1 dense_3/W_1            0.189  -0.007   0.059


Train on 54000 samples, validate on 6000 samples
Epoch 1/2
54000/54000 [==============================] - 3s - loss: 0.0280 - acc: 0.9906 - val_loss: 0.0732 - val_acc: 0.9818
Epoch 2/2
54000/54000 [==============================] - 3s - loss: 0.0265 - acc: 0.9909 - val_loss: 0.0727 - val_acc: 0.9812
  0 dense_1/W_0           42.340  -0.006   0.067
  0 dense_1/W_1            0.733  -0.005   0.032
  0 dense_2/W_0           29.111  -0.001   0.080
  0 dense_2/W_1            0.776   0.015   0.046
  0 dense_3/W_0            6.574  -0.017   0.129
  0 dense_3/W_1            0.161  -0.006   0.051
  1 dense_1/W_0           43.770  -0.007   0.069
  1 dense_1/W_1            0.781  -0.005   0.034
  1 dense_2/W_0           30.020  -0.002   0.083
  1 dense_2/W_1            0.874   0.016   0.052
  1 dense_3/W_0            6.704  -0.018   0.131
  1 dense_3/W_1            0.189  -0.007   0.059
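
The same statistics are easier to scan as a plot; here is a minimal sketch (assuming the my_debug_weights object populated by the run above) charting each weight's L2 norm per epoch:

from collections import defaultdict
norms = defaultdict(list)
for epoch, name, w_norm, w_mean, w_std in my_debug_weights.weights:
    norms[name].append(w_norm)
for name, values in sorted(norms.items()):
    plt.plot(values, marker="o", label=name)
plt.xlabel("epoch")
plt.ylabel("L2 norm of weights")
plt.legend(loc="best")
plt.show()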

Collect Intermediate Outputs


In [23]:
def get_outputs(inputs, model):
    # backend functions from one layer's input tensor to a later layer's
    # output (the digits in each name are the layer indices); chained so
    # intermediate activations can be inspected stage by stage, with
    # learning_phase=1 to keep dropout active as during training
    layer_01_fn = K.function([model.layers[0].input, K.learning_phase()], 
                             [model.layers[1].output]) 
    layer_23_fn = K.function([model.layers[2].input, K.learning_phase()],
                             [model.layers[3].output])
    layer_44_fn = K.function([model.layers[4].input, K.learning_phase()],
                             [model.layers[4].output])
    layer_1_out = layer_01_fn([inputs, 1])[0]
    layer_3_out = layer_23_fn([layer_1_out, 1])[0]
    layer_4_out = layer_44_fn([layer_3_out, 1])[0]
    return layer_1_out, layer_3_out, layer_4_out

out_1, out_3, out_4 = get_outputs(Xtest[0:10], model)
print("out_1", calc_stats(out_1))
print("out_3", calc_stats(out_3))
print("out_4", calc_stats(out_4))

# out_1 (15.320195, 0.15846619, 0.36553052)
# out_3 (31.983685, 0.52617866, 0.82984859)
# out_4 (1.4138139, 0.1, 0.29160777)

# out_1 (15.458527, 0.15253167, 0.38208964)
# out_3 (33.913242, 0.54224658, 0.90698332)
# out_4 (1.4142052, 0.1, 0.28973988)

# out_1 (16.639494, 0.15411146, 0.41647691)
# out_3 (35.837318, 0.58614647, 0.99438524)
# out_4 (1.4156684, 0.1, 0.29898632)

# out_1 (16.877953, 0.15098023, 0.43457347)
# out_3 (36.548904, 0.59088105, 1.0605338)
# out_4 (1.414073, 0.1, 0.29486296)

# out_1 (18.307556, 0.16563581, 0.47472247)
# out_3 (42.404495, 0.64846009, 1.242806)
# out_4 (1.4240878, 0.1, 0.29720506)


out_1 (18.307556, 0.16563581, 0.47472247)
out_3 (42.404495, 0.64846009, 1.242806)
out_4 (1.4240878, 0.1, 0.29720506)
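
Chaining per-layer functions as above works, but the same activations can also be fetched in a single pass from the original input with one backend function; a minimal sketch of that alternative:

all_out_fn = K.function([model.layers[0].input, K.learning_phase()],
                        [model.layers[1].output,
                         model.layers[3].output,
                         model.layers[4].output])
out_1, out_3, out_4 = all_out_fn([Xtest[0:10], 1])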

Collect Intermediate Gradients


In [24]:
def get_gradients(inputs, labels, model):
    # symbolic gradients of the total loss with respect to every weight
    opt = model.optimizer
    loss = model.total_loss
    weights = model.weights
    grads = opt.get_gradients(loss, weights)
    # the loss tensor also depends on the sample weights and targets, so
    # these internal Keras placeholders must be fed along with the inputs
    grad_fn = K.function(inputs=[model.inputs[0], 
                                 model.sample_weights[0],
                                 model.targets[0],
                                 K.learning_phase()], 
                         outputs=grads)
    grad_values = grad_fn([inputs, np.ones(len(inputs)), labels, 1])
    return grad_values

gradients = get_gradients(Xtest[0:10], Ytest[0:10], model)
for i in range(len(gradients)):
    print("grad_{:d}".format(i), calc_stats(gradients[i]))

# grad_0 (1.7725379, 1.1711028e-05, 0.0028093776)
# grad_1 (0.17403033, 3.4195516e-05, 0.0076910509)
# grad_2 (1.2508092, -7.3888972e-05, 0.003460743)
# grad_3 (0.12154519, -0.00047613602, 0.0075816377)
# grad_4 (1.5319482, 4.8748915e-11, 0.030318365)
# grad_5 (0.10286356, -4.6566129e-11, 0.032528315)

# grad_0 (3.4017127, 8.7506611e-05, 0.0053710202)
# grad_1 (0.33252886, 0.00055375684, 0.014685402)
# grad_2 (1.9467239, -3.3674216e-05, 0.0053783408)
# grad_3 (0.16811177, -0.00019758131, 0.010505128)
# grad_4 (1.8920149, -3.4779077e-10, 0.037405979)
# grad_5 (0.11266962, -2.7939678e-10, 0.035629261)

# grad_0 (4.4856653, 0.00014608752, 0.0070793224)
# grad_1 (0.43840903, 0.00093970483, 0.019352324)
# grad_2 (2.4390073, 9.5780408e-05, 0.006736787)
# grad_3 (0.19859995, 0.00049467472, 0.012402636)
# grad_4 (2.9728518, -1.4736087e-10, 0.058762152)
# grad_5 (0.13749355, -6.9849196e-11, 0.043479279)

# grad_0 (0.94408065, 5.3343301e-06, 0.0014902415)
# grad_1 (0.092352077, 3.1091229e-05, 0.0040813056)
# grad_2 (0.57179779, -2.3590032e-05, 0.0015793034)
# grad_3 (0.043331128, -0.00013161075, 0.0027049957)
# grad_4 (0.63560385, 1.2043984e-10, 0.012562943)
# grad_5 (0.028290441, -1.7062121e-10, 0.0089462232)

# grad_0 (4.5891175, -7.2404553e-05, 0.0072430321)
# grad_1 (0.44867462, -0.00047407666, 0.019823136)
# grad_2 (2.6217206, 2.5737674e-05, 0.0072415713)
# grad_3 (0.18206903, 0.00012690801, 0.011378606)
# grad_4 (3.2452161, -2.9717739e-11, 0.064140067)
# grad_5 (0.12291637, -3.783498e-10, 0.038869571)


grad_0 (4.5891175, -7.2404553e-05, 0.0072430321)
grad_1 (0.44867462, -0.00047407666, 0.019823136)
grad_2 (2.6217206, 2.5737674e-05, 0.0072415713)
grad_3 (0.18206903, 0.00012690801, 0.011378606)
grad_4 (3.2452161, -2.9717739e-11, 0.064140067)
grad_5 (0.12291637, -3.783498e-10, 0.038869571)
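
The grad_{i} indices follow the ordering of model.weights inside get_gradients, so each gradient can be labeled with the weight it belongs to; a minimal sketch (flattening first, since on a 2-D array np.linalg.norm(..., 2) computes the matrix 2-norm rather than the vector norm used for the weight statistics):

for w, g in zip(model.weights, gradients):
    print(w.name, calc_stats(np.reshape(g, -1)))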
