In [1]:
# Data
import numpy as np
from tensorflow.examples.tutorials.mnist import input_data
import impl.layer as l

# Dataset preparation and pre-processing
mnist = input_data.read_data_sets('data/MNIST_data/', one_hot=False)

X_train, y_train = mnist.train.images, mnist.train.labels
X_val, y_val = mnist.validation.images, mnist.validation.labels
X_test, y_test = mnist.test.images, mnist.test.labels


Extracting data/MNIST_data/train-images-idx3-ubyte.gz
Extracting data/MNIST_data/train-labels-idx1-ubyte.gz
Extracting data/MNIST_data/t10k-images-idx3-ubyte.gz
Extracting data/MNIST_data/t10k-labels-idx1-ubyte.gz
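
Note: the tensorflow.examples.tutorials.mnist reader used above is deprecated and no longer ships with TensorFlow 2.x. If it is unavailable in your install, roughly equivalent arrays can be built with the Keras dataset loader; this is a sketch under that assumption (it mirrors the reader's default 55,000/5,000 train/validation split and its [0, 1] float scaling):

import tensorflow as tf

(x_tr, y_tr), (x_te, y_te) = tf.keras.datasets.mnist.load_data()
x_tr = x_tr.reshape(-1, 784).astype(np.float32) / 255.0  # flatten 28x28 images, scale to [0, 1]
x_te = x_te.reshape(-1, 784).astype(np.float32) / 255.0

X_train, y_train = x_tr[:55000], y_tr[:55000].astype(np.int64)
X_val,   y_val   = x_tr[55000:], y_tr[55000:].astype(np.int64)
X_test,  y_test  = x_te,         y_te.astype(np.int64)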

In [2]:
# Pre-processing: normalizing
def normalize(X):
    # zero-center each pixel across the dataset; the reader already scales the 8-bit
    # grayscale values (0-255) to [0, 1], and the per-pixel std division is left disabled
    return (X - X.mean(axis=0)) #/ X.std(axis=0)

X_train, X_val, X_test = normalize(X=X_train), normalize(X=X_val), normalize(X=X_test)
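
Since the std division above is commented out, normalize only zero-centers each pixel. If full per-pixel standardization is wanted later, a small epsilon guards against division by zero on the constant all-black border pixels; the variant below is a sketch and is not what produced the results later in this notebook:

def standardize(X, eps=1e-8):
    # per-pixel zero mean and (approximately) unit variance
    return (X - X.mean(axis=0)) / (X.std(axis=0) + eps)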

In [3]:
# Model
import impl.layer as l  # re-imported so this cell can run on its own
from impl.loss import * # pull in all loss helpers from impl.loss
from sklearn.utils import shuffle as skshuffle

class FFNN:

    def __init__(self, D, C, H, L):
        self.L = L # number of hidden layers
        self.C = C # number of output classes
        self.losses = {'train':[], 'train_acc':[], 
                       'valid':[], 'valid_acc':[], 
                       'test':[], 'test_acc':[]}
        
        self.model = []
        self.grads = []
        low, high = -1, 1
        
        # Input layer: weights/ biases
        m = dict(W_pos=np.random.uniform(size=(D, H), low=low, high=high) / np.sqrt(D / 2.), 
                 W_neg=np.random.uniform(size=(D, H), low=low, high=high) / np.sqrt(D / 2.),
                 b_pos=np.zeros((1, H)),
                 b_neg=np.zeros((1, H)))
        self.model.append(m) # model[0]
        # Input layer: gradients
        self.grads.append({key: np.zeros_like(val) for key, val in self.model[0].items()})

        # Hidden layers: weights/ biases
        m_L = []
        for _ in range(L):
            m = dict(W_pos=np.random.uniform(size=(H, H), low=low, high=high) / np.sqrt(H / 2.), 
                     W_neg=np.random.uniform(size=(H, H), low=low, high=high) / np.sqrt(H / 2.),
                     b_pos=np.zeros((1, H)),
                     b_neg=np.zeros((1, H)))
            m_L.append(m)
        self.model.append(m_L) # model[1]
        # Hidden layer: gradients
        grad_L = []
        for _ in range(L):
            grad_L.append({key: np.zeros_like(val) for key, val in self.model[1][0].items()})
        self.grads.append(grad_L)
        
        # Output layer: weights/ biases
        m = dict(W_pos=np.random.uniform(size=(H, C), low=low, high=high) / np.sqrt(H / 2.),
                 W_neg=np.random.uniform(size=(H, C), low=low, high=high) / np.sqrt(H / 2.),
                 b_pos=np.zeros((1, C)),
                 b_neg=np.zeros((1, C)))
        self.model.append(m) # model[2]
        # Output layer: gradients
        self.grads.append({key: np.zeros_like(val) for key, val in self.model[2].items()})
        
    def fc_forward(self, X, W, b):
        out = (X @ W) + b
        cache = (W, X)
        return out, cache

    def fc_backward(self, dout, cache):
        W, X = cache

        dW = X.T @ dout                          # weight gradient
        db = np.sum(dout, axis=0).reshape(1, -1) # bias gradient, shape (1, n)
        dX = dout @ W.T                          # gradient w.r.t. the layer input

        return dX, dW, db

    def train_forward(self, X, train):
        caches = []
        
        # Input layer: the positive and negative indicator masks (X > 0) and (X < 0) are fed
        # as 0/1 inputs to separate weight matrices, and the two branch outputs are summed
        y_pos, fc_cache_pos = self.fc_forward(X=(X>0), W=self.model[0]['W_pos'], 
                                              b=self.model[0]['b_pos'])
        y_neg, fc_cache_neg = self.fc_forward(X=(X<0), W=self.model[0]['W_neg'], 
                                              b=self.model[0]['b_neg'])
        y = y_pos + y_neg
        if train:
            caches.append((fc_cache_pos, fc_cache_neg)) # caches[0]
        X = y.copy() # pass to the next layer
        
        # Hidden layers
        fc_caches = []
        for layer in range(self.L):
            y_pos, fc_cache_pos = self.fc_forward(X=(X>0), W=self.model[1][layer]['W_pos'], 
                                                  b=self.model[1][layer]['b_pos'])
            y_neg, fc_cache_neg = self.fc_forward(X=(X<0), W=self.model[1][layer]['W_neg'], 
                                                  b=self.model[1][layer]['b_neg'])
            y = y_pos + y_neg
            if train:
                fc_caches.append((fc_cache_pos, fc_cache_neg))
            X = y.copy() # pass to next layer
        if train:
            caches.append(fc_caches) # caches[1]            
        
        # Output layer
        y_pos, fc_cache_pos = self.fc_forward(X=(X>0), W=self.model[2]['W_pos'], 
                                              b=self.model[2]['b_pos'])
        y_neg, fc_cache_neg = self.fc_forward(X=(X<0), W=self.model[2]['W_neg'], 
                                              b=self.model[2]['b_neg'])
        y = y_pos + y_neg
        y_prob = l.softmax(X=y)
        if train:
            caches.append((fc_cache_pos, fc_cache_neg)) # caches[2]

        return y_prob, caches

    def cross_entropy(self, y_prob, y_train):
        m = y_prob.shape[0]

        # negative log-likelihood of the correct class for each example
        log_like = -np.log(y_prob[range(m), y_train])
        data_loss = np.sum(log_like) / m

        return data_loss

    def dcross_entropy(self, y_prob, y_train): # combined softmax + cross-entropy gradient: p - one_hot(y)
        m = y_prob.shape[0]

        grad_y = y_prob.copy() # copy so the caller's probabilities are not mutated in place
        grad_y[range(m), y_train] -= 1.
        grad_y /= m

        return grad_y

    def loss_function(self, y_prob, y_train):
        
        loss = self.cross_entropy(y_prob, y_train) # softmax is included
        dy = self.dcross_entropy(y_prob, y_train) # dsoftmax is included

        return loss, dy
        
    def train_backward(self, dy, caches):
        grads = self.grads.copy() # shallow copy; every entry is overwritten below
        
        # Output layer
        fc_cache_pos, fc_cache_neg = caches[2]
        # softmax_backward is included in dcross_entropy.
        dX_pos, dW_pos, db_pos = self.fc_backward(dout=dy, cache=fc_cache_pos)
        dX_neg, dW_neg, db_neg = self.fc_backward(dout=dy, cache=fc_cache_neg)
        dX = dX_pos + dX_neg
        dy = dX.copy()
        grads[2]['W_pos'] = dW_pos
        grads[2]['b_pos'] = db_pos
        grads[2]['W_neg'] = dW_neg
        grads[2]['b_neg'] = db_neg

        # Hidden layer
        fc_caches = caches[1]
        for layer in reversed(range(self.L)):
            fc_cache_pos, fc_cache_neg = fc_caches[layer]
            dX_pos, dW_pos, db_pos = self.fc_backward(dout=dy, cache=fc_cache_pos)
            dX_neg, dW_neg, db_neg = self.fc_backward(dout=dy, cache=fc_cache_neg)
            dX = dX_pos + dX_neg
            dy = dX.copy()
            grads[1][layer]['W_pos'] = dW_pos
            grads[1][layer]['b_pos'] = db_pos
            grads[1][layer]['W_neg'] = dW_neg
            grads[1][layer]['b_neg'] = db_neg
            
        # Input layer
        fc_cache_pos, fc_cache_neg = caches[0]
        dX_pos, dW_pos, db_pos = self.fc_backward(dout=dy, cache=fc_cache_pos)
        dX_neg, dW_neg, db_neg = self.fc_backward(dout=dy, cache=fc_cache_neg)
        dX = dX_pos + dX_neg
        dy = dX.copy()
        grads[0]['W_pos'] = dW_pos
        grads[0]['b_pos'] = db_pos
        grads[0]['W_neg'] = dW_neg
        grads[0]['b_neg'] = db_neg

        return grads
    
    def test(self, X):
        y_prob, _ = self.train_forward(X, train=False)
        
        # classification: predicted class is the arg max of the class probabilities
        y_pred = np.argmax(y_prob, axis=1)
        
        return y_pred, y_prob
        
    def get_minibatch(self, X, y, minibatch_size, shuffle):
        minibatches = []

        if shuffle:
            X, y = skshuffle(X, y)

        for i in range(0, X.shape[0], minibatch_size):
            X_mini = X[i:i + minibatch_size]
            y_mini = y[i:i + minibatch_size]
            minibatches.append((X_mini, y_mini))

        return minibatches

    def sgd(self, train_set, val_set, test_set, alpha, mb_size, n_iter, print_after):
        X_train, y_train = train_set
        X_val, y_val = val_set
        X_test, y_test = test_set

        # Epochs
        for iter in range(1, n_iter + 1):

            # Minibatches: reshuffle the training set and pick one random minibatch per iteration
            minibatches = self.get_minibatch(X_train, y_train, mb_size, shuffle=True)
            idx = np.random.randint(0, len(minibatches))
            X_mini, y_mini = minibatches[idx]
            
            # Train the model
            y_prob, caches = self.train_forward(X_mini, train=True)
            _, dy = self.loss_function(y_prob, y_mini)
            grads = self.train_backward(dy, caches)
            
            # Update the model for input layer
            for key in grads[0].keys():
                self.model[0][key] -= alpha * grads[0][key]

            # Update the model for the hidden layers
            for layer in range(self.L):
                for key in grads[1][layer].keys():
                    self.model[1][layer][key] -= alpha * grads[1][layer][key]

            # Update the model for output layer
            for key in grads[2].keys():
                self.model[2][key] -= alpha * grads[2][key]
            
            # Training loss and accuracy on the current minibatch
            y_pred, y_prob = self.test(X_mini)
            loss, _ = self.loss_function(y_prob, y_mini) # softmax is folded into the cross-entropy loss
            self.losses['train'].append(loss)
            acc = np.mean(y_pred == y_mini) # fraction of correct predictions
            self.losses['train_acc'].append(acc)

            # Validate the updated model
            y_pred, y_prob = self.test(X_val)
            valid_loss, _ = self.loss_function(y_prob, y_val)
            self.losses['valid'].append(valid_loss)
            valid_acc = np.mean(y_pred == y_val)
            self.losses['valid_acc'].append(valid_acc)
            
            # Evaluate the updated model on the test set
            y_pred, y_prob = self.test(X_test)
            test_loss, _ = self.loss_function(y_prob, y_test)
            self.losses['test'].append(test_loss)
            test_acc = np.mean(y_pred == y_test)
            self.losses['test_acc'].append(test_acc)
            
            # Print the model info: loss & accuracy or err & acc
            if iter % print_after == 0:
                print('Iter-{}, train loss-{:.4f}, acc-{:.4f}, valid loss-{:.4f}, acc-{:.4f}, test loss-{:.4f}, acc-{:.4f}'.format(
                   iter, loss, acc, valid_loss, valid_acc, test_loss, test_acc))
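
Before training, the fully-connected forward/backward pair can be sanity-checked against a numerical gradient. The snippet below is a quick sketch (it assumes the FFNN class above has been run and uses an arbitrary random upstream gradient R); because the layer is linear in W, the analytic and numerical gradients should agree to near machine precision:

np.random.seed(0)
net = FFNN(D=4, C=3, H=5, L=1)
X = np.random.randn(2, 4)
W = np.random.randn(4, 5)
b = np.random.randn(1, 5)

out, cache = net.fc_forward(X, W, b)
R = np.random.randn(*out.shape)                  # random upstream gradient
_, dW, _ = net.fc_backward(dout=R, cache=cache)  # analytic gradient w.r.t. W

eps = 1e-6
dW_num = np.zeros_like(W)
for i in range(W.shape[0]):
    for j in range(W.shape[1]):
        Wp, Wm = W.copy(), W.copy()
        Wp[i, j] += eps
        Wm[i, j] -= eps
        # centered finite difference of the scalar sum(out * R)
        dW_num[i, j] = (np.sum(net.fc_forward(X, Wp, b)[0] * R) -
                        np.sum(net.fc_forward(X, Wm, b)[0] * R)) / (2 * eps)

print('max |dW - dW_num| =', np.abs(dW - dW_num).max())  # should be tiny (~1e-9 or smaller)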

In [4]:
# Hyper-parameters
n_iter = 10000 # number of SGD iterations (one random minibatch each)
alpha = 1e-3 # learning rate
mb_size = 50 # minibatch size
print_after = 100 # print train/valid/test loss and accuracy every 100 iterations
num_hidden_units = 32 # hidden units per layer
num_input_units = X_train.shape[1] # input dimensionality (784 = 28x28 flattened pixels)
num_output_units = y_train.max() + 1 # number of classes in this classification problem
num_layers = 2 # number of hidden layers (depth)

# Build the model and train it
nn = FFNN(C=num_output_units, D=num_input_units, H=num_hidden_units, L=num_layers)

nn.sgd(train_set=(X_train, y_train), val_set=(X_val, y_val), test_set=(X_test, y_test),
       mb_size=mb_size, alpha=alpha, n_iter=n_iter, print_after=print_after)


Iter-100, train loss-2.4998, acc-0.1400, valid loss-2.5139, acc-0.1086, test loss-2.5169, acc-0.1106
Iter-200, train loss-2.5590, acc-0.0800, valid loss-2.5117, acc-0.1072, test loss-2.4993, acc-0.1031
Iter-300, train loss-2.5085, acc-0.1000, valid loss-2.4311, acc-0.0978, test loss-2.4218, acc-0.1018
Iter-400, train loss-2.4153, acc-0.0800, valid loss-2.4075, acc-0.0928, test loss-2.4167, acc-0.0828
Iter-500, train loss-2.3044, acc-0.1800, valid loss-2.3377, acc-0.1132, test loss-2.3347, acc-0.1126
Iter-600, train loss-2.2622, acc-0.1000, valid loss-2.3436, acc-0.0820, test loss-2.3329, acc-0.0890
Iter-700, train loss-2.2992, acc-0.1000, valid loss-2.3045, acc-0.0812, test loss-2.2976, acc-0.0872
Iter-800, train loss-2.3098, acc-0.0200, valid loss-2.3016, acc-0.0248, test loss-2.3023, acc-0.0242
Iter-900, train loss-2.3289, acc-0.0400, valid loss-2.2993, acc-0.0290, test loss-2.3051, acc-0.0278
Iter-1000, train loss-2.2862, acc-0.0400, valid loss-2.2902, acc-0.0342, test loss-2.2969, acc-0.0342
Iter-1100, train loss-2.3132, acc-0.1200, valid loss-2.2776, acc-0.2184, test loss-2.2812, acc-0.2116
Iter-1200, train loss-2.2764, acc-0.1800, valid loss-2.2684, acc-0.2076, test loss-2.2732, acc-0.1977
Iter-1300, train loss-2.2424, acc-0.2000, valid loss-2.2479, acc-0.1530, test loss-2.2545, acc-0.1431
Iter-1400, train loss-2.2739, acc-0.1000, valid loss-2.2434, acc-0.1476, test loss-2.2499, acc-0.1378
Iter-1500, train loss-2.2990, acc-0.1600, valid loss-2.2857, acc-0.2210, test loss-2.2879, acc-0.2082
Iter-1600, train loss-2.1654, acc-0.3200, valid loss-2.2748, acc-0.2258, test loss-2.2749, acc-0.2119
Iter-1700, train loss-2.2685, acc-0.2200, valid loss-2.2740, acc-0.1892, test loss-2.2777, acc-0.1770
Iter-1800, train loss-2.2390, acc-0.2600, valid loss-2.2699, acc-0.2122, test loss-2.2708, acc-0.2100
Iter-1900, train loss-2.2710, acc-0.1800, valid loss-2.2568, acc-0.2126, test loss-2.2646, acc-0.2010
Iter-2000, train loss-2.2940, acc-0.1400, valid loss-2.2507, acc-0.2072, test loss-2.2553, acc-0.2024
Iter-2100, train loss-2.2479, acc-0.2000, valid loss-2.2495, acc-0.1718, test loss-2.2524, acc-0.1690
Iter-2200, train loss-2.2500, acc-0.1600, valid loss-2.2397, acc-0.1760, test loss-2.2397, acc-0.1697
Iter-2300, train loss-2.2334, acc-0.2600, valid loss-2.2391, acc-0.2382, test loss-2.2367, acc-0.2230
Iter-2400, train loss-2.2092, acc-0.2800, valid loss-2.2314, acc-0.2442, test loss-2.2295, acc-0.2295
Iter-2500, train loss-2.2207, acc-0.2800, valid loss-2.2145, acc-0.2370, test loss-2.2202, acc-0.2191
Iter-2600, train loss-2.1736, acc-0.3000, valid loss-2.2172, acc-0.2314, test loss-2.2191, acc-0.2158
Iter-2700, train loss-2.1759, acc-0.2400, valid loss-2.2018, acc-0.2026, test loss-2.2096, acc-0.1867
Iter-2800, train loss-2.1751, acc-0.2000, valid loss-2.2301, acc-0.1966, test loss-2.2371, acc-0.1735
Iter-2900, train loss-2.1862, acc-0.1200, valid loss-2.2194, acc-0.1396, test loss-2.2296, acc-0.1285
Iter-3000, train loss-2.1854, acc-0.2400, valid loss-2.1952, acc-0.2174, test loss-2.2030, acc-0.1999
Iter-3100, train loss-2.2281, acc-0.1200, valid loss-2.1882, acc-0.2176, test loss-2.1931, acc-0.2088
Iter-3200, train loss-2.1528, acc-0.2000, valid loss-2.1735, acc-0.1712, test loss-2.1706, acc-0.1754
Iter-3300, train loss-2.1648, acc-0.2600, valid loss-2.1710, acc-0.2136, test loss-2.1733, acc-0.2147
Iter-3400, train loss-2.1363, acc-0.2400, valid loss-2.1441, acc-0.2178, test loss-2.1455, acc-0.2280
Iter-3500, train loss-2.1382, acc-0.2200, valid loss-2.1511, acc-0.2282, test loss-2.1497, acc-0.2331
Iter-3600, train loss-2.0819, acc-0.3600, valid loss-2.1284, acc-0.2698, test loss-2.1200, acc-0.2870
Iter-3700, train loss-2.2001, acc-0.2200, valid loss-2.1197, acc-0.2998, test loss-2.1127, acc-0.3080
Iter-3800, train loss-2.1264, acc-0.1800, valid loss-2.1232, acc-0.2660, test loss-2.1165, acc-0.2828
Iter-3900, train loss-2.1675, acc-0.2600, valid loss-2.1339, acc-0.2674, test loss-2.1238, acc-0.2885
Iter-4000, train loss-2.1266, acc-0.2000, valid loss-2.1232, acc-0.2728, test loss-2.1183, acc-0.2878
Iter-4100, train loss-2.0589, acc-0.3800, valid loss-2.1260, acc-0.2610, test loss-2.1210, acc-0.2718
Iter-4200, train loss-2.1399, acc-0.2200, valid loss-2.1191, acc-0.2542, test loss-2.1152, acc-0.2679
Iter-4300, train loss-2.1198, acc-0.2400, valid loss-2.1084, acc-0.2632, test loss-2.1110, acc-0.2616
Iter-4400, train loss-2.0466, acc-0.3000, valid loss-2.0959, acc-0.2690, test loss-2.1005, acc-0.2607
Iter-4500, train loss-2.1138, acc-0.3000, valid loss-2.0832, acc-0.2772, test loss-2.0914, acc-0.2765
Iter-4600, train loss-2.0874, acc-0.3200, valid loss-2.0925, acc-0.2810, test loss-2.1037, acc-0.2755
Iter-4700, train loss-2.1508, acc-0.3000, valid loss-2.0641, acc-0.3394, test loss-2.0759, acc-0.3169
Iter-4800, train loss-2.0841, acc-0.3400, valid loss-2.0674, acc-0.3494, test loss-2.0774, acc-0.3382
Iter-4900, train loss-2.0092, acc-0.3200, valid loss-2.0587, acc-0.3494, test loss-2.0690, acc-0.3347
Iter-5000, train loss-2.0488, acc-0.4000, valid loss-2.0552, acc-0.3414, test loss-2.0622, acc-0.3312
Iter-5100, train loss-2.0935, acc-0.2800, valid loss-2.0311, acc-0.3700, test loss-2.0369, acc-0.3631
Iter-5200, train loss-2.0983, acc-0.2400, valid loss-2.0252, acc-0.3554, test loss-2.0295, acc-0.3485
Iter-5300, train loss-2.0014, acc-0.3600, valid loss-2.0265, acc-0.3572, test loss-2.0267, acc-0.3480
Iter-5400, train loss-1.8729, acc-0.4600, valid loss-2.0110, acc-0.3528, test loss-2.0105, acc-0.3523
Iter-5500, train loss-1.9806, acc-0.3200, valid loss-2.0080, acc-0.3450, test loss-2.0056, acc-0.3503
Iter-5600, train loss-1.9997, acc-0.3200, valid loss-1.9880, acc-0.3458, test loss-1.9912, acc-0.3478
Iter-5700, train loss-1.9702, acc-0.3200, valid loss-1.9808, acc-0.3516, test loss-1.9835, acc-0.3474
Iter-5800, train loss-1.8745, acc-0.4000, valid loss-1.9727, acc-0.3574, test loss-1.9725, acc-0.3566
Iter-5900, train loss-2.0163, acc-0.3600, valid loss-1.9603, acc-0.3528, test loss-1.9631, acc-0.3489
Iter-6000, train loss-1.9721, acc-0.3200, valid loss-1.9580, acc-0.3326, test loss-1.9607, acc-0.3337
Iter-6100, train loss-1.9055, acc-0.3400, valid loss-1.9436, acc-0.3396, test loss-1.9513, acc-0.3275
Iter-6200, train loss-1.9780, acc-0.4200, valid loss-1.9345, acc-0.3918, test loss-1.9425, acc-0.3803
Iter-6300, train loss-1.8750, acc-0.4000, valid loss-1.9237, acc-0.3966, test loss-1.9331, acc-0.3880
Iter-6400, train loss-1.7500, acc-0.4600, valid loss-1.9121, acc-0.4022, test loss-1.9204, acc-0.3965
Iter-6500, train loss-1.8627, acc-0.4200, valid loss-1.9073, acc-0.3988, test loss-1.9134, acc-0.3940
Iter-6600, train loss-1.8358, acc-0.5200, valid loss-1.9089, acc-0.4104, test loss-1.9118, acc-0.4071
Iter-6700, train loss-1.9517, acc-0.3400, valid loss-1.9035, acc-0.4014, test loss-1.9072, acc-0.4011
Iter-6800, train loss-1.9167, acc-0.5000, valid loss-1.8810, acc-0.4100, test loss-1.8887, acc-0.4059
Iter-6900, train loss-1.8342, acc-0.3800, valid loss-1.8601, acc-0.4208, test loss-1.8685, acc-0.4144
Iter-7000, train loss-1.7869, acc-0.4600, valid loss-1.8419, acc-0.4298, test loss-1.8518, acc-0.4250
Iter-7100, train loss-1.8665, acc-0.3800, valid loss-1.8341, acc-0.4352, test loss-1.8403, acc-0.4285
Iter-7200, train loss-1.7144, acc-0.4800, valid loss-1.8141, acc-0.4520, test loss-1.8106, acc-0.4578
Iter-7300, train loss-1.6856, acc-0.5600, valid loss-1.7978, acc-0.4566, test loss-1.8012, acc-0.4596
Iter-7400, train loss-1.7163, acc-0.5200, valid loss-1.7867, acc-0.4548, test loss-1.7924, acc-0.4537
Iter-7500, train loss-1.7043, acc-0.4800, valid loss-1.7706, acc-0.4602, test loss-1.7801, acc-0.4644
Iter-7600, train loss-1.7892, acc-0.3800, valid loss-1.7565, acc-0.4466, test loss-1.7656, acc-0.4503
Iter-7700, train loss-1.7925, acc-0.4600, valid loss-1.7398, acc-0.4564, test loss-1.7488, acc-0.4643
Iter-7800, train loss-1.8542, acc-0.3600, valid loss-1.7311, acc-0.4646, test loss-1.7338, acc-0.4665
Iter-7900, train loss-1.7976, acc-0.4400, valid loss-1.7225, acc-0.4750, test loss-1.7261, acc-0.4793
Iter-8000, train loss-1.7185, acc-0.5200, valid loss-1.7156, acc-0.4884, test loss-1.7210, acc-0.4836
Iter-8100, train loss-1.6493, acc-0.5000, valid loss-1.7075, acc-0.4848, test loss-1.7075, acc-0.4845
Iter-8200, train loss-1.6501, acc-0.5800, valid loss-1.6947, acc-0.4828, test loss-1.6949, acc-0.4828
Iter-8300, train loss-1.7999, acc-0.4400, valid loss-1.6859, acc-0.4830, test loss-1.6874, acc-0.4852
Iter-8400, train loss-1.6111, acc-0.5800, valid loss-1.6810, acc-0.4860, test loss-1.6798, acc-0.4839
Iter-8500, train loss-1.7677, acc-0.4200, valid loss-1.6771, acc-0.4834, test loss-1.6771, acc-0.4830
Iter-8600, train loss-1.6909, acc-0.4400, valid loss-1.6732, acc-0.4902, test loss-1.6767, acc-0.4890
Iter-8700, train loss-1.7305, acc-0.4600, valid loss-1.6660, acc-0.4842, test loss-1.6663, acc-0.4863
Iter-8800, train loss-1.5591, acc-0.5600, valid loss-1.6693, acc-0.4730, test loss-1.6690, acc-0.4695
Iter-8900, train loss-1.6739, acc-0.5000, valid loss-1.6537, acc-0.4790, test loss-1.6554, acc-0.4744
Iter-9000, train loss-1.5202, acc-0.5800, valid loss-1.6376, acc-0.4890, test loss-1.6398, acc-0.4920
Iter-9100, train loss-1.6669, acc-0.4000, valid loss-1.6220, acc-0.4878, test loss-1.6277, acc-0.4897
Iter-9200, train loss-1.6539, acc-0.4600, valid loss-1.6224, acc-0.4872, test loss-1.6261, acc-0.4859
Iter-9300, train loss-1.5555, acc-0.5000, valid loss-1.6116, acc-0.4934, test loss-1.6140, acc-0.4927
Iter-9400, train loss-1.5350, acc-0.4800, valid loss-1.5988, acc-0.4946, test loss-1.6054, acc-0.4905
Iter-9500, train loss-1.7059, acc-0.4200, valid loss-1.5889, acc-0.4940, test loss-1.5962, acc-0.4935
Iter-9600, train loss-1.3970, acc-0.5600, valid loss-1.5850, acc-0.4948, test loss-1.5881, acc-0.4957
Iter-9700, train loss-1.7087, acc-0.4400, valid loss-1.5761, acc-0.4894, test loss-1.5819, acc-0.4878
Iter-9800, train loss-1.5360, acc-0.4800, valid loss-1.5742, acc-0.4944, test loss-1.5780, acc-0.4932
Iter-9900, train loss-1.4272, acc-0.5000, valid loss-1.5752, acc-0.4924, test loss-1.5760, acc-0.4954
Iter-10000, train loss-1.3008, acc-0.6000, valid loss-1.5698, acc-0.4866, test loss-1.5706, acc-0.4957
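
Since accuracy alone hides which digits get mixed up, a full confusion matrix on the test set can be built from the trained model; a minimal sketch assuming nn, X_test, and y_test from the cells above:

y_pred, _ = nn.test(X_test)
conf = np.zeros((nn.C, nn.C), dtype=int)
for t, p in zip(y_test, y_pred):
    conf[t, p] += 1          # rows: true digit, columns: predicted digit

print(conf)
print('test accuracy:', np.trace(conf) / conf.sum())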

In [5]:
# Display the learning curves (losses) for training, validation, and testing
# %matplotlib inline
# %config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt

plt.plot(nn.losses['train'], label='Train loss')
plt.plot(nn.losses['valid'], label='Valid loss')
plt.plot(nn.losses['test'], label='Test loss')
plt.legend()
plt.show()



In [6]:
plt.plot(nn.losses['train_acc'], label='Train accuracy')
plt.plot(nn.losses['valid_acc'], label='Valid accuracy')
plt.plot(nn.losses['test_acc'], label='Test accuracy')
plt.legend()
plt.show()
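
Each point on the training curves comes from a single 50-example minibatch, so they are noisy; a simple moving average (the window length below is an arbitrary choice) makes the trend easier to compare against the validation curve:

window = 200  # arbitrary smoothing window, in iterations
train_acc_smooth = np.convolve(nn.losses['train_acc'], np.ones(window) / window, mode='valid')

plt.plot(train_acc_smooth, label='Train accuracy (smoothed)')
plt.plot(nn.losses['valid_acc'], label='Valid accuracy')
plt.legend()
plt.show()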



In [ ]: