In [1]:
# This is mostly implemented by Kristiadi, who was a student of the Stanford course by Fei-Fei Li, Karpathy, and Justin Johnson.
# The Stanford assignment on CNNs and image classification is very similar to this, and even more complete.
# In this sense it is a great learning resource, with different implementations in NumPy, Cython, TensorFlow, etc.

In [2]:
# Data
import numpy as np
from tensorflow.examples.tutorials.mnist import input_data
import impl.layer as l

# Dataset preparation and pre-processing
mnist = input_data.read_data_sets('/home/arasdar/datasets/MNIST_data/', one_hot=False)

X_train, y_train = mnist.train.images, mnist.train.labels
X_val, y_val = mnist.validation.images, mnist.validation.labels
X_test, y_test = mnist.test.images, mnist.test.labels
y_test.shape, y_val.shape, y_train.shape


WARNING:tensorflow:From <ipython-input-2-2befda06e953>:7: read_data_sets (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
WARNING:tensorflow:From /home/arasdar/anaconda3/envs/env/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/mnist.py:260: maybe_download (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Please write your own downloading logic.
WARNING:tensorflow:From /home/arasdar/anaconda3/envs/env/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/mnist.py:262: extract_images (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting /home/arasdar/datasets/MNIST_data/train-images-idx3-ubyte.gz
WARNING:tensorflow:From /home/arasdar/anaconda3/envs/env/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/mnist.py:267: extract_labels (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting /home/arasdar/datasets/MNIST_data/train-labels-idx1-ubyte.gz
Extracting /home/arasdar/datasets/MNIST_data/t10k-images-idx3-ubyte.gz
Extracting /home/arasdar/datasets/MNIST_data/t10k-labels-idx1-ubyte.gz
WARNING:tensorflow:From /home/arasdar/anaconda3/envs/env/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/mnist.py:290: DataSet.__init__ (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Out[2]:
((10000,), (5000,), (55000,))

In [3]:
def get_im2col_indices(X_shape, field_height, field_width, padding=1, stride=1):
    # First figure out what the size of the output should be
    # Input shape
    N, C, H, W = X_shape
    
    # Kernel shape
    # field_height, field_width = kernel_shape
    field_C = C
    
    # Output shape
    assert (H + (2 * padding) - field_height) % stride == 0
    assert (W + (2 * padding) - field_width) % stride == 0
    out_height = int(((H + (2 * padding) - field_height) / stride) + 1)
    out_width = int(((W + (2 * padding) - field_width) / stride) + 1)
    out_C = 1 # the output channel/ depth

    # Row, Height, i
    i0 = np.repeat(np.arange(field_height), field_width)
    i0 = np.tile(i0, field_C)
    i1 = np.repeat(np.arange(out_height), out_width)
    i1 = np.tile(i1, out_C)
    i1 *= stride
    
    # Column, Width, j
    j0 = np.tile(np.arange(field_width), field_height * field_C)
    j1 = np.tile(np.arange(out_width), out_height * out_C)
    j1 *= stride
    
    # Channel, Depth, K
    k0 = np.repeat(np.arange(field_C), field_height * field_width) #.reshape(-1, 1) # out_C = 1
    k1 = np.repeat(np.arange(out_C), out_height * out_width) #.reshape(-1, 1) # out_C = 1
    k1 *= stride
    
    # Indices: i, j, k index
    i = i0.reshape(-1, 1) + i1.reshape(1, -1)
    j = j0.reshape(-1, 1) + j1.reshape(1, -1)
    k = k0.reshape(-1, 1) + k1.reshape(1, -1)
    
    return (k.astype(int), i.astype(int), j.astype(int))
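
In [ ]:
# Sanity check (added; not in the original notebook): for a hypothetical 1x1x4x4 input
# and a 3x3 field with padding=1, stride=1, the index arrays should each have
# kernel_size = C*3*3 = 9 rows and out_height*out_width = 16 columns.
k_chk, i_chk, j_chk = get_im2col_indices((1, 1, 4, 4), field_height=3, field_width=3, padding=1, stride=1)
k_chk.shape, i_chk.shape, j_chk.shape # expected: ((9, 16), (9, 16), (9, 16))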

In [4]:
def im2col_indices(X, field_height, field_width, padding=1, stride=1):
    """ An implementation of im2col based on some fancy indexing """
    # Zero-pad the input
    p = padding
    X_padded = np.pad(X, ((0, 0), (0, 0), (p, p), (p, p)), mode='constant') # X_NxCxHxW

    k, i, j = get_im2col_indices(X.shape, field_height, field_width, padding, stride)

    X_col = X_padded[:, k, i, j] # X_col_txkxn
    
    N, C, H, W = X.shape
    
    # field_height, field_width = kernel_shape
    field_C = C # x.shape[1]
    kernel_size = field_C * field_height * field_width
    
    X_col = X_col.transpose(1, 2, 0).reshape(kernel_size, -1)
    
    return X_col
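
In [ ]:
# Sanity check (added): im2col of a 2x1x4x4 batch with a 3x3 field, padding=1, stride=1
# should give a matrix of shape (C*3*3, N*out_height*out_width) = (9, 2*16) = (9, 32),
# i.e. one column per receptive-field location per image.
X_chk = np.arange(2 * 1 * 4 * 4, dtype=np.float64).reshape(2, 1, 4, 4)
im2col_indices(X_chk, field_height=3, field_width=3, padding=1, stride=1).shape # expected: (9, 32)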

In [5]:
def col2im_indices(X_col, X_shape, field_height=3, field_width=3, padding=1, stride=1):
    """ An implementation of col2im based on fancy indexing and np.add.at """
    N, C, H, W = X_shape
    H_padded, W_padded = H + (2 * padding), W + (2 * padding)
    X_padded = np.zeros((N, C, H_padded, W_padded), dtype=X_col.dtype)
    
    k, i, j = get_im2col_indices(X_shape, field_height, field_width, padding, stride)

    # field_height, field_width = kernel_shape
    field_C = C # x.shape[1]
    kernel_size = field_C * field_height * field_width

    X_col = X_col.reshape(kernel_size, -1, N).transpose(2, 0, 1) # N, K, H * W
    np.add.at(X_padded, (slice(None), k, i, j), X_col) # slice(None)== ':'
    # # np.add.at(a, (slice(None), 0, 1, 2), cols_reshaped) # slice(None)== ':'
    
    if padding > 0:
        X = X_padded[:, :, padding:-padding, padding:-padding]
    else:
        X = X_padded[:, :, :, :]
        
    return X
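
In [ ]:
# Sanity check (added): col2im_indices is not the exact inverse of im2col_indices,
# because overlapping receptive fields are summed back together by np.add.at.
# With a 1x1 field and no padding there is no overlap, so the round trip is exact.
X_chk = np.random.randn(2, 3, 4, 4)
X_col_chk = im2col_indices(X_chk, field_height=1, field_width=1, padding=0, stride=1)
X_back = col2im_indices(X_col_chk, X_chk.shape, field_height=1, field_width=1, padding=0, stride=1)
np.allclose(X_chk, X_back) # expected: True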

In [6]:
def conv_forward(X, W, b, stride=1, padding=1):
    
    # Input X
    n_x, d_x, h_x, w_x = X.shape
    
    # Kernel W
    n_filter, d_filter, h_filter, w_filter = W.shape
    
    # Output
    h_out = ((h_x + (2 * padding) - h_filter) / stride) + 1
    w_out = ((w_x + (2 * padding) - w_filter) / stride) + 1

    if not h_out.is_integer() or not w_out.is_integer():
        raise Exception('Invalid output dimension!')

    h_out, w_out = int(h_out), int(w_out)

    X_col = im2col_indices(X, h_filter, w_filter, padding=padding, stride=stride)
    W_col = W.reshape(n_filter, -1)

    out = (W_col @ X_col) + b
    out = out.reshape(n_filter, h_out, w_out, n_x).transpose(3, 0, 1, 2)
    cache = (X, W, b, stride, padding, X_col)

    return out, cache
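
In [ ]:
# Sanity check (added): with 5 filters of shape 1x3x3, padding=1 and stride=1
# preserve the spatial size, so (N, 1, 28, 28) maps to (N, 5, 28, 28).
X_chk = np.random.randn(2, 1, 28, 28)
W_chk = np.random.randn(5, 1, 3, 3) * 0.01
b_chk = np.zeros((5, 1))
out_chk, _ = conv_forward(X_chk, W_chk, b_chk, stride=1, padding=1)
out_chk.shape # expected: (2, 5, 28, 28)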

In [7]:
def conv_backward(dout, cache):
    X, W, b, stride, padding, X_col = cache
    n_filter, d_filter, h_filter, w_filter = W.shape

    db = np.sum(dout, axis=(0, 2, 3))
    db = db.reshape(n_filter, -1)

    dout = dout.transpose(1, 2, 3, 0).reshape(n_filter, -1)
    dW = dout @ X_col.T
    dW = dW.reshape(W.shape)

    W = W.reshape(n_filter, -1)
    dX_col = W.T @ dout
    dX = col2im_indices(dX_col, X.shape, h_filter, w_filter, padding=padding, stride=stride)

    return dX, dW, db
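
In [ ]:
# Numerical gradient check (added; not in the original notebook): compare the analytic dW
# from conv_backward against a centered finite difference of sum(conv_forward(X, W, b))
# with respect to a single, arbitrarily chosen weight entry.
np.random.seed(0)
X_chk = np.random.randn(2, 1, 5, 5)
W_chk = np.random.randn(3, 1, 3, 3)
b_chk = np.random.randn(3, 1)

out_chk, cache_chk = conv_forward(X_chk, W_chk, b_chk, stride=1, padding=1)
dout_chk = np.ones_like(out_chk) # d(sum(out))/d(out) = 1
_, dW_chk, _ = conv_backward(dout_chk, cache_chk)

eps_chk = 1e-5
idx = (1, 0, 2, 1) # an arbitrary weight entry
W_plus, W_minus = W_chk.copy(), W_chk.copy()
W_plus[idx] += eps_chk
W_minus[idx] -= eps_chk
num_grad = (conv_forward(X_chk, W_plus, b_chk)[0].sum()
            - conv_forward(X_chk, W_minus, b_chk)[0].sum()) / (2 * eps_chk)
dW_chk[idx], num_grad # the two values should agree closely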

In [12]:
# Pre-processing
def prepro(X_train, X_val, X_test):
    mean = np.mean(X_train)
    # scale = 255. - mean # std or sqrt(var), 255 == 2**8 or 8 bit grayscale
    # return (X_train - mean)/ scale, (X_val - mean)/ scale, (X_test - mean) / scale
    return X_train - mean, X_val - mean, X_test - mean
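
In [ ]:
# Quick check (added): mean-centering with the training-set statistic should bring the
# average training pixel close to zero; the same mean is reused for the validation and
# test splits so that all three are shifted consistently.
X_train_c, _, _ = prepro(X_train, X_val, X_test)
X_train_c.mean() # expected: approximately 0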

In [9]:
M, D, C = X_train.shape[0], X_train.shape[1], y_train.max() + 1
X_train, X_val, X_test = prepro(X_train, X_val, X_test)
img_shape = (1, 28, 28) # single-channel 28x28 images
X_train = X_train.reshape(-1, *img_shape) # (-1, img_shape[0], img_shape[1], img_shape[2])
X_val = X_val.reshape(-1, *img_shape)
X_test = X_test.reshape(-1, *img_shape)
X_train.shape, X_val.shape, X_test.shape
# X_train[0, :10, :10, :10]
M, D, C


Out[9]:
(55000, 784, 10)

In [10]:
# Model
import impl.layer as l # or from impl.layer import *
from impl.loss import * # import all functions from impl.loss file # import impl.loss as loss_func
from sklearn.utils import shuffle as skshuffle

class CNN:

    def __init__(self, D, C, H, L, p_dropout, lam):
        self.mode = 'classification'
        self.L = L # number of layers or depth
        self.p_dropout = p_dropout
        self.lam = lam
        self.losses = {'train':[], 'smooth train':[], 'valid':[], 'valid_acc':[]}
        
        # Model parameters: weights and biases
        # Input layer of Conv
        self.model = []
        self.model.append(dict(
            W1=np.random.randn(H, 1, 3, 3) / np.sqrt(H / 2.),
            b1=np.zeros((H, 1)),
        ))
        
        # Hidden layers of conv -> SELU -> dropout (with residual connections)
        m = []
        for _ in range(self.L):
            m.append(dict(
                W2=np.random.randn(H, H, 3, 3) / np.sqrt(H / 2.),
                b2=np.zeros((H, 1)),
            ))
        self.model.append(m) # self.model[0][]
        
        # Output layer: FC mapping the flattened features to C classes
        self.model.append(dict(
            W3=np.random.randn(H*D, C) / np.sqrt(H*D / 2.),
            b3=np.zeros((1, C))
        ))

    def forward(self, X, train):
        # 1st layer - Input layer: X
        X, X_conv_cache = conv_forward(X=X, W=self.model[0]['W1'], b=self.model[0]['b1'])
        X_cache = X_conv_cache

        # 2nd layers - Hidden layers: h
        # Note: as written, each hidden layer takes the first conv's output X as its input
        # (the hidden layers are not chained); only the last layer's h reaches the output layer.
        h_cache = []
        for layer in range(self.L):
            h, h_conv_cache = conv_forward(X=X, W=self.model[1][layer]['W2'], b=self.model[1][layer]['b2'])
            h, h_nl_cache = l.selu_forward(X=h)
            h += X # residual connection
            if train: 
                # h_do_cache exists only in training mode; backward() is only called during training, so it is always defined there
                h, h_do_cache = l.selu_dropout_forward(h=h, q=self.p_dropout)
                cache = (h_conv_cache, h_nl_cache, h_do_cache)
            else:
                cache = (h_conv_cache, h_nl_cache)
            h_cache.append(cache)
            
        # 3rd layer - Output layer: y
        y = h.reshape([X.shape[0], -1]) # flattening
        y, y_fc_cache = l.fc_forward(X=y, W=self.model[2]['W3'], b=self.model[2]['b3'])
        y_cache = X, y_fc_cache

        cache = (X_cache, h_cache, y_cache)
        
        return y, cache

    def loss_function(self, y, y_train):
        loss = cross_entropy_reg(self.model[2], y, y_train, lam=self.lam)
        dy = dcross_entropy(y, y_train)
        return loss, dy
    
    def backward(self, dy, cache):
        X_cache, h_cache, y_cache = cache

        # 3rd layer: Output layer y
        X, y_fc_cache = y_cache
        dy, dw3, db3 = l.fc_backward(dout=dy, cache=y_fc_cache)
        dy = dy.reshape([-1, *X.shape[1:4]])
        
        # 2nd layers: Hidden layers h
        g = []
        for layer in reversed(range(self.L)):
            # if train: There is no backward in testing/prediction
            h_conv_cache, h_nl_cache, h_do_cache = h_cache[layer]
            dy = l.selu_dropout_backward(dout=dy, cache=h_do_cache)
            dh = l.selu_backward(dout=dy, cache=h_nl_cache)
            dh, dw2, db2 = conv_backward(dout=dh, cache=h_conv_cache)
            dh += dy
            g.append(dict(
                    W2=dw2,
                    b2=db2
                    ))
            
        # 1st layer: Input layer X
        X_conv_cache = X_cache
        dX, dw1, db1 = conv_backward(dout=dh, cache=X_conv_cache)
        # dX (the gradient with respect to the input, i.e. the input delta/error) is not used here.
        # In principle it could be treated as a noise signal and added back to the data
        # when training data is not abundantly available, but that is not done in this notebook.

        # grad for GD
        grad = []
        
        # Input layer to conv layer
        grad.append(dict(
            W1=dw1, 
            b1=db1
        ))
        
        # Hidden layers (conv-SELU-dropout)
        grad.append(g)
        
        # Output FC layer
        grad.append(dict(
            W3=dw3, 
            b3=db3
        ))
        
        return dX, grad
    
    def test(self, X):
        y_logit, cache = self.forward(X, train=False)
        y_prob = l.softmax(y_logit)
        # if self.mode == 'classification':
        y_pred = np.argmax(y_prob, axis=1)
        # else: # self.mode == 'regression'
        # return np.round(y_logit)
        # y_prob for accuracy & y_logit for loss
        return y_pred, y_logit
        
    def get_minibatch(self, X, y, minibatch_size, shuffle):
        minibatches = []

        if shuffle:
            X, y = skshuffle(X, y)

        for i in range(0, X.shape[0], minibatch_size):
            X_mini = X[i:i + minibatch_size]
            y_mini = y[i:i + minibatch_size]
            minibatches.append((X_mini, y_mini))

        return minibatches

    def adam(self, train_set, val_set, alpha, mb_size, n_iter, print_after):
        X_train, y_train = train_set
        # if val_set:
        X_val, y_val = val_set

        M, R = [], []
        M.append({key: np.zeros_like(val) for key, val in self.model[0].items()})
        R.append({key: np.zeros_like(val) for key, val in self.model[0].items()})

        M_, R_ = [], []
        for layer in range(self.L):
            M_.append({key: np.zeros_like(val) for key, val in self.model[1][layer].items()})
            R_.append({key: np.zeros_like(val) for key, val in self.model[1][layer].items()})
        M.append(M_)
        R.append(R_)

        M.append({key: np.zeros_like(val) for key, val in self.model[2].items()})
        R.append({key: np.zeros_like(val) for key, val in self.model[2].items()})

        # Adam decay rates for the running moment estimates
        beta1 = .9
        beta2 = .99
        smooth_train = 1.

        # Epochs
        for iter in range(1, n_iter + 1):

            # Train the model
            # Single training step over one minibatch: forward, loss, backprop.
            # The minibatches are re-shuffled every iteration for stochasticity;
            # the dataset is static and non-sequential (no temporal pattern), and
            # one random minibatch is drawn per iteration.
            minibatches = self.get_minibatch(X_train, y_train, mb_size, shuffle=True)
            idx = np.random.randint(0, len(minibatches))
            X_mini, y_mini = minibatches[idx]
            y, cache = self.forward(X_mini, train=True)
            loss, dy = self.loss_function(y, y_mini)
            _, grad = self.backward(dy, cache)
            self.losses['train'].append(loss)
            smooth_train = (0.999 * smooth_train) + (0.001 * loss)
            self.losses['smooth train'].append(smooth_train)
            
            # Update the model
            for key in grad[0].keys():
                M[0][key] = l.exp_running_avg(M[0][key], grad[0][key], beta1)
                R[0][key] = l.exp_running_avg(R[0][key], grad[0][key]**2, beta2)

                m_k_hat = M[0][key] / (1. - (beta1**(iter)))
                r_k_hat = R[0][key] / (1. - (beta2**(iter)))

                self.model[0][key] -= alpha * m_k_hat / (np.sqrt(r_k_hat) + l.eps)

            for layer in range(self.L):
                for key in grad[1][layer].keys():
                    M[1][layer][key] = l.exp_running_avg(M[1][layer][key], grad[1][layer][key], beta1)
                    R[1][layer][key] = l.exp_running_avg(R[1][layer][key], grad[1][layer][key]**2, beta2)

                    m_k_hat = M[1][layer][key] / (1. - (beta1**(iter)))
                    r_k_hat = R[1][layer][key] / (1. - (beta2**(iter)))

                    self.model[1][layer][key] -= alpha * m_k_hat / (np.sqrt(r_k_hat) + l.eps)

            for key in grad[2].keys():
                M[2][key] = l.exp_running_avg(M[2][key], grad[2][key], beta1)
                R[2][key] = l.exp_running_avg(R[2][key], grad[2][key]**2, beta2)

                m_k_hat = M[2][key] / (1. - (beta1**(iter)))
                r_k_hat = R[2][key] / (1. - (beta2**(iter)))

                self.model[2][key] -= alpha * m_k_hat / (np.sqrt(r_k_hat) + l.eps)

            # Test the updated model to validate the model
            # Guard against overfitting (memorizing) and underfitting (insufficient model capacity)
            # if val_set:
            y_pred, y_logit = self.test(X_val)
            valid_loss, _ = self.loss_function(y_logit, y_val) # softmax is included in entropy loss function
            self.losses['valid'].append(valid_loss)
            # def accuracy(y_true, y_pred):
            # return np.mean(y_pred == y_true)
            valid_acc = np.mean(y_pred == y_val)
            self.losses['valid_acc'].append(valid_acc)
            
            # Print the model info: loss & accuracy or err & acc
            if iter % print_after == 0:
                print('Iter-{} train loss: {:.4f} valid loss: {:.4f}, valid accuracy: {:.4f}'.format(
                    iter, loss, valid_loss, valid_acc))
                
        # Test the model after training and validation after all the epochs
        # The test data has NOT been used before and has NOT been seen by the model
        # (Note: the Jupyter kernel occasionally dies at this step.)
        y_pred, _ = self.test(X_test) # X_test and y_test are taken from the global scope here
        acc = np.mean(y_pred == y_test)
        print('Last iteration - Test accuracy mean: {:.4f}, std: {:.4f}'.format(acc.mean(), acc.std()))
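
In [ ]:
# A minimal sketch (added; not in the original notebook) of the bias-corrected Adam step
# used in CNN.adam above, written out for a single parameter. It assumes that
# l.exp_running_avg(avg, x, beta) computes beta * avg + (1 - beta) * x and that l.eps is
# a small constant (both come from impl.layer); the function below is only illustrative.
def adam_step_sketch(param, grad, m, r, t, alpha=1e-4, beta1=0.9, beta2=0.99, eps=1e-8):
    m = beta1 * m + (1. - beta1) * grad # first-moment (mean) running average
    r = beta2 * r + (1. - beta2) * grad**2 # second-moment (uncentered variance) running average
    m_hat = m / (1. - beta1**t) # bias correction, t starts at 1
    r_hat = r / (1. - beta2**t)
    param = param - alpha * m_hat / (np.sqrt(r_hat) + eps) # parameter update
    return param, m, r

# Example: one step on a scalar parameter
adam_step_sketch(param=0.5, grad=0.1, m=0.0, r=0.0, t=1)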

In [11]:
# Hyper-parameters
n_iter = 10 # number of epochs
alpha = 1e-4 # learning_rate
mb_size = 64 # minibatch size (for sequential data this would play the role of the timestep/width)
num_layers = 10 # depth 
print_after = 1 # n_iter//10 # print loss for train, valid, and test
num_hidden_units = 10 # number of kernels/ filters in each layer
num_input_units = D # input dimensionality; input-level noise (e.g. dX) could be added here for further improvement
num_output_units = C # number of classes in this classification problem
p_dropout = 0.95 # keep probability q = 1 - p; dropout acts as noise at the network/layer level, typically 0.90-0.95
lam = 1e-3 # L2 regularization strength; acts as a noise/penalty term added at the loss-function level

# Build the model/NN and learn it: running session.
nn = CNN(C=num_output_units, D=num_input_units, H=num_hidden_units, p_dropout=p_dropout, L=num_layers, lam=lam)

nn.adam(train_set=(X_train, y_train), val_set=(X_val, y_val), mb_size=mb_size, alpha=alpha, 
           n_iter=n_iter, print_after=print_after)


Iter-1 train loss: 4.4526 valid loss: 4.3208, valid accuracy: 0.1414
Iter-2 train loss: 4.4631 valid loss: 3.9129, valid accuracy: 0.1614
Iter-3 train loss: 3.7748 valid loss: 3.5714, valid accuracy: 0.1886
Iter-4 train loss: 3.9628 valid loss: 3.2501, valid accuracy: 0.2120
Iter-5 train loss: 2.8623 valid loss: 2.9981, valid accuracy: 0.2346
Iter-6 train loss: 3.0711 valid loss: 2.8140, valid accuracy: 0.2554
Iter-7 train loss: 2.9600 valid loss: 2.6584, valid accuracy: 0.2770
Iter-8 train loss: 2.8835 valid loss: 2.5278, valid accuracy: 0.2958
Iter-9 train loss: 2.4488 valid loss: 2.3992, valid accuracy: 0.3186
Iter-10 train loss: 2.4413 valid loss: 2.2714, valid accuracy: 0.3394
Last iteration - Test accuracy mean: 0.3515, std: 0.0000

In [13]:
# Display the learning curves (losses) for training and validation
# %matplotlib inline
# %config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt

plt.plot(nn.losses['train'], label='Train loss')
# plt.plot(nn.losses['smooth train'], label='Train smooth loss')
plt.plot(nn.losses['valid'], label='Valid loss')
plt.legend()
plt.show()



In [14]:
plt.plot(nn.losses['valid_acc'], label='Valid accuracy')
plt.legend()
plt.show()



In [ ]: