In [1]:
# Just various neural networks and associated techniques in a modular fashion utilizing matrix notation.
# Nothing groundbreaking; just for exercising.

In [2]:
import numpy as np

In [14]:
SIGMA_INIT = 0.001

class Activation(object):
    def __init__(self, f, f_deriv):
        self.f = f
        self.f_deriv = f_deriv

class Layer(object):
    def __init__(self, n, activation, bias):
        self.n = n
        self.W = None
        self.act = activation
        self.bias = bias

class Loss(object):
    def __init__(self, loss_function, loss_function_deriv):
        self.f = loss_function
        self.f_deriv = loss_function_deriv

class Trainer(object):
    def __init__(self, learning_rate):
        self.lr = learning_rate
    def update(self):
        raise Exception("Use implementing subclass.")

class SGD(Trainer):
    def __init__(self, learning_rate, momentum):
        super(SGD, self).__init__(learning_rate)
        self.momentum = momentum
    def update(self):
        for i, layer in enumerate(self.model.layers):
            if self.deltas[i] != None:
                layer.W -= self.momentum * self.deltas[i]
            # consider bias
            imax = layer.W.shape[0]
            delta = self.lr * np.dot(layer.deltas[:imax], layer._in.T)
            delta /= layer.deltas.shape[1] 
            layer.W -= delta
            self.deltas[i] = delta
        
# TODO: Change to in place operations for optimization where possible.
class Model(object):
    def __init__(self, n_in):
        self.n_in = n_in
        self.layers = []
        self.loss = None
        self.trainer = None
    def add(self, layer):
        self.layers.append(layer)
    def prepare(self, loss, trainer):
        self.nl = len(self.layers)
        assert self.nl > 0
        n_in = self.n_in
        for i in xrange(self.nl):
            layer = self.layers[i]
            if layer.bias:
                n_in += 1
            layer.W = SIGMA_INIT * np.random.randn(layer.n, n_in)
            n_in = layer.n
        self.loss = loss
        self.trainer = trainer
        trainer.model = self
        trainer.deltas = [None for i in xrange(len(self.layers))]
    def forward(self, X):
        buf = X.T
        for i in xrange(self.nl):
            layer = self.layers[i]
            if layer.bias:
                buf = np.append(buf, np.ones((1, buf.shape[1])), axis=0)
            layer._in = buf.copy()
            buf = np.dot(layer.W, buf)
            layer._out = buf.copy()
            buf = layer.act.f(buf)
        return buf.T
    def backward(self, Y_pred, Y_target):
        assert self.loss
        assert self.trainer
        prev = None
        for i in xrange(self.nl - 1, -1, -1):
            layer = self.layers[i]
            if not prev:
                err = self.loss.f_deriv(Y_pred, Y_target).T
            else:
                err = np.dot(prev.W.T, prev.deltas)
            # consider bias
            if layer._out.shape[0] < err.shape[0]:
                out = np.append(layer._out, np.ones((1, err.shape[1])), axis=0)
            else:
                out = layer._out
            err *= layer.act.f_deriv(out)
            err = np.clip(err, -500, 500)
            layer.deltas = err.copy()
            prev = layer
    def _forward_backward(self, X, Y_target):
        Y_pred = self.forward(X)
        self.backward(Y_pred, Y_target)
        self.trainer.update()
    def train_epoch(self, X, Y, batch_size=1, shuffle=True):
        if shuffle:
            p = np.random.permutation(X.shape[0])
            X = X[p]
            Y = Y[p]
        if batch_size < 1:
            batch_size = X.shape[0]
        for i in xrange(0, X.shape[0], batch_size):
            self._forward_backward(X[i:min(i+batch_size,X.shape[0])], Y[i:min(i+batch_size,X.shape[0])])

In [21]:
# MLP - Regression

N_IN = 2
N_HIDDEN = 4
N_OUT = 1
N_SAMPLES = 2000
LEARNING_RATE = 0.05
MOMENTUM = 0.9
N_EPOCHS = 300
BATCH_SIZE = 1

X = np.random.rand(N_SAMPLES, N_IN) * 0.5
Y = np.zeros((N_SAMPLES, N_OUT))
Y = np.sum(X, axis=1)**2
Y = Y[:,np.newaxis]
print Y.shape

print X[0],Y[0],Y[0]-(X[0,0]+X[0,1])**2

thresh = int(0.8 * N_SAMPLES)
X_train = X[:thresh]
Y_train = Y[:thresh]
X_test = X[thresh:]
Y_test = Y[thresh:]

def sigmoid(x):
    val = 1. / (1. + np.exp(-x))
    return np.clip(val, -500, 500)
    
logistic = Activation(lambda X: sigmoid(X), lambda Y: sigmoid(Y) * (1. - sigmoid(Y)))
identity = Activation(lambda X: X, lambda Y: Y)

mse = Loss(
    lambda Y_pred, Y_target: np.sum((Y_pred - Y_target)**2, axis=1), 
    lambda Y_pred, Y_target: Y_pred - Y_target
)

sgd = SGD(LEARNING_RATE, MOMENTUM)

hidden = Layer(N_HIDDEN, logistic, bias=True)
hidden2 = Layer(N_HIDDEN/2, logistic, bias=True)
output = Layer(N_OUT, identity, bias=False)

model = Model(N_IN)
model.add(hidden)
model.add(hidden2)
model.add(output)
model.prepare(loss=mse, trainer=sgd)

for i in range(N_EPOCHS):
    model.train_epoch(X_train, Y_train, batch_size=BATCH_SIZE, shuffle=True)
    Y_train_pred = model.forward(X_train)
    Y_test_pred = model.forward(X_test)
    if not (i+1) % 10:
        print "Epoch %5i" % (i+1), "-", "Train Loss: %5.3f" % np.sum(mse.f(Y_train_pred, Y_train), axis=0), \
            "Test  Loss: %5.3f" % np.sum(mse.f(Y_test_pred, Y_test), axis=0)
        
print X_test[:5]
print Y_test[:5]
Y_pred = model.forward(X_test[:5])
print Y_pred
print X_train[:5]
print Y_train[:5]
Y_pred = model.forward(X_train[:5])
print Y_pred


(2000L, 1L)
[ 0.46607615  0.39754986] [ 0.74584989] [ 0.]
C:\Anaconda\lib\site-packages\ipykernel\__main__.py:32: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.
Epoch    10 - Train Loss: 70.666 Test  Loss: 15.889
Epoch    20 - Train Loss: 70.993 Test  Loss: 15.957
Epoch    30 - Train Loss: 70.663 Test  Loss: 15.879
Epoch    40 - Train Loss: 70.506 Test  Loss: 15.845
Epoch    50 - Train Loss: 70.534 Test  Loss: 15.858
Epoch    60 - Train Loss: 70.309 Test  Loss: 15.800
Epoch    70 - Train Loss: 69.987 Test  Loss: 15.736
Epoch    80 - Train Loss: 68.875 Test  Loss: 15.483
Epoch    90 - Train Loss: 65.688 Test  Loss: 14.776
Epoch   100 - Train Loss: 56.014 Test  Loss: 12.635
Epoch   110 - Train Loss: 35.479 Test  Loss: 8.267
Epoch   120 - Train Loss: 10.098 Test  Loss: 2.389
Epoch   130 - Train Loss: 2.819 Test  Loss: 0.653
Epoch   140 - Train Loss: 1.269 Test  Loss: 0.279
Epoch   150 - Train Loss: 0.880 Test  Loss: 0.187
Epoch   160 - Train Loss: 0.765 Test  Loss: 0.160
Epoch   170 - Train Loss: 0.705 Test  Loss: 0.147
Epoch   180 - Train Loss: 0.683 Test  Loss: 0.142
Epoch   190 - Train Loss: 0.670 Test  Loss: 0.140
Epoch   200 - Train Loss: 0.643 Test  Loss: 0.134
Epoch   210 - Train Loss: 0.626 Test  Loss: 0.130
Epoch   220 - Train Loss: 0.612 Test  Loss: 0.127
Epoch   230 - Train Loss: 0.601 Test  Loss: 0.124
Epoch   240 - Train Loss: 0.585 Test  Loss: 0.121
Epoch   250 - Train Loss: 0.568 Test  Loss: 0.117
Epoch   260 - Train Loss: 0.555 Test  Loss: 0.115
Epoch   270 - Train Loss: 0.543 Test  Loss: 0.112
Epoch   280 - Train Loss: 0.531 Test  Loss: 0.109
Epoch   290 - Train Loss: 0.522 Test  Loss: 0.108
Epoch   300 - Train Loss: 0.509 Test  Loss: 0.105
[[ 0.33017932  0.10601213]
 [ 0.20585817  0.30130635]
 [ 0.40879558  0.00048482]
 [ 0.13491069  0.25447976]
 [ 0.34795987  0.00489321]]
[[ 0.19026298]
 [ 0.25721585]
 [ 0.16751044]
 [ 0.15162492]
 [ 0.12450529]]
[[ 0.19567597]
 [ 0.25526397]
 [ 0.17633012]
 [ 0.16369944]
 [ 0.14149953]]
[[ 0.46607615  0.39754986]
 [ 0.07040523  0.31275902]
 [ 0.03945223  0.30702401]
 [ 0.19582596  0.31092975]
 [ 0.23823065  0.108423  ]]
[[ 0.74584989]
 [ 0.14681484]
 [ 0.12004578]
 [ 0.25680134]
 [ 0.12016876]]
[[ 0.7505005 ]
 [ 0.1598782 ]
 [ 0.13843041]
 [ 0.25490973]
 [ 0.13823297]]

In [5]:
# MLP - Classification


#TODO: logistic layer, softmax and classification examples

In [6]:
#TODO: RNN and BPTT

In [7]:
#TODO: LSTM

In [8]:
#TODO: CNN

In [9]:
#some other things: droput, regularization, batch normalization, weight decay, momentum, stopping criteria for trainer, 
# automatic minibatch sizing, ...

#out of scope: computational graph