In [8]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from IPython.html.widgets import interact  #not used below; on newer installs interact lives in ipywidgets

from sklearn.datasets import load_digits
digits = load_digits()

In [9]:
"""This part defines the sigmoid functions, which ae used to calculate the
output of each neuron. At large positive and negative x, it approximates the step
function, but curves near zero."""

def sigmoid(x):
    return 1/(1 + np.exp(-x))

sigmoid_v = np.vectorize(sigmoid)

def sigmoidprime(x):
    return sigmoid(x) * (1 - sigmoid(x))

sigmoidprime_v = np.vectorize(sigmoidprime)
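
As a quick sanity check (a sketch, not one of the original cells), the analytic
derivative can be compared against a centered finite difference; the two should
agree very closely. Note that np.exp already broadcasts over arrays, so the
np.vectorize wrappers are convenient but not strictly required.

# minimal sketch: compare sigmoidprime with a numerical derivative
xs = np.linspace(-5, 5, 11)
h = 1e-5
numeric = (sigmoid(xs + h) - sigmoid(xs - h)) / (2 * h)
print(np.max(np.abs(numeric - sigmoidprime(xs))))  #should be very close to zero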

In [10]:
x = np.linspace(-15, 15, 100)
y = sigmoid(x)
plt.plot(x, y)
plt.ylim(-0.3, 1.3)


Out[10]:
(-0.3, 1.3)

In [11]:
"""This section created random initial weights and biases. It also takes the
data and answers from the source, and converts the answers into 10-d vectors

ex. 6 --> [0, 0, 0, 0, 0, 0, 1, 0, 0, 0]"""

size = [64, 20, 10]

weights = []
for n in range(1, len(size)):
    weights.append(np.random.rand(size[n], size[n-1]) * 2 - 1)

biases = []
for n in range(1, len(size)):
    biases.append(np.random.rand(size[n]) * 2 - 1)

trainingdata = digits.data[0:1200]
traininganswers = digits.target[0:1200]
lc = 0.02

#convert each integer answer into a 10-dimensional one-hot vector
#(the digits dataset has 1797 samples; only the training slice above is used)
traininganswervectors = np.zeros((len(digits.target), 10))
for n in range(len(digits.target)):
    traininganswervectors[n][digits.target[n]] = 1
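
The same encoding can be written in one line by indexing into an identity matrix;
this is just an equivalent alternative (a sketch), not the approach used in the
cells that follow.

# one-line one-hot encoding, equivalent to the loop above
traininganswervectors_alt = np.eye(10)[digits.target]
print(np.array_equal(traininganswervectors_alt[:1200], traininganswervectors[:1200]))  #should print True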

In [12]:
"""This function calculates the output of the network based on the weights and
biases, and uses the sigmoid function."""
def feedforward(weights, biases, a):
    b = []
    #first element is inputs "a"
    b.append(a)
    for n in range(1, len(size)):
        #all other elements depend on the number of neurons
        b.append(np.zeros(size[n]))
        for n2 in range(0, size[n]):
            b[n][n2] = sigmoid_v(np.dot(weights[n-1][n2], b[n-1]) + biases[n-1][n2])
      
    return b
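
The per-neuron loops above can also be written with whole-layer matrix operations,
since weights[n-1] has shape (size[n], size[n-1]). A sketch of an equivalent,
vectorized feedforward (same result, just faster):

# minimal vectorized sketch of the same feedforward pass
def feedforward_vec(weights, biases, a):
    activations = [a]
    for w, b in zip(weights, biases):
        a = sigmoid(np.dot(w, a) + b)  #one matrix-vector product per layer
        activations.append(a)
    return activations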

In [13]:
feedforward(weights, biases, trainingdata[0])


Out[13]:
[array([  0.,   0.,   5.,  13.,   9.,   1.,   0.,   0.,   0.,   0.,  13.,
         15.,  10.,  15.,   5.,   0.,   0.,   3.,  15.,   2.,   0.,  11.,
          8.,   0.,   0.,   4.,  12.,   0.,   0.,   8.,   8.,   0.,   0.,
          5.,   8.,   0.,   0.,   9.,   8.,   0.,   0.,   4.,  11.,   0.,
          1.,  12.,   7.,   0.,   0.,   2.,  14.,   5.,  10.,  12.,   0.,
          0.,   0.,   0.,   6.,  13.,  10.,   0.,   0.,   0.]),
 array([  3.21682303e-06,   7.51067561e-02,   4.85302615e-02,
          9.44295088e-01,   1.00000000e+00,   5.60922030e-19,
          9.99995209e-01,   2.80763966e-17,   2.17687836e-21,
          1.00000000e+00,   1.00000000e+00,   9.98164651e-01,
          2.55007164e-16,   9.78797916e-06,   1.00000000e+00,
          2.91803266e-19,   1.00000000e+00,   9.99998105e-01,
          1.00000000e+00,   3.79032818e-23]),
 array([ 0.93938143,  0.85755443,  0.90042992,  0.54905622,  0.49554056,
         0.64940774,  0.21117831,  0.02616924,  0.57829814,  0.0189328 ])]

In [14]:
"""This function just calculates the difference between the answer the network
provides and the given answer for an input for each element of the answer"""

def costderivative(output, answers):
    return (output - answers)
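
For reference, the cost whose gradient this is never appears explicitly in the
notebook; a sketch of it, for a single training example:

# quadratic cost for one example: C = 0.5 * ||output - answers||^2
# costderivative above is dC/d(output)
def quadratic_cost(output, answers):
    return 0.5 * np.sum((output - answers) ** 2)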

In [15]:
"""This function is used to find the 'minimum' of the cost derivative. It selects
a 'minibatch' of random inputs from the training set to approximate the behavior
of the whole set."""

def gradient_descent(weights, biases, inputs, answers, batchsize, lc, epochs):
    for n in range(epochs):
        #pick random locations for input/result data
        locations = np.random.randint(0, len(inputs), batchsize)
        minibatch = []
        #create tuples (inputs, result) based on random locations
        for n2 in range(batchsize):
            minibatch.append((inputs[locations[n2]], answers[locations[n2]]))
        #note: this applies the same minibatch batchsize times, which effectively
        #multiplies the step size by batchsize; a single call to train would do
        for n3 in range(batchsize):
            weights, biases = train(weights, biases, minibatch, lc)
        
        
        results = []
        for n4 in range(len(inputs)):
            results.append(feedforward(weights, biases, inputs[n4])[-1])
            
        accresult = accuracy(inputs, results, answers)
        print("Epoch ", n, " : ", accresult)
        
    return weights, biases
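
In the usual formulation of SGD an epoch is one full pass over the training set,
split into disjoint minibatches, rather than a single random minibatch. A sketch
of that loop (assuming the train function defined in the next cell), for
comparison:

# one full-pass SGD epoch: shuffle, then train on each disjoint minibatch once
def sgd_epoch(weights, biases, inputs, answers, batchsize, lc):
    order = np.random.permutation(len(inputs))
    for start in range(0, len(inputs), batchsize):
        idx = order[start:start + batchsize]
        minibatch = [(inputs[i], answers[i]) for i in idx]
        weights, biases = train(weights, biases, minibatch, lc)
    return weights, biases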

In [16]:
"""This function adjusts the weights and biases based on the results of the
backpropagation function"""

def train(weights, biases, minibatch, lc):
    #initialize the nabla (gradient) accumulators to zeros with the same shapes as the biases and weights
    nb = [np.zeros(b.shape) for b in biases]
    nw = [np.zeros(w.shape) for w in weights]
    #largely taken from Michael Nielsen's implementation
    for i, r in minibatch:
        dnb, dnw = backprop(weights, biases, i, r)
        nb = [a+b for a, b in zip(nb, dnb)]
        nw = [a+b for a, b in zip(nw, dnw)]
    
    weights = [w-(lc/len(minibatch))*n_w for w, n_w in zip(weights, nw)]
    biases = [b-(lc/len(minibatch))*n_b for b, n_b in zip(biases, nb)]
    return weights, biases
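
In symbols, each parameter takes a small step against the minibatch-averaged
gradient: w -> w - (lc/len(minibatch)) * (sum of dC/dw over the minibatch), and
likewise for each bias b.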

In [17]:
"""This function is the most complex, and likely where I made an error. It 
calculates the gradient of the change for the output, then calculates it for
each step before it (hence BACKpropagation)."""

def backprop(weights, biases, inputs, answers):
    #initialize the gradient (nabla) arrays with the same shapes as the biases and weights
    nb = [np.zeros(b.shape) for b in biases]
    nw = [np.zeros(w.shape) for w in weights]
    a = inputs
    alist = [inputs]
    zlist = []
    #from feedforward
    for n in range(1, len(size)):
        #all other elements depend on the number of neurons
        zlist.append(np.zeros(size[n]))
        alist.append(np.zeros(size[n]))
        for n2 in range(0, size[n]):
            zlist[n-1][n2] = np.dot(weights[n-1][n2], alist[n-1]) + biases[n-1][n2]
            alist[n][n2] = sigmoid_v(zlist[n-1][n2])
    
    delta = costderivative(alist[-1], answers) * sigmoidprime_v(zlist[-1])
    nb[-1] = delta
    #different from MN, alist[-2] not same size as delta?
    nw[-1] = np.dot(delta, alist[-1].transpose())
    
    for n in range(2, len(size)):
        delta = np.dot(weights[-n+1].transpose(), delta) * sigmoidprime_v(zlist[-n])
        nb[-n] = delta
        #same here
        nw[-n] = np.dot(delta, alist[-n].transpose())
    
    return nb, nw
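
The dimension question flagged in the comments above is very likely the error:
alist[-2] is indeed not the same size as delta, and that is expected, because the
weight gradient should be the outer product of a layer's delta with the
activations of the layer feeding into it. np.dot of two 1-D arrays collapses to a
single scalar instead, so every weight in a layer ends up receiving the same
gradient. A sketch of the corrected final section of backprop, following
Nielsen's formulation (not re-run here, so it is not reflected in the outputs
recorded below):

    delta = costderivative(alist[-1], answers) * sigmoidprime_v(zlist[-1])
    nb[-1] = delta
    #outer product: shape (size[-1], size[-2]), the same shape as weights[-1]
    nw[-1] = np.outer(delta, alist[-2])

    for n in range(2, len(size)):
        delta = np.dot(weights[-n+1].transpose(), delta) * sigmoidprime_v(zlist[-n])
        nb[-n] = delta
        #matches the shape of weights[-n]
        nw[-n] = np.outer(delta, alist[-n-1])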

In [18]:
"""This function is just used to test the accuracy of each epoch. It converts the
outputs into a vector like the answers, taking the largest value to be the '1'

ex [0.21, 0.06, 0.134, 0.952, 0.558, 0.031, 0.511, 0.105, 0.216, 0.041] -->
    [0, 0, 0, 1, 0, 0, 0, 0, 0, 0] == 3"""

def accuracy(inputs, results, answers):
    correct = 0
    #copy so that the caller's results are not modified in place
    binresults = [np.copy(r) for r in results]
    for n in range(0, len(results)):
        #converts the output into a binary y/n for each digit
        for n2 in range(len(results[n])):
            if results[n][n2] == np.amax(results[n]):
                binresults[n][n2] = 1
            else:
                binresults[n][n2] = 0
        
        if np.array_equal(answers[n], binresults[n]):
            correct += 1
    return correct / len(results)
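
The same measurement can be done directly with argmax, without building the
binary vectors; a sketch of an equivalent check (the position of the largest
output compared with the position of the '1' in the answer vector):

# argmax-based accuracy, equivalent for one-hot answer vectors
def accuracy_argmax(results, answers):
    hits = sum(np.argmax(r) == np.argmax(a) for r, a in zip(results, answers))
    return hits / len(results)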

In [19]:
size = [64, 20, 10]

weights = []
for n in range(1, len(size)):
    weights.append(np.random.rand(size[n], size[n-1]) * 2 - 1)

biases = []
for n in range(1, len(size)):
    biases.append(np.random.rand(size[n]) * 2 - 1)

trainingdata = digits.data[0:500]
traininganswers = digits.target[0:500]

traininganswervectors = np.zeros((500,10))
for n in range(500):
    traininganswervectors[n][digits.target[n]] = 1

In [20]:
final_weights, final_biases = gradient_descent(weights, biases, trainingdata,
                                              traininganswervectors, 5, 1, 100)


Epoch  0  :  0.122
Epoch  1  :  0.122
Epoch  2  :  0.134
Epoch  3  :  0.146
Epoch  4  :  0.11
Epoch  5  :  0.136
Epoch  6  :  0.136
Epoch  7  :  0.124
Epoch  8  :  0.116
Epoch  9  :  0.136
Epoch  10  :  0.124
Epoch  11  :  0.104
Epoch  12  :  0.118
Epoch  13  :  0.104
Epoch  14  :  0.108
Epoch  15  :  0.106
Epoch  16  :  0.102
Epoch  17  :  0.092
Epoch  18  :  0.09
Epoch  19  :  0.09
Epoch  20  :  0.09
Epoch  21  :  0.096
Epoch  22  :  0.098
Epoch  23  :  0.1
Epoch  24  :  0.096
Epoch  25  :  0.108
Epoch  26  :  0.104
Epoch  27  :  0.1
Epoch  28  :  0.1
Epoch  29  :  0.1
Epoch  30  :  0.108
Epoch  31  :  0.098
Epoch  32  :  0.124
Epoch  33  :  0.1
Epoch  34  :  0.098
Epoch  35  :  0.092
Epoch  36  :  0.106
Epoch  37  :  0.104
Epoch  38  :  0.104
Epoch  39  :  0.1
Epoch  40  :  0.11
Epoch  41  :  0.108
Epoch  42  :  0.102
Epoch  43  :  0.1
Epoch  44  :  0.092
Epoch  45  :  0.096
Epoch  46  :  0.104
Epoch  47  :  0.102
Epoch  48  :  0.11
Epoch  49  :  0.126
Epoch  50  :  0.112
Epoch  51  :  0.11
Epoch  52  :  0.112
Epoch  53  :  0.12
Epoch  54  :  0.124
Epoch  55  :  0.102
Epoch  56  :  0.094
Epoch  57  :  0.104
Epoch  58  :  0.106
Epoch  59  :  0.1
Epoch  60  :  0.104
Epoch  61  :  0.084
Epoch  62  :  0.068
Epoch  63  :  0.066
Epoch  64  :  0.07
Epoch  65  :  0.078
Epoch  66  :  0.1
Epoch  67  :  0.098
Epoch  68  :  0.098
Epoch  69  :  0.08
Epoch  70  :  0.08
Epoch  71  :  0.082
Epoch  72  :  0.08
Epoch  73  :  0.072
Epoch  74  :  0.072
Epoch  75  :  0.072
Epoch  76  :  0.07
Epoch  77  :  0.07
Epoch  78  :  0.072
Epoch  79  :  0.082
Epoch  80  :  0.078
Epoch  81  :  0.08
Epoch  82  :  0.092
Epoch  83  :  0.11
Epoch  84  :  0.084
Epoch  85  :  0.068
Epoch  86  :  0.062
Epoch  87  :  0.066
Epoch  88  :  0.102
Epoch  89  :  0.102
Epoch  90  :  0.1
Epoch  91  :  0.1
Epoch  92  :  0.1
Epoch  93  :  0.1
Epoch  94  :  0.098
Epoch  95  :  0.066
Epoch  96  :  0.072
Epoch  97  :  0.048
Epoch  98  :  0.076
Epoch  99  :  0.076