In [ ]:
'''
A minimal recurrent neural network that learns 8-bit binary addition,
following the tutorial at
http://iamtrask.github.io/2015/11/15/anyone-can-code-lstm/
'''

In [1]:
import copy, numpy as np

In [2]:
np.random.seed(0)

In [3]:
def sigmoid(x):
    """Element-wise logistic function: 1 / (1 + e^(-x)), mapping R -> (0, 1)."""
    return 1 / (1 + np.exp(-x))

def sigmoid_output_to_derivative(output):
    """Sigmoid derivative expressed in terms of the sigmoid's output: s * (1 - s)."""
    complement = 1 - output
    return output * complement

In [4]:
# Training dataset generation: a lookup table mapping each integer in
# [0, 2**binary_dim) to its big-endian binary representation (uint8 bit vector).
binary_dim = 8
largest_number = 2 ** binary_dim

binary = np.unpackbits(
    np.arange(largest_number, dtype=np.uint8).reshape(-1, 1), axis=1)
int2binary = {i: binary[i] for i in range(largest_number)}

In [5]:
# Sanity check: number of distinct values representable in binary_dim bits.
pow(2, binary_dim)


Out[5]:
256

In [6]:
# Sanity check: a 1 x 256 row vector holding every 8-bit integer value.
np.array([range(256)], dtype=np.uint8)


Out[6]:
array([[  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
         39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
         52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
         65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
         78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
         91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
        104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
        117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
        130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
        143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
        156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
        169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181,
        182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
        195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
        208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220,
        221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233,
        234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246,
        247, 248, 249, 250, 251, 252, 253, 254, 255]], dtype=uint8)

In [7]:
# Sanity check: transpose to 256 x 1, then unpack each byte into its 8 bits
# (rows are big-endian, MSB first).
np.unpackbits(np.array([range(256)], dtype=np.uint8).T, axis = 1)


Out[7]:
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 1, 0],
       ..., 
       [1, 1, 1, ..., 1, 0, 1],
       [1, 1, 1, ..., 1, 1, 0],
       [1, 1, 1, ..., 1, 1, 1]], dtype=uint8)

In [8]:
# hyperparameters
alpha = 0.1       # learning rate for the gradient-descent step
input_dim = 2     # one bit from each of the two operands per timestep
hidden_dim = 16   # size of the recurrent hidden state
output_dim = 1    # one predicted sum bit per timestep

In [20]:
# initialize neural network weights uniformly in [-1, 1)
syn_0 = 2 * np.random.random((input_dim, hidden_dim)) - 1   # input -> hidden
syn_1 = 2 * np.random.random((hidden_dim, output_dim)) - 1  # hidden -> output
syn_h = 2 * np.random.random((hidden_dim, hidden_dim)) - 1  # hidden -> hidden (recurrent)

In [21]:
# Gradient accumulators, zero-initialized to match each weight matrix's shape.
syn_0_update, syn_1_update, syn_h_update = (
    np.zeros_like(w) for w in (syn_0, syn_1, syn_h)
)

In [22]:
# training logic: teach the RNN to add two 8-bit numbers, one bit per timestep
for j in range(10001):
    # generate a simple addition problem (a + b = c); cap each operand at
    # largest_number // 2 so the sum always fits in binary_dim bits.
    # NOTE: integer floor division — np.random.randint requires an integer
    # bound (a float here is deprecated/rejected by modern NumPy).
    a_int = np.random.randint(largest_number // 2)
    a = int2binary[a_int] # binary encoding
    
    b_int = np.random.randint(largest_number // 2)
    b = int2binary[b_int]
    
    c_int = a_int + b_int
    c = int2binary[c_int]
    
    # where we'll store our best guess (binary encoded)
    d = np.zeros_like(c)
    
    overallError = 0
    
    layer_2_deltas = list()
    layer_1_values = list()
    # seed the recurrence with a zero hidden state for the first timestep
    layer_1_values.append(np.zeros(hidden_dim))
    
    # forward pass: moving along the positions in the binary encoding,
    # least-significant bit first (index binary_dim - 1)
    for position in range(binary_dim):
        # generate input (one bit from each operand) and target (sum bit)
        X = np.array([[a[binary_dim - position - 1], b[binary_dim - position - 1]]])
        y = np.array([[c[binary_dim - position - 1]]]).T
        
        # hidden layer (input + prev_hidden)
        layer_1 = sigmoid(np.dot(X, syn_0) + np.dot(layer_1_values[-1], syn_h))
        
        # output layer
        layer_2 = sigmoid(np.dot(layer_1, syn_1))
        
        # squared-error cost and its gradient at the output pre-activation
        cost = np.sum(np.square(y - layer_2))/2
        layer_2_error = layer_2 - y
        layer_2_delta = layer_2_error * sigmoid_output_to_derivative(layer_2)
        layer_2_deltas.append(layer_2_delta)
        overallError += cost
        
        # decode estimate so we can print it out
        d[binary_dim - position - 1] = np.round(layer_2[0][0])
        
        # store hidden layer so we can use it in the next timestep
        layer_1_values.append(copy.deepcopy(layer_1))
        
    future_layer_1_delta = np.zeros(hidden_dim)
    
    # backward pass (backpropagation through time), most recent timestep first
    for position in range(binary_dim):
        
        X = np.array([[a[position], b[position]]])
        layer_1 = layer_1_values[-1 - position]
        prev_layer_1 = layer_1_values[-1 - position - 1]
        
        # error at output layer
        layer_2_delta = layer_2_deltas[-1 - position]
        
        # error at hidden layer; gradient arrives via two paths:
        # 1. hidden layer (layer_1) passed to output layer
        # 2. hidden layer (layer_1) also passed to the hidden layer itself
        #    at the next timestep
        layer_1_delta = (layer_2_delta.dot(syn_1.T) + future_layer_1_delta.dot(syn_h.T)) * sigmoid_output_to_derivative(layer_1)
        
        # accumulate weight gradients over all timesteps
        syn_1_update += np.atleast_2d(layer_1).T.dot(layer_2_delta)
        syn_h_update += np.atleast_2d(prev_layer_1).T.dot(layer_1_delta)
        syn_0_update += X.T.dot(layer_1_delta)
        
        future_layer_1_delta = layer_1_delta
        
    # gradient-descent step, then zero the accumulators for the next example
    syn_0 -= syn_0_update * alpha
    syn_1 -= syn_1_update * alpha
    syn_h -= syn_h_update * alpha
    
    syn_0_update *= 0
    syn_1_update *= 0
    syn_h_update *= 0
    
    # print out progress every 2000 iterations
    if (j % 2000 == 0):
        print("Error: ", overallError)
        print("Pred: ", d)
        print("True: ", c)
        # decode the predicted bit vector (MSB first) back to an integer
        out = 0
        for index, x in enumerate(reversed(d)):
            out += x * pow(2, index)
        print(a_int, " + ", b_int, " = ", out)
        print("----------")


Error:  1.04679338382
Pred:  [0 0 0 0 0 0 0 1]
True:  [0 1 1 1 1 0 0 1]
43  +  78  =  1
----------
Error:  1.11273823275
Pred:  [0 1 1 1 1 1 0 1]
True:  [1 0 0 1 0 1 0 0]
22  +  126  =  125
----------
Error:  0.847013893305
Pred:  [0 1 0 1 0 0 0 0]
True:  [0 1 0 1 1 0 0 1]
41  +  48  =  80
----------
Error:  0.230761729471
Pred:  [1 0 0 0 0 0 1 0]
True:  [1 0 0 0 0 0 1 0]
95  +  35  =  130
----------
Error:  0.0128427033786
Pred:  [0 1 1 1 0 1 1 0]
True:  [0 1 1 1 0 1 1 0]
9  +  109  =  118
----------
Error:  0.0037268223704
Pred:  [0 1 0 1 0 1 1 0]
True:  [0 1 0 1 0 1 1 0]
81  +  5  =  86
----------