Theano RNN



In [1]:

    
import theano
import os, sys
sys.path.insert(1, os.path.join('../utils'))
from utils import *
# from __future__ import division, print_functions









    



Using Theano backend.



In [2]:

    
path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
text = open(path).read()
print('corpus length:', len(text))
chars = sorted(list(set(text)))
vocab_size = len(chars) + 1
print('total chars:', vocab_size)









    



('corpus length:', 600901)
('total chars:', 86)



In [3]:

    
chars.insert(0, "\0")
''.join(chars[1:-6])









    Out[3]:





'\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz'



In [4]:

    
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))
idx = [char_indices[c] for c in text]
# ''.join(indices_char[i] for i in idx[:70])



In [5]:

    
n_hidden, n_fac, cs = 256, 42, 8



In [6]:

    
c_in_dat = [[idx[i+n] for i in xrange(0, len(idx)-1-cs, cs)] for n in range(cs)]
c_out_dat = [[idx[i+cs] for i in xrange(0, len(idx)-cs, cs)] for n in range(cs)]

xs = [np.stack(c[:-2]) for c in c_in_dat]
ys = [np.stack(c[:-2]) for c in c_out_dat]

So here's the trouble spot. First looking at the shape & visualizing xs, then seeing how we can get xs to be of the right shape



In [22]:

    
for row in [xs[n][:cs] for n in range(cs)]: print row
xs[0].shape









    



[40  1 33  2 72 67 73  2]
[42  1 38 44  2  9 61 73]
[29 43 31 71 54  9 58 61]
[30 45  2 74  2 76 67 58]
[25 40 73 73 76 61 24 71]
[27 40 61 61 68 54  2 58]
[29 39 54  2 66 73 33  2]
[ 1 43 73 62 54  2 72 67]






    Out[22]:





(75110,)

By trial & error found out axis=2 is the right axis to expand. And looks like it gives me exactly what I was looking for.



In [17]:

    
# xs2 = np.stack(np.squeeze(xs), axis=0)
xs2 = np.expand_dims(xs, axis=2)



In [18]:

    
xs2[0].shape









    Out[18]:





(75110, 1)



In [24]:

    
for row in [xs2[n][:cs] for n in range(cs)]: print row
xs2[0].shape









    



[[40]
 [ 1]
 [33]
 [ 2]
 [72]
 [67]
 [73]
 [ 2]]
[[42]
 [ 1]
 [38]
 [44]
 [ 2]
 [ 9]
 [61]
 [73]]
[[29]
 [43]
 [31]
 [71]
 [54]
 [ 9]
 [58]
 [61]]
[[30]
 [45]
 [ 2]
 [74]
 [ 2]
 [76]
 [67]
 [58]]
[[25]
 [40]
 [73]
 [73]
 [76]
 [61]
 [24]
 [71]]
[[27]
 [40]
 [61]
 [61]
 [68]
 [54]
 [ 2]
 [58]]
[[29]
 [39]
 [54]
 [ 2]
 [66]
 [73]
 [33]
 [ 2]]
[[ 1]
 [43]
 [73]
 [62]
 [54]
 [ 2]
 [72]
 [67]]






    Out[24]:





(75110, 1)



In [25]:

    
print(len(xs), len(xs2))









    



(8, 8)

Now to build an RNN in Theano and test it out. A successful result is a loss of around ~14.4, and an okay'ish prediction of the next character in an 8 character series.



In [51]:

    
# One Hot Encoding xs & ys:
oh_ys = [to_categorical(o, vocab_size) for o in ys]
oh_y_rnn = np.stack(oh_ys, axis=1)

# seeing if there's a difference between the x & x2 versions
oh_xs = [to_categorical(o, vocab_size) for o in xs]
oh_x_rnn = np.stack(oh_xs, axis=1)

oh_xs2 = [to_categorical(o, vocab_size) for o in xs2]
oh_x2_rnn = np.stack(oh_xs, axis=1)

xs3 = [np.stack(x[:]) for x in xs2]
oh_xs3 = [to_categorical(o, vocab_size) for o in xs3]
oh_x3_rnn = np.stack(oh_xs, axis=1)

oh_x_rnn.shape, oh_x2_rnn.shape, oh_x3_rnn.shape, oh_y_rnn.shape









    Out[51]:





((75110, 8, 86), (75110, 8, 86), (75110, 8, 86), (75110, 8, 86))



In [27]:

    
n_input = vocab_size
n_output = vocab_size

def init_wgts(rows, cols):
    scale = math.sqrt(2/rows)
    return shared(normal(scale=scale, size=(rows,cols)).astype(np.float32))
def init_bias(rows):
    return shared(np.zeros(rows, dtype=np.float32))
def wgts_and_bias(n_in, n_out):
    return init_wgts(n_in, n_out), init_bias(n_out)
def id_and_bias(n):
    return shared(np.eye(n, dtype=np.float32)), init_bias(n)

# Theano Variables
t_inp = T.matrix('inp')
t_outp = T.matrix('outp')
t_h0 = T.vector('h0')
lr = T.scalar('lr')

all_args = [t_h0, t_inp, t_outp, lr]

W_h = id_and_bias(n_hidden)
W_x = wgts_and_bias(n_input, n_hidden)
W_y = wgts_and_bias(n_hidden, n_output)
w_all = list(chain.from_iterable([W_h, W_x, W_y]))

def step(x, h, W_h, b_h, W_x, b_x, W_y, b_y):
    # Calculate the hidden activations
    h = nnet.relu(T.dot(x, W_x) + b_x + T.dot(h, W_h) + b_h)
    # Calculate the output activations
    y = nnet.softmax(T.dot(h, W_y) + b_y)
    # Return both (the `Flatten()` is to work around a Theano bug)
    return h, T.flatten(y, 1)



In [28]:

    
[v_h, v_y], _ = theano.scan(step, sequences=t_inp,
                             outputs_info=[t_h0, None], non_sequences=w_all)

error = nnet.categorical_crossentropy(v_y, t_outp).sum()
g_all = T.grad(error, w_all)

def upd_dict(wgts, grads, lr):
    return OrderedDict({w: w - g * lr for (w,g) in zip(wgts, grads)})

upd = upd_dict(w_all, g_all, lr)

# finally ready to compile the function
fn = theano.function(all_args, error, updates=upd, allow_input_downcast=True)









    



/Users/WayNoxchi/Miniconda3/Theano/theano/tensor/basic.py:5130: UserWarning: flatten outdim parameter is deprecated, use ndim instead.
  "flatten outdim parameter is deprecated, use ndim instead.")



In [56]:

    
X = oh_x_rnn
X2 = oh_x2_rnn
X3 = oh_x3_rnn
Y = oh_y_rnn
X.shape, X2.shape, X3.shape, Y.shape









    Out[56]:





((75110, 8, 86), (75110, 8, 86), (75110, 8, 86), (75110, 8, 86))



In [57]:

    
# gonna run on X and X2 and X3 simultaneously 

err = 0.0; err2 = 0.0; err3 = 0.0; l_rate = 0.01
for i in xrange(len(X)):
    err += fn(np.zeros(n_hidden), X[i], Y[i], l_rate)
    err2 += fn(np.zeros(n_hidden), X2[i], Y[i], l_rate)
    err3 += fn(np.zeros(n_hidden), X3[i], Y[i], l_rate)
    if i % 1000 == 999:
        print ("ErrorX:{:.3f}   ErrorX2:{:.3f}   ErrorX2:{:.3f}".format(err/1000, err2/1000, err3/1000))
        err=0.0; err2=0.0; err3=0.0









    



ErrorX:26.444   ErrorX2:25.839   ErrorX2:25.239
ErrorX:25.633   ErrorX2:25.031   ErrorX2:24.435
ErrorX:26.118   ErrorX2:25.512   ErrorX2:24.913
ErrorX:25.408   ErrorX2:24.807   ErrorX2:24.213
ErrorX:25.354   ErrorX2:24.753   ErrorX2:24.159
ErrorX:25.901   ErrorX2:25.295   ErrorX2:24.695
ErrorX:25.221   ErrorX2:24.619   ErrorX2:24.023
ErrorX:25.274   ErrorX2:24.672   ErrorX2:24.076
ErrorX:25.171   ErrorX2:24.571   ErrorX2:23.978
ErrorX:25.891   ErrorX2:25.286   ErrorX2:24.687
ErrorX:25.283   ErrorX2:24.679   ErrorX2:24.081
ErrorX:25.175   ErrorX2:24.574   ErrorX2:23.979
ErrorX:26.283   ErrorX2:25.680   ErrorX2:25.083
ErrorX:25.173   ErrorX2:24.571   ErrorX2:23.975
ErrorX:25.074   ErrorX2:24.472   ErrorX2:23.877
ErrorX:26.220   ErrorX2:25.615   ErrorX2:25.016
ErrorX:26.562   ErrorX2:25.958   ErrorX2:25.361
ErrorX:25.915   ErrorX2:25.311   ErrorX2:24.714
ErrorX:25.131   ErrorX2:24.529   ErrorX2:23.933
ErrorX:25.093   ErrorX2:24.492   ErrorX2:23.896
ErrorX:24.690   ErrorX2:24.092   ErrorX2:23.500
ErrorX:25.668   ErrorX2:25.063   ErrorX2:24.464
ErrorX:25.267   ErrorX2:24.666   ErrorX2:24.072
ErrorX:24.826   ErrorX2:24.226   ErrorX2:23.633
ErrorX:25.325   ErrorX2:24.724   ErrorX2:24.128
ErrorX:25.232   ErrorX2:24.631   ErrorX2:24.036
ErrorX:25.642   ErrorX2:25.038   ErrorX2:24.440
ErrorX:25.699   ErrorX2:25.095   ErrorX2:24.497
ErrorX:25.550   ErrorX2:24.948   ErrorX2:24.352
ErrorX:25.439   ErrorX2:24.836   ErrorX2:24.239
ErrorX:26.538   ErrorX2:25.931   ErrorX2:25.330
ErrorX:25.375   ErrorX2:24.773   ErrorX2:24.177
ErrorX:26.081   ErrorX2:25.477   ErrorX2:24.878
ErrorX:25.327   ErrorX2:24.726   ErrorX2:24.130
ErrorX:25.764   ErrorX2:25.158   ErrorX2:24.558
ErrorX:25.831   ErrorX2:25.227   ErrorX2:24.629
ErrorX:25.441   ErrorX2:24.840   ErrorX2:24.246
ErrorX:25.811   ErrorX2:25.207   ErrorX2:24.608
ErrorX:25.632   ErrorX2:25.030   ErrorX2:24.435
ErrorX:25.783   ErrorX2:25.180   ErrorX2:24.582
ErrorX:25.294   ErrorX2:24.690   ErrorX2:24.093
ErrorX:25.373   ErrorX2:24.770   ErrorX2:24.173
ErrorX:25.560   ErrorX2:24.956   ErrorX2:24.358
ErrorX:26.182   ErrorX2:25.577   ErrorX2:24.977
ErrorX:26.283   ErrorX2:25.678   ErrorX2:25.080
ErrorX:26.099   ErrorX2:25.494   ErrorX2:24.895
ErrorX:25.429   ErrorX2:24.829   ErrorX2:24.235
ErrorX:24.666   ErrorX2:24.083   ErrorX2:23.508
ErrorX:24.783   ErrorX2:24.183   ErrorX2:23.588
ErrorX:25.087   ErrorX2:24.486   ErrorX2:23.892
ErrorX:24.824   ErrorX2:24.223   ErrorX2:23.629
ErrorX:25.160   ErrorX2:24.557   ErrorX2:23.960
ErrorX:24.732   ErrorX2:24.133   ErrorX2:23.541
ErrorX:24.879   ErrorX2:24.276   ErrorX2:23.679
ErrorX:25.771   ErrorX2:25.165   ErrorX2:24.565
ErrorX:24.673   ErrorX2:24.072   ErrorX2:23.478
ErrorX:24.885   ErrorX2:24.285   ErrorX2:23.691
ErrorX:24.811   ErrorX2:24.209   ErrorX2:23.614
ErrorX:24.899   ErrorX2:24.297   ErrorX2:23.701
ErrorX:24.719   ErrorX2:24.117   ErrorX2:23.522
ErrorX:25.151   ErrorX2:24.548   ErrorX2:23.950
ErrorX:25.047   ErrorX2:24.445   ErrorX2:23.850
ErrorX:25.210   ErrorX2:24.607   ErrorX2:24.010
ErrorX:25.190   ErrorX2:24.587   ErrorX2:23.990
ErrorX:24.671   ErrorX2:24.072   ErrorX2:23.478
ErrorX:24.979   ErrorX2:24.376   ErrorX2:23.779
ErrorX:24.805   ErrorX2:24.205   ErrorX2:23.610
ErrorX:24.792   ErrorX2:24.191   ErrorX2:23.596
ErrorX:24.883   ErrorX2:24.284   ErrorX2:23.691
ErrorX:24.187   ErrorX2:23.589   ErrorX2:22.997
ErrorX:25.391   ErrorX2:24.788   ErrorX2:24.191
ErrorX:24.992   ErrorX2:24.390   ErrorX2:23.794
ErrorX:24.642   ErrorX2:24.042   ErrorX2:23.447
ErrorX:24.650   ErrorX2:24.049   ErrorX2:23.454
ErrorX:24.096   ErrorX2:23.498   ErrorX2:22.906

It's a little better but no where near good enough. I need to find out what's wrong..



In [49]:

    
xs3 = [np.stack(x[:]) for x in xs2]
xs3[0].shape









    Out[49]:





(75110, 1)



In [59]:

    
xs3









    Out[59]:





[array([[40],
        [ 1],
        [33],
        ..., 
        [72],
        [71],
        [61]]), array([[42],
        [ 1],
        [38],
        ..., 
        [73],
        [65],
        [58]]), array([[29],
        [43],
        [31],
        ..., 
        [62],
        [57],
        [ 2]]), array([[30],
        [45],
        [ 2],
        ..., 
        [54],
        [ 2],
        [62]]), array([[25],
        [40],
        [73],
        ..., 
        [67],
        [54],
        [67]]), array([[27],
        [40],
        [61],
        ..., 
        [ 2],
        [72],
        [57]]), array([[29],
        [39],
        [54],
        ..., 
        [76],
        [ 2],
        [62]]), array([[ 1],
        [43],
        [73],
        ..., 
        [68],
        [73],
        [56]])]

xs3 is in the proper format.. so I have to look at the oh_xs3 and oh_xs3_rnn versions



In [60]:

    
oh_xs3[0]









    Out[60]:





array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  1.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]: