Recurrent Neural Networks

For an introduction to RNN take a look at this great article.

Basic RNNs


In [1]:
# Common imports
import numpy as np
import numpy.random as rnd
import os

# to make this notebook's output stable across runs
rnd.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Tensorflow 
import tensorflow as tf

# 
from IPython.display import clear_output, Image, display, HTML

def strip_consts(graph_def, max_const_size=32):
    """Strip large constant values from graph_def."""
    strip_def = tf.GraphDef()
    for n0 in graph_def.node:
        n = strip_def.node.add() 
        n.MergeFrom(n0)
        if n.op == 'Const':
            tensor = n.attr['value'].tensor
            size = len(tensor.tensor_content)
            if size > max_const_size:
                tensor.tensor_content = "b<stripped %d bytes>"%size
    return strip_def

def show_graph(graph_def, max_const_size=32):
    """Visualize TensorFlow graph."""
    if hasattr(graph_def, 'as_graph_def'):
        graph_def = graph_def.as_graph_def()
    strip_def = strip_consts(graph_def, max_const_size=max_const_size)
    code = """
        <script>
          function load() {{
            document.getElementById("{id}").pbtxt = {data};
          }}
        </script>
        <link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
        <div style="height:600px">
          <tf-graph-basic id="{id}"></tf-graph-basic>
        </div>
    """.format(data=repr(str(strip_def)), id='graph'+str(np.random.rand()))

    iframe = """
        <iframe seamless style="width:1200px;height:620px;border:0" srcdoc="{}"></iframe>
    """.format(code.replace('"', '&quot;'))
    display(HTML(iframe))

Manual RNN


In [2]:
tf.reset_default_graph()

n_inputs = 3
n_neurons = 5

X0 = tf.placeholder(tf.float32, [None, n_inputs])
X1 = tf.placeholder(tf.float32, [None, n_inputs])

Wx = tf.Variable(tf.random_normal(shape=[n_inputs, n_neurons], dtype=tf.float32))
Wy = tf.Variable(tf.random_normal(shape=[n_neurons, n_neurons], dtype=tf.float32))
b = tf.Variable(tf.zeros([1, n_neurons], dtype=tf.float32))

Y0 = tf.tanh(tf.matmul(X0, Wx) + b)
Y1 = tf.tanh(tf.matmul(Y0, Wy) + tf.matmul(X1, Wx) + b)

init = tf.global_variables_initializer()

X0_batch = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 0, 1]]) # t = 0
X1_batch = np.array([[9, 8, 7], [0, 0, 0], [6, 5, 4], [3, 2, 1]]) # t = 1

with tf.Session() as sess:
    init.run()
    Y0_val, Y1_val = sess.run([Y0, Y1], feed_dict={X0: X0_batch, X1: X1_batch})
    
print(Y0_val)


[[-0.98634392  0.81788576  0.8470158  -0.99247205  0.68880105]
 [-1.         -0.95546728  0.99203932 -0.99999928 -0.45464534]
 [-1.         -0.99989605  0.99961442 -1.         -0.94950742]
 [-1.         -0.99988145 -0.99999958 -1.         -0.99962741]]

In [3]:
print(Y1_val)


[[-1.         -1.          0.96730083 -1.         -0.99997342]
 [ 0.8294723   0.02448707 -0.94491893 -0.92168093 -0.9122532 ]
 [-1.         -0.99999344 -0.28294998 -0.99999934 -0.99998403]
 [-0.99977511 -0.99972141  0.11019414 -0.98003727 -0.99999589]]

Using rnn()

The static_rnn() function creates an unrolled RNN network by chaining cells.


In [4]:
tf.reset_default_graph()

n_inputs = 3
n_neurons = 5

X0 = tf.placeholder(tf.float32, [None, n_inputs])
X1 = tf.placeholder(tf.float32, [None, n_inputs])

basic_cell = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
output_seqs, states = tf.contrib.rnn.static_rnn(basic_cell, [X0, X1], dtype=tf.float32)
Y0, Y1 = output_seqs

init = tf.global_variables_initializer()

X0_batch = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 0, 1]])
X1_batch = np.array([[9, 8, 7], [0, 0, 0], [6, 5, 4], [3, 2, 1]])

with tf.Session() as sess:
    init.run()
    Y0_val, Y1_val = sess.run([Y0, Y1], feed_dict={X0: X0_batch, X1: X1_batch})
    
Y0_val


Out[4]:
array([[-0.48671275, -0.48561448,  0.86629325, -0.72946197, -0.53452528],
       [-0.99417853, -0.77786744,  0.99997634,  0.10450334,  0.41930473],
       [-0.99995065, -0.91371775,  1.        ,  0.8135196 ,  0.90335333],
       [-0.96949047, -0.57477111,  0.9999451 ,  0.99996626,  0.99443597]], dtype=float32)

In [5]:
Y1_val


Out[5]:
array([[-0.99998569, -0.86334312,  1.        ,  0.99911886,  0.99962145],
       [ 0.3538464 ,  0.68475449,  0.39988694,  0.07795403,  0.6127547 ],
       [-0.999672  ,  0.78515959,  0.99999964,  0.99463904,  0.99695826],
       [-0.96491492,  0.90873224,  0.99751562,  0.96221972,  0.96019566]], dtype=float32)

In [6]:
#show_graph(tf.get_default_graph())

Using dynamic_rnn()

The dynamic_rnn() function uses a while_loop() operation to run over the cell the appropriate number of times, and you can set swap_memory = True if you want it to swap the GPU’s memory to the CPU’s memory during backpropagation to avoid OOM errors. Conveniently, it also accepts a single tensor for all inputs at every time step (shape [None, n_steps, n_inputs]) and it outputs a single tensor for all outputs at every time step (shape [None, n_steps, n_neurons]); there is no need to stack, unstack, or transpose.


In [7]:
tf.reset_default_graph()

n_steps = 2
n_inputs = 3
n_neurons = 5

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])

basic_cell = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
outputs, states = tf.nn.dynamic_rnn(basic_cell, X, dtype=tf.float32)

init = tf.global_variables_initializer()

X_batch = np.array([
        [[0, 1, 2], [9, 8, 7]], # instance 1
        [[3, 4, 5], [0, 0, 0]], # instance 2
        [[6, 7, 8], [6, 5, 4]], # instance 3
        [[9, 0, 1], [3, 2, 1]], # instance 4
    ])

with tf.Session() as sess:
    init.run()
    print("outputs =", outputs.eval(feed_dict={X: X_batch}))


outputs = [[[-0.65551144  0.73566824 -0.85184079  0.1408051  -0.67453671]
  [ 0.99524468  0.91472399 -1.          0.99938375  0.91451889]]

 [[-0.37057179  0.9426769  -0.99993008  0.88595641 -0.4519437 ]
  [ 0.61579293  0.03843237  0.7278012   0.42999566  0.21875195]]

 [[ 0.00673045  0.98863083 -1.00000012  0.99033666 -0.15402398]
  [ 0.95808011  0.56590128 -0.99997652  0.9931702   0.9524017 ]]

 [[ 0.92239261 -0.67527854 -0.99997818  0.99998701  0.99996889]
  [ 0.14767335 -0.83654004 -0.99008358  0.92541546  0.95951402]]]

In [8]:
#show_graph(tf.get_default_graph())

Packing sequences


In [10]:
tf.reset_default_graph()

n_steps = 2
n_inputs = 3
n_neurons = 5

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
seq_length = tf.placeholder(tf.int32, [None])   ### <----------------------------------------

basic_cell = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
outputs, states = tf.nn.dynamic_rnn(basic_cell, X, sequence_length=seq_length, dtype=tf.float32)

init = tf.global_variables_initializer()

X_batch = np.array([
        # step 0     step 1
        [[0, 1, 2], [9, 8, 7]], # instance 1
        [[3, 4, 5], [0, 0, 0]], # instance 2 (padded with zero vectors)
        [[6, 7, 8], [6, 5, 4]], # instance 3
        [[9, 0, 1], [3, 2, 1]], # instance 4
    ])


seq_length_batch = np.array([2, 1, 2, 2])  ### <------------------------

with tf.Session() as sess:
    init.run()
    outputs_val, states_val = sess.run(
        [outputs, states], feed_dict={X: X_batch, seq_length: seq_length_batch})
    
print(outputs_val)


[[[-0.79451835 -0.44819504 -0.59462738 -0.24471386  0.91153216]
  [-0.9990105   0.97848833 -0.99174058  0.93728083  0.99999642]]

 [[-0.98845619 -0.0552582  -0.95545596  0.09708088  0.99973947]
  [ 0.          0.          0.          0.          0.        ]]

 [[-0.99941158  0.35557428 -0.9959259   0.4174149   0.99999923]
  [-0.98805785  0.81141108 -0.9052664   0.9342975   0.99974763]]

 [[ 0.77583563  0.58787191 -0.86811101  0.99349481  0.31281272]
  [-0.45480198 -0.14292759 -0.28857675  0.82472932  0.97620714]]]

In [11]:
print(states_val)


[[-0.9990105   0.97848833 -0.99174058  0.93728083  0.99999642]
 [-0.98845619 -0.0552582  -0.95545596  0.09708088  0.99973947]
 [-0.98805785  0.81141108 -0.9052664   0.9342975   0.99974763]
 [-0.45480198 -0.14292759 -0.28857675  0.82472932  0.97620714]]

Training a sequence classifier

We will treat each image as a sequence of 28 rows of 28 pixels each (since each MNIST image is 28 × 28 pixels). We will use cells of 150 recurrent neurons, plus a fully connected layer containing 10 neurons (one per class) connected to the output of the last time step, followed by a softmax layer.


In [12]:
tf.reset_default_graph()

from tensorflow.contrib.layers import fully_connected

n_steps = 28
n_inputs = 28
n_neurons = 150
n_outputs = 10

learning_rate = 0.001

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.int32, [None])

with tf.variable_scope("rnn", initializer=tf.contrib.layers.variance_scaling_initializer()):
    basic_cell = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons, activation=tf.nn.relu)
    outputs, states = tf.nn.dynamic_rnn(basic_cell, X, dtype=tf.float32)

logits = fully_connected(states, n_outputs, activation_fn=None)
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()

from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/")
X_test = mnist.test.images.reshape((-1, n_steps, n_inputs))
y_test = mnist.test.labels

n_epochs = 100
batch_size = 150

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for iteration in range(mnist.train.num_examples // batch_size):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            X_batch = X_batch.reshape((-1, n_steps, n_inputs))
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
        acc_test = accuracy.eval(feed_dict={X: X_test, y: y_test})
        print(epoch, "Train accuracy:", acc_train, "Test accuracy:", acc_test)


Successfully downloaded train-images-idx3-ubyte.gz 9912422 bytes.
Extracting /tmp/data/train-images-idx3-ubyte.gz
Successfully downloaded train-labels-idx1-ubyte.gz 28881 bytes.
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Successfully downloaded t10k-images-idx3-ubyte.gz 1648877 bytes.
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Successfully downloaded t10k-labels-idx1-ubyte.gz 4542 bytes.
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz
0 Train accuracy: 0.92 Test accuracy: 0.9297
1 Train accuracy: 0.953333 Test accuracy: 0.9524
2 Train accuracy: 0.96 Test accuracy: 0.9613
3 Train accuracy: 0.98 Test accuracy: 0.9688
4 Train accuracy: 0.946667 Test accuracy: 0.9607
5 Train accuracy: 0.96 Test accuracy: 0.9676
6 Train accuracy: 0.98 Test accuracy: 0.9729
7 Train accuracy: 0.973333 Test accuracy: 0.9747
8 Train accuracy: 0.973333 Test accuracy: 0.9711
9 Train accuracy: 0.966667 Test accuracy: 0.9746
10 Train accuracy: 0.986667 Test accuracy: 0.974
11 Train accuracy: 0.993333 Test accuracy: 0.9731
12 Train accuracy: 0.973333 Test accuracy: 0.972
13 Train accuracy: 0.993333 Test accuracy: 0.968
14 Train accuracy: 0.986667 Test accuracy: 0.9779
15 Train accuracy: 1.0 Test accuracy: 0.9778
16 Train accuracy: 0.993333 Test accuracy: 0.9828
17 Train accuracy: 0.993333 Test accuracy: 0.9757
18 Train accuracy: 1.0 Test accuracy: 0.9759
19 Train accuracy: 0.986667 Test accuracy: 0.979
20 Train accuracy: 0.973333 Test accuracy: 0.9747
21 Train accuracy: 0.986667 Test accuracy: 0.9786
22 Train accuracy: 0.98 Test accuracy: 0.9775
23 Train accuracy: 0.986667 Test accuracy: 0.9792
24 Train accuracy: 0.993333 Test accuracy: 0.9798
25 Train accuracy: 0.993333 Test accuracy: 0.9741
26 Train accuracy: 0.98 Test accuracy: 0.9793
27 Train accuracy: 1.0 Test accuracy: 0.9755
28 Train accuracy: 0.986667 Test accuracy: 0.9823
29 Train accuracy: 0.993333 Test accuracy: 0.9802
30 Train accuracy: 0.986667 Test accuracy: 0.9804
31 Train accuracy: 0.986667 Test accuracy: 0.9781
32 Train accuracy: 0.986667 Test accuracy: 0.9757
33 Train accuracy: 0.993333 Test accuracy: 0.9795
34 Train accuracy: 0.993333 Test accuracy: 0.9761
35 Train accuracy: 0.993333 Test accuracy: 0.9778
36 Train accuracy: 1.0 Test accuracy: 0.9843
37 Train accuracy: 0.986667 Test accuracy: 0.9827
38 Train accuracy: 0.993333 Test accuracy: 0.9758
39 Train accuracy: 1.0 Test accuracy: 0.9811
40 Train accuracy: 0.98 Test accuracy: 0.9806
41 Train accuracy: 0.986667 Test accuracy: 0.9813
42 Train accuracy: 0.986667 Test accuracy: 0.9828
43 Train accuracy: 0.993333 Test accuracy: 0.978
44 Train accuracy: 1.0 Test accuracy: 0.9844
45 Train accuracy: 0.993333 Test accuracy: 0.9823
46 Train accuracy: 1.0 Test accuracy: 0.9828
47 Train accuracy: 1.0 Test accuracy: 0.9836
48 Train accuracy: 0.986667 Test accuracy: 0.9814
49 Train accuracy: 0.993333 Test accuracy: 0.981
50 Train accuracy: 1.0 Test accuracy: 0.9831
51 Train accuracy: 0.986667 Test accuracy: 0.9816
52 Train accuracy: 0.993333 Test accuracy: 0.9839
53 Train accuracy: 0.993333 Test accuracy: 0.9824
54 Train accuracy: 0.993333 Test accuracy: 0.981
55 Train accuracy: 0.993333 Test accuracy: 0.9827
56 Train accuracy: 1.0 Test accuracy: 0.9818
57 Train accuracy: 0.993333 Test accuracy: 0.9791
58 Train accuracy: 0.986667 Test accuracy: 0.9825
59 Train accuracy: 0.993333 Test accuracy: 0.9812
60 Train accuracy: 0.993333 Test accuracy: 0.9787
61 Train accuracy: 1.0 Test accuracy: 0.9824
62 Train accuracy: 0.993333 Test accuracy: 0.9764
63 Train accuracy: 0.993333 Test accuracy: 0.9836
64 Train accuracy: 0.993333 Test accuracy: 0.9823
65 Train accuracy: 1.0 Test accuracy: 0.9812
66 Train accuracy: 1.0 Test accuracy: 0.9825
67 Train accuracy: 0.98 Test accuracy: 0.9793
68 Train accuracy: 0.986667 Test accuracy: 0.9814
69 Train accuracy: 0.993333 Test accuracy: 0.9801
70 Train accuracy: 0.993333 Test accuracy: 0.9804
71 Train accuracy: 0.993333 Test accuracy: 0.9834
72 Train accuracy: 1.0 Test accuracy: 0.9823
73 Train accuracy: 1.0 Test accuracy: 0.983
74 Train accuracy: 1.0 Test accuracy: 0.9833
75 Train accuracy: 0.993333 Test accuracy: 0.9806
76 Train accuracy: 0.966667 Test accuracy: 0.9776
77 Train accuracy: 0.986667 Test accuracy: 0.9775
78 Train accuracy: 0.993333 Test accuracy: 0.9851
79 Train accuracy: 0.986667 Test accuracy: 0.9831
80 Train accuracy: 0.993333 Test accuracy: 0.9837
81 Train accuracy: 1.0 Test accuracy: 0.9818
82 Train accuracy: 1.0 Test accuracy: 0.982
83 Train accuracy: 1.0 Test accuracy: 0.9823
84 Train accuracy: 0.986667 Test accuracy: 0.9834
85 Train accuracy: 1.0 Test accuracy: 0.9825
86 Train accuracy: 1.0 Test accuracy: 0.9823
87 Train accuracy: 0.986667 Test accuracy: 0.9793
88 Train accuracy: 0.993333 Test accuracy: 0.9824
89 Train accuracy: 1.0 Test accuracy: 0.9797
90 Train accuracy: 1.0 Test accuracy: 0.9799
91 Train accuracy: 0.993333 Test accuracy: 0.9806
92 Train accuracy: 1.0 Test accuracy: 0.9847
93 Train accuracy: 0.993333 Test accuracy: 0.9852
94 Train accuracy: 1.0 Test accuracy: 0.9836
95 Train accuracy: 1.0 Test accuracy: 0.9835
96 Train accuracy: 1.0 Test accuracy: 0.9831
97 Train accuracy: 1.0 Test accuracy: 0.9829
98 Train accuracy: 0.993333 Test accuracy: 0.9803
99 Train accuracy: 1.0 Test accuracy: 0.9813

Training the same sequence classifier with Keras


In [30]:
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import SimpleRNN
from keras import initializers
from keras.optimizers import RMSprop

from keras.models import Model
from keras.layers import Input, Dense

batch_size = 150
num_classes = 10
epochs = 100
hidden_units = 150

learning_rate = 0.001

# the data, shuffled and split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()

x_train = x_train.reshape(x_train.shape[0], 28, 28)
x_test = x_test.reshape(x_test.shape[0], 28, 28)
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

print('Evaluate IRNN...')
a = Input(shape=x_train.shape[1:])
b = SimpleRNN(hidden_units,
                    kernel_initializer=initializers.RandomNormal(stddev=0.001),
                    recurrent_initializer=initializers.Identity(),
                    activation='relu')(a)
b = Dense(num_classes)(b)
b = Activation('softmax')(b)
optimizer = keras.optimizers.Adamax(lr=learning_rate)
model = Model(inputs=[a], outputs=[b])
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(x_test, y_test))

scores = model.evaluate(x_test, y_test, verbose=0)
print('IRNN test score:', scores[0])
print('IRNN test accuracy:', scores[1])


x_train shape: (60000, 28, 28)
60000 train samples
10000 test samples
Evaluate IRNN...
Train on 60000 samples, validate on 10000 samples
Epoch 1/100
60000/60000 [==============================] - 21s - loss: 0.7979 - acc: 0.7312 - val_loss: 0.3889 - val_acc: 0.8750
Epoch 2/100
60000/60000 [==============================] - 21s - loss: 0.3082 - acc: 0.9014 - val_loss: 0.2438 - val_acc: 0.9230
Epoch 3/100
60000/60000 [==============================] - 20s - loss: 0.2165 - acc: 0.9318 - val_loss: 0.2059 - val_acc: 0.9335
Epoch 4/100
60000/60000 [==============================] - 22s - loss: 0.1745 - acc: 0.9449 - val_loss: 0.1507 - val_acc: 0.9531
Epoch 5/100
60000/60000 [==============================] - 20s - loss: 0.1429 - acc: 0.9544 - val_loss: 0.1242 - val_acc: 0.9608
Epoch 6/100
60000/60000 [==============================] - 20s - loss: 0.1263 - acc: 0.9608 - val_loss: 0.1186 - val_acc: 0.9642
Epoch 7/100
60000/60000 [==============================] - 21s - loss: 0.1089 - acc: 0.9658 - val_loss: 0.1103 - val_acc: 0.9654
Epoch 8/100
60000/60000 [==============================] - 20s - loss: 0.0993 - acc: 0.9686 - val_loss: 0.1116 - val_acc: 0.9652
Epoch 9/100
60000/60000 [==============================] - 21s - loss: 0.0955 - acc: 0.9699 - val_loss: 0.1118 - val_acc: 0.9648
Epoch 10/100
60000/60000 [==============================] - 21s - loss: 0.0896 - acc: 0.9726 - val_loss: 0.1025 - val_acc: 0.9686
Epoch 11/100
60000/60000 [==============================] - 23s - loss: 0.0772 - acc: 0.9762 - val_loss: 0.0877 - val_acc: 0.9728
Epoch 12/100
60000/60000 [==============================] - 21s - loss: 0.0732 - acc: 0.9770 - val_loss: 0.0735 - val_acc: 0.9749
Epoch 13/100
60000/60000 [==============================] - 21s - loss: 0.0703 - acc: 0.9778 - val_loss: 0.0839 - val_acc: 0.9715
Epoch 14/100
60000/60000 [==============================] - 21s - loss: 0.0646 - acc: 0.9795 - val_loss: 0.0946 - val_acc: 0.9717
Epoch 15/100
60000/60000 [==============================] - 20s - loss: 0.0571 - acc: 0.9821 - val_loss: 0.0799 - val_acc: 0.9765
Epoch 16/100
60000/60000 [==============================] - 21s - loss: 0.0558 - acc: 0.9824 - val_loss: 0.0701 - val_acc: 0.9782
Epoch 17/100
60000/60000 [==============================] - 23s - loss: 0.0508 - acc: 0.9839 - val_loss: 0.1032 - val_acc: 0.9692
Epoch 18/100
60000/60000 [==============================] - 23s - loss: 0.0484 - acc: 0.9847 - val_loss: 0.0682 - val_acc: 0.9790
Epoch 19/100
60000/60000 [==============================] - 22s - loss: 0.0463 - acc: 0.9852 - val_loss: 0.0682 - val_acc: 0.9793
Epoch 20/100
60000/60000 [==============================] - 21s - loss: 0.0424 - acc: 0.9869 - val_loss: 0.0745 - val_acc: 0.9781
Epoch 21/100
60000/60000 [==============================] - 22s - loss: 0.0415 - acc: 0.9870 - val_loss: 0.0682 - val_acc: 0.9781.
Epoch 22/100
60000/60000 [==============================] - 21s - loss: 0.0371 - acc: 0.9879 - val_loss: 0.0692 - val_acc: 0.9794
Epoch 23/100
60000/60000 [==============================] - 20s - loss: 0.0389 - acc: 0.9877 - val_loss: 0.0636 - val_acc: 0.9827
Epoch 24/100
60000/60000 [==============================] - 21s - loss: 0.0336 - acc: 0.9890 - val_loss: 0.0645 - val_acc: 0.9817
Epoch 25/100
60000/60000 [==============================] - 21s - loss: 0.0318 - acc: 0.9895 - val_loss: 0.0570 - val_acc: 0.9812
Epoch 26/100
60000/60000 [==============================] - 21s - loss: 0.0314 - acc: 0.9900 - val_loss: 0.0588 - val_acc: 0.9825
Epoch 27/100
60000/60000 [==============================] - 21s - loss: 0.0267 - acc: 0.9915 - val_loss: 0.0638 - val_acc: 0.9803
Epoch 28/100
60000/60000 [==============================] - 21s - loss: 0.0273 - acc: 0.9909 - val_loss: 0.0639 - val_acc: 0.9812
Epoch 29/100
60000/60000 [==============================] - 21s - loss: 0.0278 - acc: 0.9912 - val_loss: 0.0696 - val_acc: 0.9805
Epoch 30/100
60000/60000 [==============================] - 22s - loss: 0.0243 - acc: 0.9919 - val_loss: 0.0725 - val_acc: 0.9809
Epoch 31/100
60000/60000 [==============================] - 20s - loss: 0.0248 - acc: 0.9922 - val_loss: 0.0653 - val_acc: 0.9828
Epoch 32/100
60000/60000 [==============================] - 20s - loss: 0.0221 - acc: 0.9926 - val_loss: 0.0621 - val_acc: 0.9833
Epoch 33/100
60000/60000 [==============================] - 21s - loss: 0.0226 - acc: 0.9928 - val_loss: 0.0694 - val_acc: 0.9817
Epoch 34/100
60000/60000 [==============================] - 21s - loss: 0.0203 - acc: 0.9931 - val_loss: 0.0657 - val_acc: 0.9818
Epoch 35/100
60000/60000 [==============================] - 22s - loss: 0.0192 - acc: 0.9937 - val_loss: 0.0590 - val_acc: 0.9836
Epoch 36/100
60000/60000 [==============================] - 21s - loss: 0.0185 - acc: 0.9937 - val_loss: 0.0611 - val_acc: 0.9825
Epoch 37/100
60000/60000 [==============================] - 21s - loss: 0.0168 - acc: 0.9949 - val_loss: 0.0611 - val_acc: 0.9848
Epoch 38/100
60000/60000 [==============================] - 20s - loss: 0.0171 - acc: 0.9945 - val_loss: 0.0712 - val_acc: 0.9834
Epoch 39/100
60000/60000 [==============================] - 21s - loss: 0.0155 - acc: 0.9951 - val_loss: 0.0622 - val_acc: 0.9831
Epoch 40/100
60000/60000 [==============================] - 21s - loss: 0.0146 - acc: 0.9954 - val_loss: 0.0668 - val_acc: 0.9822
Epoch 41/100
60000/60000 [==============================] - 22s - loss: 0.0147 - acc: 0.9953 - val_loss: 0.0626 - val_acc: 0.9829
Epoch 42/100
60000/60000 [==============================] - 22s - loss: 0.0131 - acc: 0.9957 - val_loss: 0.0611 - val_acc: 0.9855
Epoch 43/100
60000/60000 [==============================] - 21s - loss: 0.0150 - acc: 0.9952 - val_loss: 0.0666 - val_acc: 0.9831
Epoch 44/100
60000/60000 [==============================] - 21s - loss: 0.0118 - acc: 0.9964 - val_loss: 0.0604 - val_acc: 0.9849
Epoch 45/100
60000/60000 [==============================] - 21s - loss: 0.0117 - acc: 0.9963 - val_loss: 0.0803 - val_acc: 0.9801
Epoch 46/100
60000/60000 [==============================] - 22s - loss: 0.0130 - acc: 0.9959 - val_loss: 0.0763 - val_acc: 0.9813
Epoch 47/100
60000/60000 [==============================] - 23s - loss: 0.0118 - acc: 0.9961 - val_loss: 0.0677 - val_acc: 0.9835
Epoch 48/100
60000/60000 [==============================] - 22s - loss: 0.0106 - acc: 0.9967 - val_loss: 0.0692 - val_acc: 0.9847
Epoch 49/100
60000/60000 [==============================] - 22s - loss: 0.0112 - acc: 0.9964 - val_loss: 0.0691 - val_acc: 0.9816
Epoch 50/100
60000/60000 [==============================] - 21s - loss: 0.0123 - acc: 0.9960 - val_loss: 0.0637 - val_acc: 0.9850
Epoch 51/100
60000/60000 [==============================] - 21s - loss: 0.0094 - acc: 0.9970 - val_loss: 0.0666 - val_acc: 0.9845
Epoch 52/100
60000/60000 [==============================] - 21s - loss: 0.0101 - acc: 0.9969 - val_loss: 0.0767 - val_acc: 0.9844
Epoch 53/100
60000/60000 [==============================] - 21s - loss: 0.0078 - acc: 0.9974 - val_loss: 0.0806 - val_acc: 0.9815
Epoch 54/100
60000/60000 [==============================] - 21s - loss: 0.0106 - acc: 0.9965 - val_loss: 0.0745 - val_acc: 0.9845
Epoch 55/100
60000/60000 [==============================] - 21s - loss: 0.0072 - acc: 0.9979 - val_loss: 0.0770 - val_acc: 0.9842
Epoch 56/100
60000/60000 [==============================] - 21s - loss: 0.0084 - acc: 0.9973 - val_loss: 0.0720 - val_acc: 0.9834
Epoch 57/100
60000/60000 [==============================] - 21s - loss: 0.0071 - acc: 0.9977 - val_loss: 0.0708 - val_acc: 0.9837
Epoch 58/100
60000/60000 [==============================] - 21s - loss: 0.0082 - acc: 0.9972 - val_loss: 0.0738 - val_acc: 0.9845
Epoch 59/100
60000/60000 [==============================] - 21s - loss: 0.0088 - acc: 0.9969 - val_loss: 0.0817 - val_acc: 0.9817
Epoch 60/100
60000/60000 [==============================] - 23s - loss: 0.0082 - acc: 0.9976 - val_loss: 0.0700 - val_acc: 0.9839
Epoch 61/100
60000/60000 [==============================] - 22s - loss: 0.0059 - acc: 0.9983 - val_loss: 0.0725 - val_acc: 0.9836
Epoch 62/100
60000/60000 [==============================] - 21s - loss: 0.0077 - acc: 0.9975 - val_loss: 0.0728 - val_acc: 0.9841
Epoch 63/100
60000/60000 [==============================] - 20s - loss: 0.0062 - acc: 0.9980 - val_loss: 0.0724 - val_acc: 0.9831
Epoch 64/100
60000/60000 [==============================] - 20s - loss: 0.0059 - acc: 0.9980 - val_loss: 0.0812 - val_acc: 0.9839
Epoch 65/100
60000/60000 [==============================] - 21s - loss: 0.0070 - acc: 0.9977 - val_loss: 0.0799 - val_acc: 0.9837
Epoch 66/100
60000/60000 [==============================] - 20s - loss: 0.0064 - acc: 0.9979 - val_loss: 0.1002 - val_acc: 0.9821
Epoch 67/100
60000/60000 [==============================] - 21s - loss: 0.0072 - acc: 0.9977 - val_loss: 0.0790 - val_acc: 0.9841.0073 
Epoch 68/100
60000/60000 [==============================] - 22s - loss: 0.0069 - acc: 0.9976 - val_loss: 0.0788 - val_acc: 0.9839
Epoch 69/100
60000/60000 [==============================] - 21s - loss: 0.0061 - acc: 0.9982 - val_loss: 0.0979 - val_acc: 0.9791
Epoch 70/100
60000/60000 [==============================] - 21s - loss: 0.0067 - acc: 0.9979 - val_loss: 0.0839 - val_acc: 0.9839
Epoch 71/100
60000/60000 [==============================] - 22s - loss: 0.0064 - acc: 0.9982 - val_loss: 0.0744 - val_acc: 0.9857
Epoch 72/100
60000/60000 [==============================] - 23s - loss: 0.0048 - acc: 0.9983 - val_loss: 0.0688 - val_acc: 0.9858
Epoch 73/100
60000/60000 [==============================] - 20s - loss: 0.0058 - acc: 0.9982 - val_loss: 0.0898 - val_acc: 0.9836
Epoch 74/100
60000/60000 [==============================] - 21s - loss: 0.0057 - acc: 0.9982 - val_loss: 0.0897 - val_acc: 0.9829
Epoch 75/100
60000/60000 [==============================] - 20s - loss: 0.0057 - acc: 0.9981 - val_loss: 0.0704 - val_acc: 0.9855
Epoch 76/100
60000/60000 [==============================] - 21s - loss: 0.0036 - acc: 0.9990 - val_loss: 0.0792 - val_acc: 0.9842
Epoch 77/100
60000/60000 [==============================] - 21s - loss: 0.0043 - acc: 0.9988 - val_loss: 0.0889 - val_acc: 0.9837
Epoch 78/100
60000/60000 [==============================] - 21s - loss: 0.0060 - acc: 0.9981 - val_loss: 0.0764 - val_acc: 0.9846
Epoch 79/100
60000/60000 [==============================] - 21s - loss: 0.0049 - acc: 0.9986 - val_loss: 0.0842 - val_acc: 0.9841
Epoch 80/100
60000/60000 [==============================] - 23s - loss: 0.0040 - acc: 0.9990 - val_loss: 0.0765 - val_acc: 0.9861
Epoch 81/100
60000/60000 [==============================] - 19s - loss: 0.0021 - acc: 0.9995 - val_loss: 0.0842 - val_acc: 0.9837
Epoch 82/100
60000/60000 [==============================] - 21s - loss: 0.0059 - acc: 0.9981 - val_loss: 0.0840 - val_acc: 0.9853.
Epoch 83/100
60000/60000 [==============================] - 21s - loss: 0.0059 - acc: 0.9979 - val_loss: 0.0853 - val_acc: 0.9837
Epoch 84/100
60000/60000 [==============================] - 21s - loss: 0.0062 - acc: 0.9983 - val_loss: 0.0775 - val_acc: 0.9853
Epoch 85/100
60000/60000 [==============================] - 20s - loss: 0.0024 - acc: 0.9994 - val_loss: 0.0793 - val_acc: 0.9852
Epoch 86/100
60000/60000 [==============================] - 19s - loss: 0.0074 - acc: 0.9978 - val_loss: 0.0912 - val_acc: 0.9827
Epoch 87/100
60000/60000 [==============================] - 21s - loss: 0.0028 - acc: 0.9993 - val_loss: 0.0795 - val_acc: 0.9849
Epoch 88/100
60000/60000 [==============================] - 22s - loss: 0.0011 - acc: 0.9998 - val_loss: 0.0862 - val_acc: 0.9837
Epoch 89/100
60000/60000 [==============================] - 22s - loss: 0.0036 - acc: 0.9991 - val_loss: 0.0833 - val_acc: 0.9860
Epoch 90/100
60000/60000 [==============================] - 21s - loss: 0.0050 - acc: 0.9985 - val_loss: 0.0937 - val_acc: 0.9843
Epoch 91/100
60000/60000 [==============================] - 23s - loss: 0.0032 - acc: 0.9991 - val_loss: 0.0879 - val_acc: 0.9843
Epoch 92/100
60000/60000 [==============================] - 21s - loss: 0.0045 - acc: 0.9984 - val_loss: 0.0903 - val_acc: 0.9816
Epoch 93/100
60000/60000 [==============================] - 21s - loss: 0.0039 - acc: 0.9988 - val_loss: 0.0958 - val_acc: 0.9830
Epoch 94/100
60000/60000 [==============================] - 21s - loss: 0.0050 - acc: 0.9983 - val_loss: 0.0837 - val_acc: 0.9851
Epoch 95/100
60000/60000 [==============================] - 21s - loss: 0.0027 - acc: 0.9993 - val_loss: 0.0925 - val_acc: 0.9848
Epoch 96/100
60000/60000 [==============================] - 21s - loss: 0.0040 - acc: 0.9988 - val_loss: 0.0879 - val_acc: 0.9844
Epoch 97/100
60000/60000 [==============================] - 21s - loss: 0.0023 - acc: 0.9994 - val_loss: 0.0995 - val_acc: 0.9838
Epoch 98/100
60000/60000 [==============================] - 23s - loss: 0.0047 - acc: 0.9986 - val_loss: 0.0882 - val_acc: 0.9830
Epoch 99/100
60000/60000 [==============================] - 22s - loss: 0.0035 - acc: 0.9991 - val_loss: 0.0917 - val_acc: 0.9838
Epoch 100/100
60000/60000 [==============================] - 23s - loss: 0.0041 - acc: 0.9989 - val_loss: 0.0874 - val_acc: 0.9844
IRNN test score: 0.0874122375119
IRNN test accuracy: 0.9844

Multi-layer RNN

It is quite common to stack multiple layers of cells. This gives you a deep RNN. To implement a deep RNN in TensorFlow, you can create several cells and stack them into a MultiRNNCell.


In [32]:
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/")
X_test = mnist.test.images.reshape((-1, n_steps, n_inputs))
y_test = mnist.test.labels

tf.reset_default_graph()

from tensorflow.contrib.layers import fully_connected

n_steps = 28
n_inputs = 28
n_neurons1 = 150
n_neurons2 = 100
n_outputs = 10

learning_rate = 0.001

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.int32, [None])

hidden1 = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons1, activation=tf.nn.relu)
hidden2 = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons2, activation=tf.nn.relu)
multi_layer_cell = tf.contrib.rnn.MultiRNNCell([hidden1, hidden2])
outputs, states_tuple = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32)
states = tf.concat(axis=1, values=states_tuple)
logits = fully_connected(states, n_outputs, activation_fn=None)
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()

n_epochs = 100
batch_size = 150

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for iteration in range(mnist.train.num_examples // batch_size):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            X_batch = X_batch.reshape((-1, n_steps, n_inputs))
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
        acc_test = accuracy.eval(feed_dict={X: X_test, y: y_test})
        print(epoch, "Train accuracy:", acc_train, "Test accuracy:", acc_test)


Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz
0 Train accuracy: 0.926667 Test accuracy: 0.9232
1 Train accuracy: 0.973333 Test accuracy: 0.9529
2 Train accuracy: 0.966667 Test accuracy: 0.9612
3 Train accuracy: 0.98 Test accuracy: 0.9734
4 Train accuracy: 0.973333 Test accuracy: 0.968
5 Train accuracy: 0.973333 Test accuracy: 0.9664
6 Train accuracy: 0.98 Test accuracy: 0.9789
7 Train accuracy: 0.98 Test accuracy: 0.9729
8 Train accuracy: 0.993333 Test accuracy: 0.9771
9 Train accuracy: 0.96 Test accuracy: 0.9758
10 Train accuracy: 0.993333 Test accuracy: 0.9778
11 Train accuracy: 0.96 Test accuracy: 0.9824
12 Train accuracy: 1.0 Test accuracy: 0.9782
13 Train accuracy: 0.966667 Test accuracy: 0.9815
14 Train accuracy: 0.98 Test accuracy: 0.9819
15 Train accuracy: 1.0 Test accuracy: 0.984
16 Train accuracy: 0.986667 Test accuracy: 0.9763
17 Train accuracy: 0.993333 Test accuracy: 0.9841
18 Train accuracy: 0.98 Test accuracy: 0.9807
19 Train accuracy: 1.0 Test accuracy: 0.9837
20 Train accuracy: 0.986667 Test accuracy: 0.9822
21 Train accuracy: 0.98 Test accuracy: 0.9819
22 Train accuracy: 0.98 Test accuracy: 0.9832
23 Train accuracy: 0.98 Test accuracy: 0.9799
24 Train accuracy: 1.0 Test accuracy: 0.983
25 Train accuracy: 0.986667 Test accuracy: 0.9818
26 Train accuracy: 0.986667 Test accuracy: 0.9831
27 Train accuracy: 0.993333 Test accuracy: 0.9842
28 Train accuracy: 1.0 Test accuracy: 0.9805
29 Train accuracy: 1.0 Test accuracy: 0.9833
30 Train accuracy: 0.993333 Test accuracy: 0.9846
31 Train accuracy: 1.0 Test accuracy: 0.9841
32 Train accuracy: 0.993333 Test accuracy: 0.9841
33 Train accuracy: 1.0 Test accuracy: 0.9866
34 Train accuracy: 0.986667 Test accuracy: 0.982
35 Train accuracy: 1.0 Test accuracy: 0.9807
36 Train accuracy: 0.993333 Test accuracy: 0.9843
37 Train accuracy: 0.993333 Test accuracy: 0.9866
38 Train accuracy: 0.993333 Test accuracy: 0.9828
39 Train accuracy: 0.986667 Test accuracy: 0.9753
40 Train accuracy: 0.973333 Test accuracy: 0.9845
41 Train accuracy: 0.986667 Test accuracy: 0.9856
42 Train accuracy: 1.0 Test accuracy: 0.9841
43 Train accuracy: 0.98 Test accuracy: 0.9792
44 Train accuracy: 0.986667 Test accuracy: 0.9875
45 Train accuracy: 1.0 Test accuracy: 0.983
46 Train accuracy: 0.973333 Test accuracy: 0.9842
47 Train accuracy: 1.0 Test accuracy: 0.9853
48 Train accuracy: 0.986667 Test accuracy: 0.9816
49 Train accuracy: 0.993333 Test accuracy: 0.9826
50 Train accuracy: 1.0 Test accuracy: 0.9869
51 Train accuracy: 1.0 Test accuracy: 0.9853
52 Train accuracy: 0.993333 Test accuracy: 0.9787
53 Train accuracy: 1.0 Test accuracy: 0.9839
54 Train accuracy: 1.0 Test accuracy: 0.9866
55 Train accuracy: 0.993333 Test accuracy: 0.9856
56 Train accuracy: 1.0 Test accuracy: 0.9848
57 Train accuracy: 1.0 Test accuracy: 0.9854
58 Train accuracy: 1.0 Test accuracy: 0.9859
59 Train accuracy: 1.0 Test accuracy: 0.9873
60 Train accuracy: 1.0 Test accuracy: 0.9868
61 Train accuracy: 1.0 Test accuracy: 0.9862
62 Train accuracy: 1.0 Test accuracy: 0.9842
63 Train accuracy: 1.0 Test accuracy: 0.9854
64 Train accuracy: 0.993333 Test accuracy: 0.9842
65 Train accuracy: 0.986667 Test accuracy: 0.9863
66 Train accuracy: 1.0 Test accuracy: 0.9867
67 Train accuracy: 1.0 Test accuracy: 0.9835
68 Train accuracy: 0.993333 Test accuracy: 0.9855
69 Train accuracy: 1.0 Test accuracy: 0.9809
70 Train accuracy: 1.0 Test accuracy: 0.9868
71 Train accuracy: 0.993333 Test accuracy: 0.9853
72 Train accuracy: 0.993333 Test accuracy: 0.983
73 Train accuracy: 0.993333 Test accuracy: 0.9785
74 Train accuracy: 1.0 Test accuracy: 0.9849
75 Train accuracy: 1.0 Test accuracy: 0.9854
76 Train accuracy: 1.0 Test accuracy: 0.9843
77 Train accuracy: 0.986667 Test accuracy: 0.9837
78 Train accuracy: 1.0 Test accuracy: 0.9846
79 Train accuracy: 1.0 Test accuracy: 0.9881
80 Train accuracy: 1.0 Test accuracy: 0.9839
81 Train accuracy: 1.0 Test accuracy: 0.9864
82 Train accuracy: 1.0 Test accuracy: 0.9831
83 Train accuracy: 1.0 Test accuracy: 0.9851
84 Train accuracy: 0.986667 Test accuracy: 0.9838
85 Train accuracy: 1.0 Test accuracy: 0.9863
86 Train accuracy: 1.0 Test accuracy: 0.9864
87 Train accuracy: 1.0 Test accuracy: 0.9862
88 Train accuracy: 1.0 Test accuracy: 0.984
89 Train accuracy: 1.0 Test accuracy: 0.988
90 Train accuracy: 1.0 Test accuracy: 0.9865
91 Train accuracy: 0.993333 Test accuracy: 0.9853
92 Train accuracy: 1.0 Test accuracy: 0.9855
93 Train accuracy: 1.0 Test accuracy: 0.9869
94 Train accuracy: 1.0 Test accuracy: 0.9823
95 Train accuracy: 1.0 Test accuracy: 0.9853
96 Train accuracy: 0.993333 Test accuracy: 0.9866
97 Train accuracy: 1.0 Test accuracy: 0.9847
98 Train accuracy: 1.0 Test accuracy: 0.9872
99 Train accuracy: 1.0 Test accuracy: 0.9866

Multi-layer RNN with Keras

When stacking RNNs with Keras remember to set return_sequences=True on hidden layers.


In [39]:
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import SimpleRNN
from keras import initializers
from keras.optimizers import RMSprop

from keras.models import Model
from keras.layers import Input, Dense

keras.backend.clear_session()

batch_size = 150
num_classes = 10
epochs = 50 # instead of 100 (too much time)
hidden_units_1 = 150
hidden_units_2 = 100

learning_rate = 0.001

# the data, shuffled and split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()

x_train = x_train.reshape(x_train.shape[0], 28, 28)
x_test = x_test.reshape(x_test.shape[0], 28, 28)
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

print('Evaluate IRNN...')
a = Input(shape=x_train.shape[1:])
b = SimpleRNN(hidden_units_1,
                    kernel_initializer=initializers.RandomNormal(stddev=0.001),
                    recurrent_initializer=initializers.Identity(),
                    activation='relu' , return_sequences=True)(a)
b = SimpleRNN(hidden_units_2,
                    kernel_initializer=initializers.RandomNormal(stddev=0.001),
                    recurrent_initializer=initializers.Identity(),
                    activation='relu')(b)
b = Dense(num_classes)(b)
b = Activation('softmax')(b)
optimizer = keras.optimizers.Adamax(lr=learning_rate)
model = Model(inputs=[a], outputs=[b])
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(x_test, y_test))

scores = model.evaluate(x_test, y_test, verbose=0)
print('IRNN test score:', scores[0])
print('IRNN test accuracy:', scores[1])


x_train shape: (60000, 28, 28)
60000 train samples
10000 test samples
Evaluate IRNN...
Train on 60000 samples, validate on 10000 samples
Epoch 1/50
60000/60000 [==============================] - 44s - loss: 0.7986 - acc: 0.7263 - val_loss: 0.3711 - val_acc: 0.8834
Epoch 2/50
60000/60000 [==============================] - 46s - loss: 0.2963 - acc: 0.9078 - val_loss: 0.2352 - val_acc: 0.9262
Epoch 3/50
60000/60000 [==============================] - 44s - loss: 0.1983 - acc: 0.9386 - val_loss: 0.2365 - val_acc: 0.9241
Epoch 4/50
60000/60000 [==============================] - 43s - loss: 0.1630 - acc: 0.9498 - val_loss: 0.1340 - val_acc: 0.9574
Epoch 5/50
60000/60000 [==============================] - 43s - loss: 0.1308 - acc: 0.9598 - val_loss: 0.1228 - val_acc: 0.9629
Epoch 6/50
60000/60000 [==============================] - 43s - loss: 0.1177 - acc: 0.9638 - val_loss: 0.0982 - val_acc: 0.9703
Epoch 7/50
60000/60000 [==============================] - 44s - loss: 0.1038 - acc: 0.9684 - val_loss: 0.0941 - val_acc: 0.9697
Epoch 8/50
60000/60000 [==============================] - 43s - loss: 0.0924 - acc: 0.9710 - val_loss: 0.0973 - val_acc: 0.9699
Epoch 9/50
60000/60000 [==============================] - 46s - loss: 0.0815 - acc: 0.9748 - val_loss: 0.0835 - val_acc: 0.9726
Epoch 10/50
60000/60000 [==============================] - 44s - loss: 0.0755 - acc: 0.9764 - val_loss: 0.0702 - val_acc: 0.9764
Epoch 11/50
60000/60000 [==============================] - 44s - loss: 0.0693 - acc: 0.9785 - val_loss: 0.0776 - val_acc: 0.9764
Epoch 12/50
60000/60000 [==============================] - 43s - loss: 0.0634 - acc: 0.9802 - val_loss: 0.0731 - val_acc: 0.9779
Epoch 13/50
60000/60000 [==============================] - 44s - loss: 0.0584 - acc: 0.9814 - val_loss: 0.0804 - val_acc: 0.9743
Epoch 14/50
60000/60000 [==============================] - 47s - loss: 0.0517 - acc: 0.9839 - val_loss: 0.0683 - val_acc: 0.9787
Epoch 15/50
60000/60000 [==============================] - 46s - loss: 0.0510 - acc: 0.9840 - val_loss: 0.0594 - val_acc: 0.9827
Epoch 16/50
60000/60000 [==============================] - 44s - loss: 0.0478 - acc: 0.9848 - val_loss: 0.0596 - val_acc: 0.9814
Epoch 17/50
60000/60000 [==============================] - 44s - loss: 0.0451 - acc: 0.9858 - val_loss: 0.0576 - val_acc: 0.9828
Epoch 18/50
60000/60000 [==============================] - 44s - loss: 0.0419 - acc: 0.9865 - val_loss: 0.0597 - val_acc: 0.9824
Epoch 19/50
60000/60000 [==============================] - 44s - loss: 0.0393 - acc: 0.9877 - val_loss: 0.0673 - val_acc: 0.9806
Epoch 20/50
60000/60000 [==============================] - 45s - loss: 0.0364 - acc: 0.9887 - val_loss: 0.0619 - val_acc: 0.9818
Epoch 21/50
60000/60000 [==============================] - 44s - loss: 0.0359 - acc: 0.9887 - val_loss: 0.0689 - val_acc: 0.9786
Epoch 22/50
60000/60000 [==============================] - 45s - loss: 0.0331 - acc: 0.9891 - val_loss: 0.0646 - val_acc: 0.9820
Epoch 23/50
60000/60000 [==============================] - 44s - loss: 0.0315 - acc: 0.9899 - val_loss: 0.0656 - val_acc: 0.9803
Epoch 24/50
60000/60000 [==============================] - 44s - loss: 0.0287 - acc: 0.9908 - val_loss: 0.0535 - val_acc: 0.9835
Epoch 25/50
60000/60000 [==============================] - 44s - loss: 0.0265 - acc: 0.9915 - val_loss: 0.0647 - val_acc: 0.9826
Epoch 26/50
60000/60000 [==============================] - 44s - loss: 0.0274 - acc: 0.9911 - val_loss: 0.0612 - val_acc: 0.9817
Epoch 27/50
60000/60000 [==============================] - 44s - loss: 0.0249 - acc: 0.9921 - val_loss: 0.0576 - val_acc: 0.9823.
Epoch 28/50
60000/60000 [==============================] - 45s - loss: 0.0244 - acc: 0.9921 - val_loss: 0.0551 - val_acc: 0.9835
Epoch 29/50
60000/60000 [==============================] - 48s - loss: 0.0227 - acc: 0.9927 - val_loss: 0.0559 - val_acc: 0.9840
Epoch 30/50
60000/60000 [==============================] - 45s - loss: 0.0216 - acc: 0.9932 - val_loss: 0.0537 - val_acc: 0.9838
Epoch 31/50
60000/60000 [==============================] - 44s - loss: 0.0181 - acc: 0.9940 - val_loss: 0.0582 - val_acc: 0.9852
Epoch 32/50
60000/60000 [==============================] - 44s - loss: 0.0193 - acc: 0.9935 - val_loss: 0.0504 - val_acc: 0.9863
Epoch 33/50
60000/60000 [==============================] - 45s - loss: 0.0165 - acc: 0.9946 - val_loss: 0.0528 - val_acc: 0.9851
Epoch 34/50
60000/60000 [==============================] - 44s - loss: 0.0173 - acc: 0.9940 - val_loss: 0.0631 - val_acc: 0.9839
Epoch 35/50
60000/60000 [==============================] - 46s - loss: 0.0171 - acc: 0.9943 - val_loss: 0.0646 - val_acc: 0.9816
Epoch 36/50
60000/60000 [==============================] - 44s - loss: 0.0152 - acc: 0.9948 - val_loss: 0.0580 - val_acc: 0.9870
Epoch 37/50
60000/60000 [==============================] - 44s - loss: 0.0163 - acc: 0.9950 - val_loss: 0.0652 - val_acc: 0.9837 ETA: 2s - loss: 0.0154 - acc: - ETA: 2s - loss: 
Epoch 38/50
60000/60000 [==============================] - 44s - loss: 0.0143 - acc: 0.9953 - val_loss: 0.0542 - val_acc: 0.9848
Epoch 39/50
60000/60000 [==============================] - 44s - loss: 0.0142 - acc: 0.9956 - val_loss: 0.0550 - val_acc: 0.9849
Epoch 40/50
60000/60000 [==============================] - 44s - loss: 0.0127 - acc: 0.9961 - val_loss: 0.0556 - val_acc: 0.9860
Epoch 41/50
60000/60000 [==============================] - 44s - loss: 0.0126 - acc: 0.9961 - val_loss: 0.0541 - val_acc: 0.9851
Epoch 42/50
60000/60000 [==============================] - 46s - loss: 0.0110 - acc: 0.9966 - val_loss: 0.0603 - val_acc: 0.9859
Epoch 43/50
60000/60000 [==============================] - 54s - loss: 0.0104 - acc: 0.9968 - val_loss: 0.0664 - val_acc: 0.9847
Epoch 44/50
60000/60000 [==============================] - 47s - loss: 0.0107 - acc: 0.9962 - val_loss: 0.0614 - val_acc: 0.9860
Epoch 45/50
60000/60000 [==============================] - 48s - loss: 0.0112 - acc: 0.9964 - val_loss: 0.0538 - val_acc: 0.9865
Epoch 46/50
60000/60000 [==============================] - 51s - loss: 0.0083 - acc: 0.9973 - val_loss: 0.0677 - val_acc: 0.9843
Epoch 47/50
60000/60000 [==============================] - 64s - loss: 0.0102 - acc: 0.9967 - val_loss: 0.0621 - val_acc: 0.9844
Epoch 48/50
60000/60000 [==============================] - 51s - loss: 0.0081 - acc: 0.9974 - val_loss: 0.0672 - val_acc: 0.9848
Epoch 49/50
60000/60000 [==============================] - 47s - loss: 0.0070 - acc: 0.9980 - val_loss: 0.0709 - val_acc: 0.9845
Epoch 50/50
60000/60000 [==============================] - 45s - loss: 0.0081 - acc: 0.9975 - val_loss: 0.0604 - val_acc: 0.9853
IRNN test score: 0.0603713211539
IRNN test accuracy: 0.9853

Time series

Now let’s take a look at how to handle time series, such as stock prices, air temperature, brain wave patterns, and so on. In this section we will train an RNN to predict the next value in a generated time series. Each training instance is a randomly selected sequence of 20 consecutive values from the time series, and the target sequence is the same as the input sequence, except it is shifted by one time step into the future.


In [51]:
t_min, t_max = 0, 30

n_steps = 20

def time_series(t):
    return t * np.sin(t) / 3 + 2 * np.sin(t*5)

def next_batch(batch_size, n_steps,resolution = 0.1):
    t0 = np.random.rand(batch_size, 1) * (t_max - t_min - n_steps * resolution)
    Ts = t0 + np.arange(0., n_steps + 1) * resolution
    ys = time_series(Ts)
    return ys[:, :-1].reshape(-1, n_steps, 1), ys[:, 1:].reshape(-1, n_steps, 1)

t = np.linspace(t_min, t_max, (t_max - t_min) // resolution)
t_instance = np.linspace(12.2, 12.2 + resolution * (n_steps + 1), n_steps + 1)

plt.figure(figsize=(11,4))
plt.subplot(121)
plt.title("A time series (generated)", fontsize=14)
plt.plot(t, time_series(t), label=r"$t . \sin(t) / 3 + 2 . \sin(5t)$")
plt.plot(t_instance[:-1], time_series(t_instance[:-1]), "b-", linewidth=3, label="A training instance")
plt.legend(loc="lower left", fontsize=14)
plt.axis([0, 30, -17, 13])
plt.xlabel("Time")
plt.ylabel("Value")

plt.subplot(122)
plt.title("A training instance", fontsize=14)
plt.plot(t_instance[:-1], time_series(t_instance[:-1]), "bo", markersize=10, label="instance")
plt.plot(t_instance[1:], time_series(t_instance[1:]), "w*", markersize=10, label="target")
plt.legend(loc="upper left")
plt.xlabel("Time")
plt.show()


C:\Users\gtesei\AppData\Local\Continuum\Miniconda3\lib\site-packages\ipykernel_launcher.py:14: DeprecationWarning: object of type <class 'float'> cannot be safely interpreted as an integer.
  

In [52]:
X_batch, y_batch = next_batch(1, n_steps)
np.c_[X_batch[0], y_batch[0]]


Out[52]:
array([[-3.0913747 , -1.81370459],
       [-1.81370459, -0.2545896 ],
       [-0.2545896 ,  1.35043982],
       [ 1.35043982,  2.75633863],
       [ 2.75633863,  3.76705941],
       [ 3.76705941,  4.28205029],
       [ 4.28205029,  4.31936936],
       [ 4.31936936,  4.00977195],
       [ 4.00977195,  3.56323364],
       [ 3.56323364,  3.2161207 ],
       [ 3.2161207 ,  3.17195748],
       [ 3.17195748,  3.55030744],
       [ 3.55030744,  4.35629752],
       [ 4.35629752,  5.47826041],
       [ 5.47826041,  6.71408441],
       [ 6.71408441,  7.81983055],
       [ 7.81983055,  8.56872404],
       [ 8.56872404,  8.80608565],
       [ 8.80608565,  8.48676122],
       [ 8.48676122,  7.68589142]])

Using an OuputProjectionWrapper


In [53]:
tf.reset_default_graph()

from tensorflow.contrib.layers import fully_connected

n_steps = 20
n_inputs = 1
n_neurons = 100
n_outputs = 1

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.float32, [None, n_steps, n_outputs])

cell = tf.contrib.rnn.OutputProjectionWrapper(
    tf.contrib.rnn.BasicRNNCell(num_units=n_neurons, activation=tf.nn.relu),
    output_size=n_outputs)
outputs, states = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)

n_outputs = 1
learning_rate = 0.001

loss = tf.reduce_sum(tf.square(outputs - y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()

n_iterations = 1000
batch_size = 50

with tf.Session() as sess:
    init.run()
    for iteration in range(n_iterations):
        X_batch, y_batch = next_batch(batch_size, n_steps)
        sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        if iteration % 100 == 0:
            mse = loss.eval(feed_dict={X: X_batch, y: y_batch})
            print(iteration, "\tMSE:", mse)
    
    X_new = time_series(np.array(t_instance[:-1].reshape(-1, n_steps, n_inputs)))
    y_pred = sess.run(outputs, feed_dict={X: X_new})
    print(y_pred)


0 	MSE: 20171.0
100 	MSE: 858.855
200 	MSE: 422.972
300 	MSE: 199.725
400 	MSE: 90.7315
500 	MSE: 56.7009
600 	MSE: 53.6289
700 	MSE: 62.3246
800 	MSE: 50.8053
900 	MSE: 53.6313
[[[-3.49738097]
  [-2.51607442]
  [-1.16632104]
  [ 0.60030717]
  [ 2.15503573]
  [ 3.01053429]
  [ 3.41778255]
  [ 3.32022381]
  [ 2.88544369]
  [ 2.23823047]
  [ 1.69417715]
  [ 1.57740271]
  [ 1.955423  ]
  [ 2.77756476]
  [ 3.91065073]
  [ 5.11356974]
  [ 6.10596704]
  [ 6.64041662]
  [ 6.67867994]
  [ 6.1244235 ]]]

In [55]:
plt.title("Testing the model", fontsize=14)
plt.plot(t_instance[:-1], time_series(t_instance[:-1]), "bo", markersize=10, label="instance")
plt.plot(t_instance[1:], time_series(t_instance[1:]), "w*", markersize=10, label="target")
plt.plot(t_instance[1:], y_pred[0,:,0], "r.", markersize=10, label="prediction")
plt.legend(loc="upper left")
plt.xlabel("Time")
plt.show()


Without using an OutputProjectionWrapper


In [89]:
tf.reset_default_graph()

from tensorflow.contrib.layers import fully_connected

n_steps = 20
n_inputs = 1
n_neurons = 100

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.float32, [None, n_steps, n_outputs])

basic_cell = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons, activation=tf.nn.relu)
rnn_outputs, states = tf.nn.dynamic_rnn(basic_cell, X, dtype=tf.float32)

n_outputs = 1
learning_rate = 0.001

stacked_rnn_outputs = tf.reshape(rnn_outputs, [-1, n_neurons])
stacked_outputs = fully_connected(stacked_rnn_outputs, n_outputs, activation_fn=None)
outputs = tf.reshape(stacked_outputs, [-1, n_steps, n_outputs])

loss = tf.reduce_mean(tf.square(outputs - y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()

n_iterations = 1000
batch_size = 50

with tf.Session() as sess:
    init.run()
    for iteration in range(n_iterations):
        X_batch, y_batch = next_batch(batch_size, n_steps)
        sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        if iteration % 100 == 0:
            mse = loss.eval(feed_dict={X: X_batch, y: y_batch})
            print(iteration, "\tMSE:", mse)
    
    X_new = time_series(np.array(t_instance[:-1].reshape(-1, n_steps, n_inputs)))
    y_pred = sess.run(outputs, feed_dict={X: X_new})
    print(y_pred)


0 	MSE: 16.1839
100 	MSE: 0.396811
200 	MSE: 0.124991
300 	MSE: 0.0574633
400 	MSE: 0.0538617
500 	MSE: 0.059357
600 	MSE: 0.047703
700 	MSE: 0.0494709
800 	MSE: 0.0483082
900 	MSE: 0.0500153
[[[-3.46322608]
  [-2.48452091]
  [-1.10020792]
  [ 0.58717555]
  [ 2.00378942]
  [ 3.0813272 ]
  [ 3.52336788]
  [ 3.42467713]
  [ 2.84561419]
  [ 2.11835122]
  [ 1.66789067]
  [ 1.52136159]
  [ 1.86960983]
  [ 2.6718235 ]
  [ 3.84957361]
  [ 5.06223202]
  [ 6.07499313]
  [ 6.60482883]
  [ 6.601717  ]
  [ 5.99486113]]]

In [57]:
plt.title("Testing the model", fontsize=14)
plt.plot(t_instance[:-1], time_series(t_instance[:-1]), "bo", markersize=10, label="instance")
plt.plot(t_instance[1:], time_series(t_instance[1:]), "w*", markersize=10, label="target")
plt.plot(t_instance[1:], y_pred[0,:,0], "r.", markersize=10, label="prediction")
plt.legend(loc="upper left")
plt.xlabel("Time")
plt.show()


With Keras


In [95]:
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import SimpleRNN
from keras import initializers
from keras.optimizers import RMSprop

from keras.models import Model
from keras.layers import Input, Dense

def ts_next_batch(batch_size, n_steps,resolution = 0.1):
    t0 = np.random.rand(batch_size, 1) * (t_max - t_min - n_steps * resolution)
    Ts = t0 + np.arange(0., n_steps + 1) * resolution
    ys = time_series(Ts)
    return ys[:, :-1].reshape(-1, n_steps, 1), ys[:, 1:].reshape(-1, n_steps, 1)

keras.backend.clear_session()

batch_size = 50
hidden_units = 100
learning_rate = 0.001
n_inputs = 1
n_outputs = 1
n_steps = 20

print('Evaluate IRNN...')
a = Input(shape=(n_steps,n_inputs))
b = SimpleRNN(hidden_units,
                    kernel_initializer=initializers.RandomNormal(stddev=0.001),
                    recurrent_initializer=initializers.Identity(),
                    activation='relu' ,  return_sequences=True)(a)
b = keras.layers.core.Reshape((-1,n_neurons))(b)
b = Dense(1,activation=None)(b)
b = keras.layers.core.Reshape((n_steps, n_outputs))(b)
optimizer = keras.optimizers.Adamax(lr=learning_rate)
model = Model(inputs=[a], outputs=[b])
model.compile(loss='mean_squared_error',
              optimizer=optimizer,
              metrics=['mean_squared_error'])

X_batch, y_batch = ts_next_batch(batch_size*1000, n_steps)
x_test, y_test = ts_next_batch(batch_size, n_steps)
model.fit(X_batch, y_batch,
          batch_size=batch_size,
          epochs=1,
          verbose=1,
          validation_data=(x_test, y_test))


Evaluate IRNN...
Train on 50000 samples, validate on 50 samples
Epoch 1/1
50000/50000 [==============================] - 13s - loss: 1.8889 - mean_squared_error: 1.8889 - val_loss: 0.4352 - val_mean_squared_error: 0.4352
Out[95]:
<keras.callbacks.History at 0x1aea8dd7b38>

In [96]:
X_new = time_series(np.array(t_instance[:-1].reshape(-1, n_steps, n_inputs)))
y_pred = model.predict(X_new,verbose=0)
print(y_pred)


[[[-1.70384717]
  [-3.05007172]
  [-2.39496326]
  [-0.14787099]
  [ 1.67567396]
  [ 2.97915959]
  [ 3.60192847]
  [ 3.50552845]
  [ 2.90764904]
  [ 2.10811305]
  [ 1.44303656]
  [ 1.22440231]
  [ 1.58337164]
  [ 2.46381497]
  [ 3.68792629]
  [ 4.98927641]
  [ 6.08448458]
  [ 6.71567392]
  [ 6.7268734 ]
  [ 6.1052804 ]]]

In [97]:
plt.title("Testing the model", fontsize=14)
plt.plot(t_instance[:-1], time_series(t_instance[:-1]), "bo", markersize=10, label="instance")
plt.plot(t_instance[1:], time_series(t_instance[1:]), "w*", markersize=10, label="target")
plt.plot(t_instance[1:], y_pred[0,:,0], "r.", markersize=10, label="prediction")
plt.legend(loc="upper left")
plt.xlabel("Time")
plt.show()


Dropout

If you build a very deep RNN, it may end up overfitting the training set. To prevent that, a common technique is to apply dropout. You can simply add a dropout layer before or after the RNN as usual, but if you also want to apply dropout between the RNN layers, you need to use a DropoutWrapper.


In [104]:
tf.reset_default_graph()
from tensorflow.contrib.layers import fully_connected

n_inputs = 1
n_neurons = 100
n_layers = 3
n_steps = 20
n_outputs = 1

keep_prob = 0.5
learning_rate = 0.001

is_training = True

def deep_rnn_with_dropout(X, y, is_training):
    if is_training:
        multi_layer_cell = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.BasicRNNCell(num_units=n_neurons), input_keep_prob=keep_prob) for _ in range(n_layers)],)
    else: 
        multi_layer_cell = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.BasicRNNCell(num_units=n_neurons) for _ in range(n_layers)],)
        
    rnn_outputs, states = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32)

    stacked_rnn_outputs = tf.reshape(rnn_outputs, [-1, n_neurons])
    stacked_outputs = fully_connected(stacked_rnn_outputs, n_outputs, activation_fn=None)
    outputs = tf.reshape(stacked_outputs, [-1, n_steps, n_outputs])

    loss = tf.reduce_mean(tf.square(outputs - y))
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    training_op = optimizer.minimize(loss)

    return outputs, loss, training_op

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.float32, [None, n_steps, n_outputs])
outputs, loss, training_op = deep_rnn_with_dropout(X, y, is_training)
init = tf.global_variables_initializer()
saver = tf.train.Saver()

n_iterations = 2000
batch_size = 50

with tf.Session() as sess:
    if is_training:
        init.run()
        for iteration in range(n_iterations):
            X_batch, y_batch = next_batch(batch_size, n_steps)
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
            if iteration % 100 == 0:
                mse = loss.eval(feed_dict={X: X_batch, y: y_batch})
                print(iteration, "\tMSE:", mse)
        save_path = saver.save(sess, "/tmp/my_model.ckpt")
    else:
        saver.restore(sess, "/tmp/my_model.ckpt")
        X_new = time_series(np.array(t_instance[:-1].reshape(-1, n_steps, n_inputs)))
        y_pred = sess.run(outputs, feed_dict={X: X_new})
        
        plt.title("Testing the model", fontsize=14)
        plt.plot(t_instance[:-1], time_series(t_instance[:-1]), "bo", markersize=10, label="instance")
        plt.plot(t_instance[1:], time_series(t_instance[1:]), "w*", markersize=10, label="target")
        plt.plot(t_instance[1:], y_pred[0,:,0], "r.", markersize=10, label="prediction")
        plt.legend(loc="upper left")
        plt.xlabel("Time")
        plt.show()


0 	MSE: 13.1851
100 	MSE: 5.1124
200 	MSE: 3.35787
300 	MSE: 3.95527
400 	MSE: 3.24614
500 	MSE: 4.12
600 	MSE: 3.30315
700 	MSE: 2.72216
800 	MSE: 2.72711
900 	MSE: 3.54013
1000 	MSE: 2.55609
1100 	MSE: 2.66153
1200 	MSE: 2.37055
1300 	MSE: 2.55441
1400 	MSE: 2.08881
1500 	MSE: 1.91476
1600 	MSE: 3.05383
1700 	MSE: 2.6395
1800 	MSE: 2.75546
1900 	MSE: 2.03541

In [105]:
is_training = False
with tf.Session() as sess:
    if is_training:
        init.run()
        for iteration in range(n_iterations):
            X_batch, y_batch = next_batch(batch_size, n_steps)
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
            if iteration % 100 == 0:
                mse = loss.eval(feed_dict={X: X_batch, y: y_batch})
                print(iteration, "\tMSE:", mse)
        save_path = saver.save(sess, "/tmp/my_model.ckpt")
    else:
        saver.restore(sess, "/tmp/my_model.ckpt")
        X_new = time_series(np.array(t_instance[:-1].reshape(-1, n_steps, n_inputs)))
        y_pred = sess.run(outputs, feed_dict={X: X_new})
        
        plt.title("Testing the model", fontsize=14)
        plt.plot(t_instance[:-1], time_series(t_instance[:-1]), "bo", markersize=10, label="instance")
        plt.plot(t_instance[1:], time_series(t_instance[1:]), "w*", markersize=10, label="target")
        plt.plot(t_instance[1:], y_pred[0,:,0], "r.", markersize=10, label="prediction")
        plt.legend(loc="upper left")
        plt.xlabel("Time")
        plt.show()


INFO:tensorflow:Restoring parameters from /tmp/my_model.ckpt

Dropout with Keras


In [108]:
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import SimpleRNN
from keras import initializers
from keras.optimizers import RMSprop

from keras.models import Model
from keras.layers import Input, Dense

def ts_next_batch(batch_size, n_steps,resolution = 0.1):
    t0 = np.random.rand(batch_size, 1) * (t_max - t_min - n_steps * resolution)
    Ts = t0 + np.arange(0., n_steps + 1) * resolution
    ys = time_series(Ts)
    return ys[:, :-1].reshape(-1, n_steps, 1), ys[:, 1:].reshape(-1, n_steps, 1)

keras.backend.clear_session()

batch_size = 50
hidden_units = 100
learning_rate = 0.001
n_inputs = 1
n_outputs = 1
n_steps = 20

n_layers = 3
keep_prob = 0.5

print('Evaluate IRNN...')
a = Input(shape=(n_steps,n_inputs))
b = SimpleRNN(hidden_units,
                        kernel_initializer=initializers.RandomNormal(stddev=0.001),
                        recurrent_initializer=initializers.Identity(),
                        activation='relu' ,  return_sequences=True)(a)
b = Dropout(keep_prob)(b)
for i in range(n_layers-1): 
    b = SimpleRNN(hidden_units,
                        kernel_initializer=initializers.RandomNormal(stddev=0.001),
                        recurrent_initializer=initializers.Identity(),
                        activation='relu' ,  return_sequences=True)(a)
    b = Dropout(keep_prob)(b)
b = keras.layers.core.Reshape((-1,n_neurons))(b)
b = Dense(1,activation=None)(b)
b = keras.layers.core.Reshape((n_steps, n_outputs))(b)
optimizer = keras.optimizers.Adamax(lr=learning_rate)
model = Model(inputs=[a], outputs=[b])
model.compile(loss='mean_squared_error',
              optimizer=optimizer,
              metrics=['mean_squared_error'])

X_batch, y_batch = ts_next_batch(batch_size*2000, n_steps)
x_test, y_test = ts_next_batch(batch_size*2, n_steps)
model.fit(X_batch, y_batch,
          batch_size=batch_size,
          epochs=1,
          verbose=1,
          validation_data=(x_test, y_test))


Evaluate IRNN...
Train on 100000 samples, validate on 100 samples
Epoch 1/1
100000/100000 [==============================] - 34s - loss: 1.3340 - mean_squared_error: 1.3340 - val_loss: 0.1600 - val_mean_squared_error: 0.1600
Out[108]:
<keras.callbacks.History at 0x1aeaa2feeb8>

In [110]:
X_new = time_series(np.array(t_instance[:-1].reshape(-1, n_steps, n_inputs)))
y_pred = model.predict(X_new,verbose=0)
plt.title("Testing the model", fontsize=14)
plt.plot(t_instance[:-1], time_series(t_instance[:-1]), "bo", markersize=10, label="instance")
plt.plot(t_instance[1:], time_series(t_instance[1:]), "w*", markersize=10, label="target")
plt.plot(t_instance[1:], y_pred[0,:,0], "r.", markersize=10, label="prediction")
plt.legend(loc="upper left")
plt.xlabel("Time")
plt.show()


LSTM

The Long Short-Term Memory (LSTM) cell was proposed in (Hochreiter-Schmidhuber,1997), and it was gradually improved over the years by several researchers. If you consider the LSTM cell as a black box, it can be used very much like a basic cell, except it will perform much better; training will converge faster and it will detect long-term dependencies in the data.


In [113]:
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/")
X_test = mnist.test.images.reshape((-1, n_steps, n_inputs))
y_test = mnist.test.labels

tf.reset_default_graph()

from tensorflow.contrib.layers import fully_connected

n_steps = 28
n_inputs = 28
n_neurons = 150
n_outputs = 10

n_layers = 3 

learning_rate = 0.001

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.int32, [None])

multi_cell = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.BasicLSTMCell(num_units=n_neurons) for _ in range(n_layers)])
outputs, states = tf.nn.dynamic_rnn(multi_cell, X, dtype=tf.float32)
top_layer_h_state = states[-1][1]
logits = fully_connected(top_layer_h_state, n_outputs, activation_fn=None, scope="softmax")
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    
init = tf.global_variables_initializer()

n_epochs = 10
batch_size = 150

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for iteration in range(mnist.train.num_examples // batch_size):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            X_batch = X_batch.reshape((batch_size, n_steps, n_inputs))
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
        acc_test = accuracy.eval(feed_dict={X: X_test, y: y_test})
        print("Epoch", epoch, "Train accuracy =", acc_train, "Test accuracy =", acc_test)


Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz
Epoch 0 Train accuracy = 0.966667 Test accuracy = 0.9582
Epoch 1 Train accuracy = 0.966667 Test accuracy = 0.9684
Epoch 2 Train accuracy = 0.986667 Test accuracy = 0.973
Epoch 3 Train accuracy = 0.986667 Test accuracy = 0.9803
Epoch 4 Train accuracy = 0.993333 Test accuracy = 0.9848
Epoch 5 Train accuracy = 0.973333 Test accuracy = 0.986
Epoch 6 Train accuracy = 1.0 Test accuracy = 0.9867
Epoch 7 Train accuracy = 0.993333 Test accuracy = 0.9862
Epoch 8 Train accuracy = 1.0 Test accuracy = 0.9878
Epoch 9 Train accuracy = 0.993333 Test accuracy = 0.9878

LSTM with Keras


In [124]:
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras import initializers
from keras.optimizers import RMSprop


from keras.models import Model
from keras.layers import Input, Dense

keras.backend.clear_session()

batch_size = 150
num_classes = 10
epochs = 10 
n_neurons = 150
n_layers = 3 
learning_rate = 0.001

# the data, shuffled and split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()

x_train = x_train.reshape(x_train.shape[0], 28, 28)
x_test = x_test.reshape(x_test.shape[0], 28, 28)
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

print('Evaluate LSTM...')
a = Input(shape=x_train.shape[1:])

b = LSTM(n_neurons,return_sequences=True)(a)
for i in range(n_layers-2):
    b = LSTM(n_neurons,return_sequences=True)(b)

b = LSTM(n_neurons,return_sequences=False)(b)

b = Dense(num_classes)(b)
b = Activation('softmax')(b)

optimizer = keras.optimizers.Adamax(lr=learning_rate)
model = Model(inputs=[a], outputs=[b])

model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(x_test, y_test))

scores = model.evaluate(x_test, y_test, verbose=0)
print('LSTM test score:', scores[0])
print('LSTM test accuracy:', scores[1])


x_train shape: (60000, 28, 28)
60000 train samples
10000 test samples
Evaluate LSTM...
Train on 60000 samples, validate on 10000 samples
Epoch 1/10
60000/60000 [==============================] - 374s - loss: 0.6565 - acc: 0.7826 - val_loss: 0.2588 - val_acc: 0.9194
Epoch 2/10
60000/60000 [==============================] - 370s - loss: 0.1966 - acc: 0.9386 - val_loss: 0.1571 - val_acc: 0.9512
Epoch 3/10
60000/60000 [==============================] - 369s - loss: 0.1331 - acc: 0.9588 - val_loss: 0.1104 - val_acc: 0.9652
Epoch 4/10
60000/60000 [==============================] - 371s - loss: 0.1061 - acc: 0.9675 - val_loss: 0.0876 - val_acc: 0.9709
Epoch 5/10
60000/60000 [==============================] - 375s - loss: 0.0831 - acc: 0.9739 - val_loss: 0.0838 - val_acc: 0.9749
Epoch 6/10
60000/60000 [==============================] - 356s - loss: 0.0696 - acc: 0.9782 - val_loss: 0.0747 - val_acc: 0.9769
Epoch 7/10
60000/60000 [==============================] - 379s - loss: 0.0587 - acc: 0.9817 - val_loss: 0.0664 - val_acc: 0.9783
Epoch 8/10
60000/60000 [==============================] - 341s - loss: 0.0494 - acc: 0.9850 - val_loss: 0.0590 - val_acc: 0.9813
Epoch 9/10
60000/60000 [==============================] - 338s - loss: 0.0417 - acc: 0.9868 - val_loss: 0.0549 - val_acc: 0.9827
Epoch 10/10
60000/60000 [==============================] - 347s - loss: 0.0349 - acc: 0.9892 - val_loss: 0.0586 - val_acc: 0.9810
LSTM test score: 0.0585878778149
LSTM test accuracy: 0.981

Distributing layers across devices

If you try to create each cell in a different device() block, it will not work.


In [128]:
with tf.device("/gpu:0"): # BAD! This is ignored. 
    layer1 = tf.contrib.rnn.BasicRNNCell( num_units = n_neurons) 
    
with tf.device("/gpu:1"): # BAD! Ignored again. 
    layer2 = tf.contrib.rnn.BasicRNNCell( num_units = n_neurons)

This fails because a BasicRNNCell is a cell factory, not a cell per se; no cells get created when you create the factory, and thus no variables do either. The device block is simply ignored. The cells actually get created later. When you call dynamic_rnn(), it calls the MultiRNNCell, which calls each individual BasicRNNCell, which create the actual cells (including their variables). Unfortunately, none of these classes provide any way to control the devices on which the variables get created. If you try to put the dynamic_rnn() call within a device block, the whole RNN gets pinned to a single device.

The trick is to create your own cell wrapper


In [129]:
import tensorflow as tf

class DeviceCellWrapper(tf.contrib.rnn.RNNCell):
  def __init__(self, device, cell):
    self._cell = cell
    self._device = device

  @property
  def state_size(self):
    return self._cell.state_size

  @property
  def output_size(self):
    return self._cell.output_size

  def __call__(self, inputs, state, scope=None):
    with tf.device(self._device):
        return self._cell(inputs, state, scope)
    
tf.reset_default_graph()

n_inputs = 5
n_neurons = 100
devices = ["/cpu:0"]*5
n_steps = 20
X = tf.placeholder(tf.float32, shape=[None, n_steps, n_inputs])
lstm_cells = [DeviceCellWrapper(device, tf.contrib.rnn.BasicRNNCell(num_units=n_neurons))
              for device in devices]
multi_layer_cell = tf.contrib.rnn.MultiRNNCell(lstm_cells)
outputs, states = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32)
init = tf.global_variables_initializer()

with tf.Session() as sess:
    init.run()
    print(sess.run(outputs, feed_dict={X: rnd.rand(2, n_steps, n_inputs)}))


[[[ 0.01774996 -0.00240297 -0.06099286 ...,  0.01896067  0.0262568
   -0.10385772]
  [-0.01140774  0.07944129  0.09548181 ...,  0.17136577  0.07142816
    0.02270918]
  [ 0.01077356 -0.19770344  0.03775163 ...,  0.16440447 -0.03113544
   -0.15833262]
  ..., 
  [ 0.18550645 -0.045463   -0.54991585 ...,  0.06169901 -0.02102067
   -0.13856457]
  [ 0.00486014  0.19622976  0.08843058 ...,  0.06885004  0.170504
   -0.2080622 ]
  [ 0.07338118 -0.52152467 -0.42612541 ...,  0.47807157 -0.01505137
   -0.56541729]]

 [[ 0.0101305  -0.04053771 -0.00590748 ...,  0.01823444  0.04402071
   -0.0789765 ]
  [ 0.04542742  0.03866118  0.08369439 ...,  0.09143315  0.0773553
    0.03683787]
  [ 0.15722416 -0.07377757  0.06061554 ...,  0.22006992  0.00631647
   -0.26226565]
  ..., 
  [-0.39800081  0.3084473  -0.19721624 ...,  0.2082005  -0.04525452
   -0.24870294]
  [-0.50827909  0.06136992 -0.232437   ...,  0.33199015 -0.07412967
   -0.4881908 ]
  [-0.28511611  0.12590477 -0.36025527 ...,  0.5273127   0.08071721
   -0.18075247]]]

Bidirectional LSTM on the IMDB sentiment classification task on Keras


In [138]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.datasets import imdb

del model
keras.backend.clear_session()

max_features = 20000
# cut texts after this number of words
# (among top max_features most common words)
maxlen = 100
batch_size = 32

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print("Pad sequences (samples x time)")
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
y_train = np.array(y_train)
y_test = np.array(y_test)

model = Sequential()
model.add(Embedding(max_features, 128, input_length=maxlen))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

print('Train...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=4,
          validation_data=[x_test, y_test])


Loading data...
25000 train sequences
25000 test sequences
Pad sequences (samples x time)
x_train shape: (25000, 100)
x_test shape: (25000, 100)
Train...
Train on 25000 samples, validate on 25000 samples
Epoch 1/4
25000/25000 [==============================] - 277s - loss: 0.4149 - acc: 0.8074 - val_loss: 0.3521 - val_acc: 0.8450
Epoch 2/4
25000/25000 [==============================] - 249s - loss: 0.2238 - acc: 0.9139 - val_loss: 0.3632 - val_acc: 0.8469
Epoch 3/4
25000/25000 [==============================] - 262s - loss: 0.1290 - acc: 0.9532 - val_loss: 0.4287 - val_acc: 0.8361
Epoch 4/4
25000/25000 [==============================] - 279s - loss: 0.0697 - acc: 0.9765 - val_loss: 0.6076 - val_acc: 0.8346
Out[138]:
<keras.callbacks.History at 0x1aed46bcc88>

LSTM on the IMDB sentiment classification task on Keras


In [162]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.datasets import imdb

del model
keras.backend.clear_session()

max_features = 20000
# cut texts after this number of words
# (among top max_features most common words)
maxlen = 100
batch_size = 32

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print("Pad sequences (samples x time)")
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
y_train = np.array(y_train)
y_test = np.array(y_test)

model = Sequential()
model.add(Embedding(max_features, 128, input_length=maxlen))
model.add(LSTM(64))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

print('Train...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=4,
          validation_data=[x_test, y_test])


Loading data...
25000 train sequences
25000 test sequences
Pad sequences (samples x time)
x_train shape: (25000, 100)
x_test shape: (25000, 100)
Train...
Train on 25000 samples, validate on 25000 samples
Epoch 1/4
25000/25000 [==============================] - 182s - loss: 0.4149 - acc: 0.8062 - val_loss: 0.3396 - val_acc: 0.8514
Epoch 2/4
25000/25000 [==============================] - 169s - loss: 0.2322 - acc: 0.9110 - val_loss: 0.4239 - val_acc: 0.8191
Epoch 3/4
25000/25000 [==============================] - 171s - loss: 0.1457 - acc: 0.9482 - val_loss: 0.4668 - val_acc: 0.8398
Epoch 4/4
25000/25000 [==============================] - 168s - loss: 0.1000 - acc: 0.9656 - val_loss: 0.5299 - val_acc: 0.8356
Out[162]:
<keras.callbacks.History at 0x1aed2933c50>

LSTM+FC on the IMDB sentiment classification task on Keras


In [161]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.datasets import imdb

del model
keras.backend.clear_session()

max_features = 20000
# cut texts after this number of words
# (among top max_features most common words)
maxlen = 100
batch_size = 32

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print("Pad sequences (samples x time)")
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
y_train = np.array(y_train)
y_test = np.array(y_test)

model = Sequential()
model.add(Embedding(max_features, 128, input_length=maxlen))
model.add(LSTM(64))
model.add(Dropout(0.5))
model.add(Dense(100))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

print('Train...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=4,
          validation_data=[x_test, y_test])


Loading data...
25000 train sequences
25000 test sequences
Pad sequences (samples x time)
x_train shape: (25000, 100)
x_test shape: (25000, 100)
Train...
Train on 25000 samples, validate on 25000 samples
Epoch 1/4
25000/25000 [==============================] - 163s - loss: 0.4247 - acc: 0.8021 - val_loss: 0.3404 - val_acc: 0.8504
Epoch 2/4
25000/25000 [==============================] - 157s - loss: 0.2330 - acc: 0.9117 - val_loss: 0.4124 - val_acc: 0.8258
Epoch 3/4
25000/25000 [==============================] - 156s - loss: 0.1394 - acc: 0.9501 - val_loss: 0.4462 - val_acc: 0.8374
Epoch 4/4
25000/25000 [==============================] - 156s - loss: 0.0926 - acc: 0.9672 - val_loss: 0.5852 - val_acc: 0.8339
Out[161]:
<keras.callbacks.History at 0x1aed0102c88>

Recurrent convolutional network on the IMDB sentiment


In [159]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Conv1D, MaxPooling1D
from keras.datasets import imdb

del model
keras.backend.clear_session()

# Embedding
max_features = 20000
maxlen = 100
embedding_size = 128

# Convolution
kernel_size = 5
filters = 64
pool_size = 4

# LSTM
lstm_output_size = 70

# Training
batch_size = 30
epochs = 4

'''
Note:
batch_size is highly sensitive.
Only 2 epochs are needed as the dataset is very small.
'''

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')

model = Sequential()
model.add(Embedding(max_features, embedding_size, input_length=maxlen))
model.add(Dropout(0.25))
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
model.add(MaxPooling1D(pool_size=pool_size))
model.add(LSTM(lstm_output_size))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))
score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)


Loading data...
25000 train sequences
25000 test sequences
Pad sequences (samples x time)
x_train shape: (25000, 100)
x_test shape: (25000, 100)
Build model...
Train...
Train on 25000 samples, validate on 25000 samples
Epoch 1/4
25000/25000 [==============================] - 116s - loss: 0.3830 - acc: 0.8228 - val_loss: 0.3617 - val_acc: 0.8458
Epoch 2/4
25000/25000 [==============================] - 121s - loss: 0.1969 - acc: 0.9251 - val_loss: 0.3810 - val_acc: 0.8485
Epoch 3/4
25000/25000 [==============================] - 110s - loss: 0.0965 - acc: 0.9669 - val_loss: 0.4206 - val_acc: 0.8420
Epoch 4/4
25000/25000 [==============================] - 113s - loss: 0.0444 - acc: 0.9858 - val_loss: 0.5578 - val_acc: 0.8397
24990/25000 [============================>.] - ETA: 0sTest score: 0.557802463364
Test accuracy: 0.839679995275

Convolutional network on the IMDB sentiment


In [158]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.datasets import imdb

keras.backend.clear_session()
del model

# set parameters:
max_features = 5000
maxlen = 400
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 4

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))
model.add(Dropout(0.2))

# we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
# we use max pooling:
model.add(GlobalMaxPooling1D())

# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))


Loading data...
25000 train sequences
25000 test sequences
Pad sequences (samples x time)
x_train shape: (25000, 400)
x_test shape: (25000, 400)
Build model...
Train on 25000 samples, validate on 25000 samples
Epoch 1/4
25000/25000 [==============================] - 307s - loss: 0.4115 - acc: 0.7988 - val_loss: 0.2973 - val_acc: 0.8733
Epoch 2/4
25000/25000 [==============================] - 312s - loss: 0.2450 - acc: 0.9002 - val_loss: 0.2842 - val_acc: 0.8824
Epoch 3/4
25000/25000 [==============================] - 314s - loss: 0.1767 - acc: 0.9320 - val_loss: 0.2847 - val_acc: 0.8828
Epoch 4/4
25000/25000 [==============================] - 342s - loss: 0.1240 - acc: 0.9539 - val_loss: 0.3543 - val_acc: 0.8720
Out[158]:
<keras.callbacks.History at 0x1ae821e86d8>

IMDB datasets with bi-gram embeddings


In [164]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import GlobalAveragePooling1D
from keras.datasets import imdb

keras.backend.clear_session()
del model

def create_ngram_set(input_list, ngram_value=2):
    """
    Extract a set of n-grams from a list of integers.
    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
    {(4, 9), (4, 1), (1, 4), (9, 4)}
    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
    [(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)]
    """
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))


def add_ngram(sequences, token_indice, ngram_range=2):
    """
    Augment the input list of list (sequences) by appending n-grams values.
    Example: adding bi-gram
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
    >>> add_ngram(sequences, token_indice, ngram_range=2)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]
    Example: adding tri-gram
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
    >>> add_ngram(sequences, token_indice, ngram_range=3)
    [[1, 3, 4, 5, 1337], [1, 3, 7, 9, 2, 1337, 2018]]
    """
    new_sequences = []
    for input_list in sequences:
        new_list = input_list[:]
        for i in range(len(new_list) - ngram_range + 1):
            for ngram_value in range(2, ngram_range + 1):
                ngram = tuple(new_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)

    return new_sequences

# Set parameters:
# ngram_range = 2 will add bi-grams features
ngram_range = 2
max_features = 20000
maxlen = 400
batch_size = 32
embedding_dims = 50
epochs = 5

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))

if ngram_range > 1:
    print('Adding {}-gram features'.format(ngram_range))
    # Create set of unique n-gram from the training set.
    ngram_set = set()
    for input_list in x_train:
        for i in range(2, ngram_range + 1):
            set_of_ngram = create_ngram_set(input_list, ngram_value=i)
            ngram_set.update(set_of_ngram)

    # Dictionary mapping n-gram token to a unique integer.
    # Integer values are greater than max_features in order
    # to avoid collision with existing features.
    start_index = max_features + 1
    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
    indice_token = {token_indice[k]: k for k in token_indice}

    # max_features is the highest integer that could be found in the dataset.
    max_features = np.max(list(indice_token.keys())) + 1

    # Augmenting x_train and x_test with n-grams features
    x_train = add_ngram(x_train, token_indice, ngram_range)
    x_test = add_ngram(x_test, token_indice, ngram_range)
    print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
    print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))

# we add a GlobalAveragePooling1D, which will average the embeddings
# of all words in the document
model.add(GlobalAveragePooling1D())

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))


Loading data...
25000 train sequences
25000 test sequences
Average train sequence length: 238
Average test sequence length: 230
Adding 2-gram features
Average train sequence length: 476
Average test sequence length: 428
Pad sequences (samples x time)
x_train shape: (25000, 400)
x_test shape: (25000, 400)
Build model...
Train on 25000 samples, validate on 25000 samples
Epoch 1/5
25000/25000 [==============================] - 928s - loss: 0.5806 - acc: 0.7869 - val_loss: 0.4315 - val_acc: 0.8599
Epoch 2/5
25000/25000 [==============================] - 930s - loss: 0.2768 - acc: 0.9310 - val_loss: 0.2990 - val_acc: 0.8939
Epoch 3/5
25000/25000 [==============================] - 958s - loss: 0.1363 - acc: 0.9718 - val_loss: 0.2601 - val_acc: 0.9014
Epoch 4/5
25000/25000 [==============================] - 947s - loss: 0.0733 - acc: 0.9887 - val_loss: 0.2427 - val_acc: 0.9040
Epoch 5/5
25000/25000 [==============================] - 896s - loss: 0.0411 - acc: 0.9952 - val_loss: 0.2350 - val_acc: 0.9066
Out[164]:
<keras.callbacks.History at 0x1aee66cfe80>

IMDB datasets with bi-gram embeddings and Convolution1D


In [168]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import GlobalAveragePooling1D
from keras.datasets import imdb

keras.backend.clear_session()
del model

def create_ngram_set(input_list, ngram_value=2):
    """
    Extract a set of n-grams from a list of integers.
    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
    {(4, 9), (4, 1), (1, 4), (9, 4)}
    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
    [(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)]
    """
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))


def add_ngram(sequences, token_indice, ngram_range=2):
    """
    Augment the input list of list (sequences) by appending n-grams values.
    Example: adding bi-gram
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
    >>> add_ngram(sequences, token_indice, ngram_range=2)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]
    Example: adding tri-gram
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
    >>> add_ngram(sequences, token_indice, ngram_range=3)
    [[1, 3, 4, 5, 1337], [1, 3, 7, 9, 2, 1337, 2018]]
    """
    new_sequences = []
    for input_list in sequences:
        new_list = input_list[:]
        for i in range(len(new_list) - ngram_range + 1):
            for ngram_value in range(2, ngram_range + 1):
                ngram = tuple(new_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)

    return new_sequences

# Set parameters:
# ngram_range = 2 will add bi-grams features
ngram_range = 2
max_features = 20000
maxlen = 400
batch_size = 32
embedding_dims = 50
epochs = 5

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))

if ngram_range > 1:
    print('Adding {}-gram features'.format(ngram_range))
    # Create set of unique n-gram from the training set.
    ngram_set = set()
    for input_list in x_train:
        for i in range(2, ngram_range + 1):
            set_of_ngram = create_ngram_set(input_list, ngram_value=i)
            ngram_set.update(set_of_ngram)

    # Dictionary mapping n-gram token to a unique integer.
    # Integer values are greater than max_features in order
    # to avoid collision with existing features.
    start_index = max_features + 1
    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
    indice_token = {token_indice[k]: k for k in token_indice}

    # max_features is the highest integer that could be found in the dataset.
    max_features = np.max(list(indice_token.keys())) + 1

    # Augmenting x_train and x_test with n-grams features
    x_train = add_ngram(x_train, token_indice, ngram_range)
    x_test = add_ngram(x_test, token_indice, ngram_range)
    print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
    print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))

model.add(Dropout(0.2))

# we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
# we use max pooling:
model.add(GlobalMaxPooling1D())

# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))


Loading data...
25000 train sequences
25000 test sequences
Average train sequence length: 238
Average test sequence length: 230
Adding 2-gram features
Average train sequence length: 476
Average test sequence length: 428
Pad sequences (samples x time)
x_train shape: (25000, 400)
x_test shape: (25000, 400)
Build model...
Train on 25000 samples, validate on 25000 samples
Epoch 1/5
25000/25000 [==============================] - 1035s - loss: 0.4344 - acc: 0.7766 - val_loss: 0.2947 - val_acc: 0.8744
Epoch 2/5
25000/25000 [==============================] - 1020s - loss: 0.1569 - acc: 0.9422 - val_loss: 0.3221 - val_acc: 0.8711
Epoch 3/5
25000/25000 [==============================] - 1070s - loss: 0.0193 - acc: 0.9945 - val_loss: 0.4046 - val_acc: 0.8686
Epoch 4/5
25000/25000 [==============================] - 1206s - loss: 0.0015 - acc: 0.9998 - val_loss: 0.4027 - val_acc: 0.8808
Epoch 5/5
25000/25000 [==============================] - 1223s - loss: 1.5838e-04 - acc: 1.0000 - val_loss: 0.4271 - val_acc: 0.8803
Out[168]:
<keras.callbacks.History at 0x1aef9ff7c88>