Chapter 14 – Recurrent Neural Networks

This notebook contains all the sample code and solutions to the exercices in chapter 14.

Setup

First, let's make sure this notebook works well in both python 2 and 3, import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures:


In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "rnn"

def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

Then of course we will need TensorFlow:


In [2]:
import tensorflow as tf

Basic RNNs

Manual RNN


In [3]:
reset_graph()

n_inputs = 3
n_neurons = 5

X0 = tf.placeholder(tf.float32, [None, n_inputs])
X1 = tf.placeholder(tf.float32, [None, n_inputs])

Wx = tf.Variable(tf.random_normal(shape=[n_inputs, n_neurons],dtype=tf.float32))
Wy = tf.Variable(tf.random_normal(shape=[n_neurons,n_neurons],dtype=tf.float32))
b = tf.Variable(tf.zeros([1, n_neurons], dtype=tf.float32))

Y0 = tf.tanh(tf.matmul(X0, Wx) + b)
Y1 = tf.tanh(tf.matmul(Y0, Wy) + tf.matmul(X1, Wx) + b)

init = tf.global_variables_initializer()

In [4]:
import numpy as np

X0_batch = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 0, 1]]) # t = 0
X1_batch = np.array([[9, 8, 7], [0, 0, 0], [6, 5, 4], [3, 2, 1]]) # t = 1

with tf.Session() as sess:
    init.run()
    Y0_val, Y1_val = sess.run([Y0, Y1], feed_dict={X0: X0_batch, X1: X1_batch})

In [5]:
print(Y0_val)


[[-0.0664006   0.96257669  0.68105787  0.70918542 -0.89821595]
 [ 0.9977755  -0.71978885 -0.99657625  0.9673925  -0.99989718]
 [ 0.99999774 -0.99898815 -0.99999893  0.99677622 -0.99999988]
 [ 1.         -1.         -1.         -0.99818915  0.99950868]]

In [6]:
print(Y1_val)


[[ 1.         -1.         -1.          0.40200216 -1.        ]
 [-0.12210433  0.62805319  0.96718419 -0.99371207 -0.25839335]
 [ 0.99999827 -0.9999994  -0.9999975  -0.85943311 -0.9999879 ]
 [ 0.99928284 -0.99999815 -0.99990582  0.98579615 -0.92205751]]

Using static_rnn()


In [7]:
n_inputs = 3
n_neurons = 5

In [8]:
reset_graph()

X0 = tf.placeholder(tf.float32, [None, n_inputs])
X1 = tf.placeholder(tf.float32, [None, n_inputs])

basic_cell = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
output_seqs, states = tf.contrib.rnn.static_rnn(basic_cell, [X0, X1],
                                                dtype=tf.float32)
Y0, Y1 = output_seqs

In [9]:
init = tf.global_variables_initializer()

In [10]:
X0_batch = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 0, 1]])
X1_batch = np.array([[9, 8, 7], [0, 0, 0], [6, 5, 4], [3, 2, 1]])

with tf.Session() as sess:
    init.run()
    Y0_val, Y1_val = sess.run([Y0, Y1], feed_dict={X0: X0_batch, X1: X1_batch})

In [11]:
Y0_val


Out[11]:
array([[-0.81393629, -0.43182844, -0.40150994,  0.7043609 ,  0.89640522],
       [-0.9915663 , -0.95103657,  0.19996507,  0.98335052,  0.99998963],
       [-0.99965042, -0.99683058,  0.68092704,  0.99918783,  1.        ],
       [ 0.64988363, -0.16740513,  0.99994725,  0.81680971,  0.99995029]], dtype=float32)

In [12]:
Y1_val


Out[12]:
array([[-0.99959785, -0.99861717,  0.98714638,  0.99745673,  1.        ],
       [-0.72472596,  0.17925572,  0.53362155, -0.65215266, -0.08035918],
       [-0.9957462 , -0.96851194,  0.9874723 ,  0.84106421,  0.99999976],
       [-0.72859728, -0.27958852,  0.80567408, -0.20587993,  0.9995411 ]], dtype=float32)

In [13]:
from IPython.display import clear_output, Image, display, HTML

def strip_consts(graph_def, max_const_size=32):
    """Strip large constant values from graph_def."""
    strip_def = tf.GraphDef()
    for n0 in graph_def.node:
        n = strip_def.node.add() 
        n.MergeFrom(n0)
        if n.op == 'Const':
            tensor = n.attr['value'].tensor
            size = len(tensor.tensor_content)
            if size > max_const_size:
                tensor.tensor_content = "b<stripped %d bytes>"%size
    return strip_def

def show_graph(graph_def, max_const_size=32):
    """Visualize TensorFlow graph."""
    if hasattr(graph_def, 'as_graph_def'):
        graph_def = graph_def.as_graph_def()
    strip_def = strip_consts(graph_def, max_const_size=max_const_size)
    code = """
        <script>
          function load() {{
            document.getElementById("{id}").pbtxt = {data};
          }}
        </script>
        <link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
        <div style="height:600px">
          <tf-graph-basic id="{id}"></tf-graph-basic>
        </div>
    """.format(data=repr(str(strip_def)), id='graph'+str(np.random.rand()))

    iframe = """
        <iframe seamless style="width:1200px;height:620px;border:0" srcdoc="{}"></iframe>
    """.format(code.replace('"', '&quot;'))
    display(HTML(iframe))

In [14]:
show_graph(tf.get_default_graph())


Packing sequences


In [15]:
n_steps = 2
n_inputs = 3
n_neurons = 5

In [16]:
reset_graph()

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
X_seqs = tf.unstack(tf.transpose(X, perm=[1, 0, 2]))

basic_cell = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
output_seqs, states = tf.contrib.rnn.static_rnn(basic_cell, X_seqs,
                                                dtype=tf.float32)
outputs = tf.transpose(tf.stack(output_seqs), perm=[1, 0, 2])

In [17]:
init = tf.global_variables_initializer()

In [18]:
X_batch = np.array([
        # t = 0      t = 1 
        [[0, 1, 2], [9, 8, 7]], # instance 1
        [[3, 4, 5], [0, 0, 0]], # instance 2
        [[6, 7, 8], [6, 5, 4]], # instance 3
        [[9, 0, 1], [3, 2, 1]], # instance 4
    ])

with tf.Session() as sess:
    init.run()
    outputs_val = outputs.eval(feed_dict={X: X_batch})

In [19]:
print(outputs_val)


[[[-0.91279727  0.83698678 -0.89277941  0.80308062 -0.5283336 ]
  [-1.          1.         -0.99794829  0.99985468 -0.99273592]]

 [[-0.99994391  0.99951613 -0.9946925   0.99030769 -0.94413054]
  [ 0.48733309  0.93389565 -0.31362072  0.88573611  0.2424476 ]]

 [[-1.          0.99999875 -0.99975014  0.99956584 -0.99466234]
  [-0.99994856  0.99999434 -0.96058172  0.99784708 -0.9099462 ]]

 [[-0.95972425  0.99951482  0.96938795 -0.969908   -0.67668229]
  [-0.84596014  0.96288228  0.96856463 -0.14777924 -0.9119423 ]]]

In [20]:
print(np.transpose(outputs_val, axes=[1, 0, 2])[1])


[[-1.          1.         -0.99794829  0.99985468 -0.99273592]
 [ 0.48733309  0.93389565 -0.31362072  0.88573611  0.2424476 ]
 [-0.99994856  0.99999434 -0.96058172  0.99784708 -0.9099462 ]
 [-0.84596014  0.96288228  0.96856463 -0.14777924 -0.9119423 ]]

Using dynamic_rnn()


In [21]:
n_steps = 2
n_inputs = 3
n_neurons = 5

In [22]:
reset_graph()

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])

basic_cell = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
outputs, states = tf.nn.dynamic_rnn(basic_cell, X, dtype=tf.float32)

In [23]:
init = tf.global_variables_initializer()

In [24]:
X_batch = np.array([
        [[0, 1, 2], [9, 8, 7]], # instance 1
        [[3, 4, 5], [0, 0, 0]], # instance 2
        [[6, 7, 8], [6, 5, 4]], # instance 3
        [[9, 0, 1], [3, 2, 1]], # instance 4
    ])

with tf.Session() as sess:
    init.run()
    outputs_val = outputs.eval(feed_dict={X: X_batch})

In [25]:
print(outputs_val)


[[[ 0.90414059  0.49652389 -0.86023885  0.39286929 -0.30018684]
  [ 0.99999994  0.76327085 -1.          0.99888641 -0.7229408 ]]

 [[ 0.99988353  0.77785885 -0.99992859  0.9727248  -0.78886396]
  [ 0.44762579 -0.06916652 -0.51665425 -0.84579295  0.88807124]]

 [[ 0.99999976  0.91130525 -0.99999994  0.99912328 -0.94954252]
  [ 0.9999842   0.20443429 -0.99999785  0.94190502  0.3501083 ]]

 [[ 0.99490303  0.88642204 -0.99999577  0.99939179  0.97382319]
  [ 0.95951742  0.73643577 -0.99815822 -0.26513484  0.06432986]]]

In [26]:
show_graph(tf.get_default_graph())


Setting the sequence lengths


In [27]:
n_steps = 2
n_inputs = 3
n_neurons = 5

reset_graph()

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
basic_cell = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)

In [28]:
seq_length = tf.placeholder(tf.int32, [None])
outputs, states = tf.nn.dynamic_rnn(basic_cell, X, dtype=tf.float32,
                                    sequence_length=seq_length)

In [29]:
init = tf.global_variables_initializer()

In [30]:
X_batch = np.array([
        # step 0     step 1
        [[0, 1, 2], [9, 8, 7]], # instance 1
        [[3, 4, 5], [0, 0, 0]], # instance 2 (padded with zero vectors)
        [[6, 7, 8], [6, 5, 4]], # instance 3
        [[9, 0, 1], [3, 2, 1]], # instance 4
    ])
seq_length_batch = np.array([2, 1, 2, 2])

In [31]:
with tf.Session() as sess:
    init.run()
    outputs_val, states_val = sess.run(
        [outputs, states], feed_dict={X: X_batch, seq_length: seq_length_batch})

In [32]:
print(outputs_val)


[[[-0.68579948 -0.25901747 -0.80249101 -0.18141513 -0.37491536]
  [-0.99996698 -0.94501185  0.98072106 -0.9689762   0.99966913]]

 [[-0.99099374 -0.64768541 -0.67801034 -0.7415446   0.7719509 ]
  [ 0.          0.          0.          0.          0.        ]]

 [[-0.99978048 -0.85583007 -0.49696958 -0.93838578  0.98505187]
  [-0.99951065 -0.89148796  0.94170523 -0.38407657  0.97499216]]

 [[-0.02052618 -0.94588047  0.99935204  0.37283331  0.9998163 ]
  [-0.91052347  0.05769409  0.47446665 -0.44611037  0.89394671]]]

In [33]:
print(states_val)


[[-0.99996698 -0.94501185  0.98072106 -0.9689762   0.99966913]
 [-0.99099374 -0.64768541 -0.67801034 -0.7415446   0.7719509 ]
 [-0.99951065 -0.89148796  0.94170523 -0.38407657  0.97499216]
 [-0.91052347  0.05769409  0.47446665 -0.44611037  0.89394671]]

Training a sequence classifier

Note: the book uses tensorflow.contrib.layers.fully_connected() rather than tf.layers.dense() (which did not exist when this chapter was written). It is now preferable to use tf.layers.dense(), because anything in the contrib module may change or be deleted without notice. The dense() function is almost identical to the fully_connected() function. The main differences relevant to this chapter are:

  • several parameters are renamed: scope becomes name, activation_fn becomes activation (and similarly the _fn suffix is removed from other parameters such as normalizer_fn), weights_initializer becomes kernel_initializer, etc.
  • the default activation is now None rather than tf.nn.relu.

In [34]:
reset_graph()

n_steps = 28
n_inputs = 28
n_neurons = 150
n_outputs = 10

learning_rate = 0.001

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.int32, [None])

basic_cell = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
outputs, states = tf.nn.dynamic_rnn(basic_cell, X, dtype=tf.float32)

logits = tf.layers.dense(states, n_outputs)
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y,
                                                          logits=logits)
loss = tf.reduce_mean(xentropy)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()

In [35]:
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/")
X_test = mnist.test.images.reshape((-1, n_steps, n_inputs))
y_test = mnist.test.labels


Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz

In [36]:
n_epochs = 100
batch_size = 150

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for iteration in range(mnist.train.num_examples // batch_size):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            X_batch = X_batch.reshape((-1, n_steps, n_inputs))
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
        acc_test = accuracy.eval(feed_dict={X: X_test, y: y_test})
        print(epoch, "Train accuracy:", acc_train, "Test accuracy:", acc_test)


0 Train accuracy: 0.94 Test accuracy: 0.9308
1 Train accuracy: 0.933333 Test accuracy: 0.9431
2 Train accuracy: 0.94 Test accuracy: 0.9531
3 Train accuracy: 0.96 Test accuracy: 0.9616
4 Train accuracy: 0.96 Test accuracy: 0.9641
5 Train accuracy: 0.966667 Test accuracy: 0.968
6 Train accuracy: 0.98 Test accuracy: 0.9616
7 Train accuracy: 0.98 Test accuracy: 0.9678
8 Train accuracy: 0.98 Test accuracy: 0.9691
9 Train accuracy: 0.973333 Test accuracy: 0.9734
10 Train accuracy: 0.993333 Test accuracy: 0.9732
11 Train accuracy: 0.966667 Test accuracy: 0.9747
12 Train accuracy: 0.986667 Test accuracy: 0.9747
13 Train accuracy: 0.98 Test accuracy: 0.9697
14 Train accuracy: 0.973333 Test accuracy: 0.9756
15 Train accuracy: 0.993333 Test accuracy: 0.9773
16 Train accuracy: 0.993333 Test accuracy: 0.9738
17 Train accuracy: 0.986667 Test accuracy: 0.9727
18 Train accuracy: 0.986667 Test accuracy: 0.9797
19 Train accuracy: 0.986667 Test accuracy: 0.9748
20 Train accuracy: 0.98 Test accuracy: 0.9723
21 Train accuracy: 0.986667 Test accuracy: 0.9779
22 Train accuracy: 0.98 Test accuracy: 0.9764
23 Train accuracy: 0.98 Test accuracy: 0.9732
24 Train accuracy: 0.993333 Test accuracy: 0.9784
25 Train accuracy: 0.993333 Test accuracy: 0.9785
26 Train accuracy: 0.986667 Test accuracy: 0.9801
27 Train accuracy: 0.98 Test accuracy: 0.9661
28 Train accuracy: 0.973333 Test accuracy: 0.9819
29 Train accuracy: 1.0 Test accuracy: 0.9772
30 Train accuracy: 0.986667 Test accuracy: 0.9784
31 Train accuracy: 0.993333 Test accuracy: 0.9752
32 Train accuracy: 0.98 Test accuracy: 0.972
33 Train accuracy: 1.0 Test accuracy: 0.9809
34 Train accuracy: 0.993333 Test accuracy: 0.9775
35 Train accuracy: 0.973333 Test accuracy: 0.9736
36 Train accuracy: 0.993333 Test accuracy: 0.977
37 Train accuracy: 1.0 Test accuracy: 0.9817
38 Train accuracy: 0.993333 Test accuracy: 0.9723
39 Train accuracy: 0.993333 Test accuracy: 0.9769
40 Train accuracy: 0.993333 Test accuracy: 0.9791
41 Train accuracy: 0.993333 Test accuracy: 0.9787
42 Train accuracy: 0.986667 Test accuracy: 0.9821
43 Train accuracy: 0.993333 Test accuracy: 0.9777
44 Train accuracy: 0.986667 Test accuracy: 0.975
45 Train accuracy: 0.986667 Test accuracy: 0.98
46 Train accuracy: 0.986667 Test accuracy: 0.9786
47 Train accuracy: 0.993333 Test accuracy: 0.9809
48 Train accuracy: 0.973333 Test accuracy: 0.9787
49 Train accuracy: 0.986667 Test accuracy: 0.9815
50 Train accuracy: 1.0 Test accuracy: 0.9774
51 Train accuracy: 0.98 Test accuracy: 0.9713
52 Train accuracy: 1.0 Test accuracy: 0.9803
53 Train accuracy: 0.993333 Test accuracy: 0.9789
54 Train accuracy: 1.0 Test accuracy: 0.9805
55 Train accuracy: 1.0 Test accuracy: 0.9786
56 Train accuracy: 0.986667 Test accuracy: 0.9758
57 Train accuracy: 0.993333 Test accuracy: 0.9788
58 Train accuracy: 0.98 Test accuracy: 0.9811
59 Train accuracy: 0.986667 Test accuracy: 0.9765
60 Train accuracy: 1.0 Test accuracy: 0.979
61 Train accuracy: 0.993333 Test accuracy: 0.976
62 Train accuracy: 0.993333 Test accuracy: 0.9787
63 Train accuracy: 0.98 Test accuracy: 0.977
64 Train accuracy: 0.993333 Test accuracy: 0.9822
65 Train accuracy: 0.993333 Test accuracy: 0.9719
66 Train accuracy: 1.0 Test accuracy: 0.9782
67 Train accuracy: 0.986667 Test accuracy: 0.9788
68 Train accuracy: 0.993333 Test accuracy: 0.9807
69 Train accuracy: 1.0 Test accuracy: 0.978
70 Train accuracy: 0.973333 Test accuracy: 0.9806
71 Train accuracy: 1.0 Test accuracy: 0.9786
72 Train accuracy: 0.993333 Test accuracy: 0.9782
73 Train accuracy: 0.986667 Test accuracy: 0.976
74 Train accuracy: 1.0 Test accuracy: 0.9784
75 Train accuracy: 0.993333 Test accuracy: 0.9758
76 Train accuracy: 0.986667 Test accuracy: 0.9779
77 Train accuracy: 1.0 Test accuracy: 0.9741
78 Train accuracy: 0.986667 Test accuracy: 0.9737
79 Train accuracy: 0.986667 Test accuracy: 0.9754
80 Train accuracy: 0.986667 Test accuracy: 0.98
81 Train accuracy: 0.986667 Test accuracy: 0.9807
82 Train accuracy: 0.993333 Test accuracy: 0.979
83 Train accuracy: 1.0 Test accuracy: 0.979
84 Train accuracy: 0.993333 Test accuracy: 0.9752
85 Train accuracy: 0.993333 Test accuracy: 0.9775
86 Train accuracy: 0.986667 Test accuracy: 0.975
87 Train accuracy: 0.993333 Test accuracy: 0.9763
88 Train accuracy: 0.993333 Test accuracy: 0.972
89 Train accuracy: 1.0 Test accuracy: 0.9782
90 Train accuracy: 1.0 Test accuracy: 0.9795
91 Train accuracy: 0.986667 Test accuracy: 0.9742
92 Train accuracy: 0.986667 Test accuracy: 0.9775
93 Train accuracy: 0.986667 Test accuracy: 0.9803
94 Train accuracy: 1.0 Test accuracy: 0.9806
95 Train accuracy: 0.993333 Test accuracy: 0.977
96 Train accuracy: 0.993333 Test accuracy: 0.9781
97 Train accuracy: 0.993333 Test accuracy: 0.9751
98 Train accuracy: 0.98 Test accuracy: 0.9794
99 Train accuracy: 1.0 Test accuracy: 0.9804

Multi-layer RNN


In [37]:
reset_graph()

n_steps = 28
n_inputs = 28
n_outputs = 10

learning_rate = 0.001

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.int32, [None])

In [38]:
n_neurons = 100
n_layers = 3

layers = [tf.contrib.rnn.BasicRNNCell(num_units=n_neurons,
                                      activation=tf.nn.relu)
          for layer in range(n_layers)]
multi_layer_cell = tf.contrib.rnn.MultiRNNCell(layers)
outputs, states = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32)

In [39]:
states_concat = tf.concat(axis=1, values=states)
logits = tf.layers.dense(states_concat, n_outputs)
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()

In [40]:
n_epochs = 10
batch_size = 150

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for iteration in range(mnist.train.num_examples // batch_size):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            X_batch = X_batch.reshape((-1, n_steps, n_inputs))
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
        acc_test = accuracy.eval(feed_dict={X: X_test, y: y_test})
        print(epoch, "Train accuracy:", acc_train, "Test accuracy:", acc_test)


0 Train accuracy: 0.96 Test accuracy: 0.9418
1 Train accuracy: 0.98 Test accuracy: 0.9686
2 Train accuracy: 0.94 Test accuracy: 0.9693
3 Train accuracy: 0.973333 Test accuracy: 0.9715
4 Train accuracy: 0.986667 Test accuracy: 0.9758
5 Train accuracy: 0.993333 Test accuracy: 0.9774
6 Train accuracy: 0.993333 Test accuracy: 0.9815
7 Train accuracy: 1.0 Test accuracy: 0.9765
8 Train accuracy: 0.986667 Test accuracy: 0.9831
9 Train accuracy: 0.986667 Test accuracy: 0.9804

Time series


In [41]:
t_min, t_max = 0, 30
resolution = 0.1

def time_series(t):
    return t * np.sin(t) / 3 + 2 * np.sin(t*5)

def next_batch(batch_size, n_steps):
    t0 = np.random.rand(batch_size, 1) * (t_max - t_min - n_steps * resolution)
    Ts = t0 + np.arange(0., n_steps + 1) * resolution
    ys = time_series(Ts)
    return ys[:, :-1].reshape(-1, n_steps, 1), ys[:, 1:].reshape(-1, n_steps, 1)

In [42]:
t = np.linspace(t_min, t_max, int((t_max - t_min) / resolution))

n_steps = 20
t_instance = np.linspace(12.2, 12.2 + resolution * (n_steps + 1), n_steps + 1)

plt.figure(figsize=(11,4))
plt.subplot(121)
plt.title("A time series (generated)", fontsize=14)
plt.plot(t, time_series(t), label=r"$t . \sin(t) / 3 + 2 . \sin(5t)$")
plt.plot(t_instance[:-1], time_series(t_instance[:-1]), "b-", linewidth=3, label="A training instance")
plt.legend(loc="lower left", fontsize=14)
plt.axis([0, 30, -17, 13])
plt.xlabel("Time")
plt.ylabel("Value")

plt.subplot(122)
plt.title("A training instance", fontsize=14)
plt.plot(t_instance[:-1], time_series(t_instance[:-1]), "bo", markersize=10, label="instance")
plt.plot(t_instance[1:], time_series(t_instance[1:]), "w*", markersize=10, label="target")
plt.legend(loc="upper left")
plt.xlabel("Time")


save_fig("time_series_plot")
plt.show()


Saving figure time_series_plot

In [43]:
X_batch, y_batch = next_batch(1, n_steps)

In [44]:
np.c_[X_batch[0], y_batch[0]]


Out[44]:
array([[ 1.38452097,  2.05081182],
       [ 2.05081182,  2.29742291],
       [ 2.29742291,  2.0465599 ],
       [ 2.0465599 ,  1.34009916],
       [ 1.34009916,  0.32948704],
       [ 0.32948704, -0.76115235],
       [-0.76115235, -1.68967022],
       [-1.68967022, -2.25492776],
       [-2.25492776, -2.34576159],
       [-2.34576159, -1.96789418],
       [-1.96789418, -1.24220428],
       [-1.24220428, -0.37478448],
       [-0.37478448,  0.39387907],
       [ 0.39387907,  0.84815766],
       [ 0.84815766,  0.85045064],
       [ 0.85045064,  0.3752526 ],
       [ 0.3752526 , -0.48422846],
       [-0.48422846, -1.53852738],
       [-1.53852738, -2.54795941],
       [-2.54795941, -3.28097239]])

Using an OuputProjectionWrapper

Let's create the RNN. It will contain 100 recurrent neurons and we will unroll it over 20 time steps since each traiing instance will be 20 inputs long. Each input will contain only one feature (the value at that time). The targets are also sequences of 20 inputs, each containing a sigle value:


In [45]:
reset_graph()

n_steps = 20
n_inputs = 1
n_neurons = 100
n_outputs = 1

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.float32, [None, n_steps, n_outputs])

cell = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons, activation=tf.nn.relu)
outputs, states = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)

At each time step we now have an output vector of size 100. But what we actually want is a single output value at each time step. The simplest solution is to wrap the cell in an OutputProjectionWrapper.


In [46]:
reset_graph()

n_steps = 20
n_inputs = 1
n_neurons = 100
n_outputs = 1

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.float32, [None, n_steps, n_outputs])

In [47]:
cell = tf.contrib.rnn.OutputProjectionWrapper(
    tf.contrib.rnn.BasicRNNCell(num_units=n_neurons, activation=tf.nn.relu),
    output_size=n_outputs)

In [48]:
outputs, states = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)

In [49]:
learning_rate = 0.001

loss = tf.reduce_mean(tf.square(outputs - y)) # MSE
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()

In [50]:
saver = tf.train.Saver()

In [51]:
n_iterations = 1500
batch_size = 50

with tf.Session() as sess:
    init.run()
    for iteration in range(n_iterations):
        X_batch, y_batch = next_batch(batch_size, n_steps)
        sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        if iteration % 100 == 0:
            mse = loss.eval(feed_dict={X: X_batch, y: y_batch})
            print(iteration, "\tMSE:", mse)
    
    saver.save(sess, "./my_time_series_model") # not shown in the book


0 	MSE: 13.6543
100 	MSE: 0.538476
200 	MSE: 0.168532
300 	MSE: 0.0879579
400 	MSE: 0.0633425
500 	MSE: 0.061859
600 	MSE: 0.0558801
700 	MSE: 0.0498718
800 	MSE: 0.0518417
900 	MSE: 0.0482838
1000 	MSE: 0.0483549
1100 	MSE: 0.0503321
1200 	MSE: 0.0412116
1300 	MSE: 0.0488435
1400 	MSE: 0.0426057

In [52]:
with tf.Session() as sess:                          # not shown in the book
    saver.restore(sess, "./my_time_series_model")   # not shown

    X_new = time_series(np.array(t_instance[:-1].reshape(-1, n_steps, n_inputs)))
    y_pred = sess.run(outputs, feed_dict={X: X_new})


INFO:tensorflow:Restoring parameters from ./my_time_series_model

In [53]:
y_pred


Out[53]:
array([[[-3.42596436],
        [-2.48950148],
        [-1.1358937 ],
        [ 0.75142008],
        [ 2.19939661],
        [ 3.14104176],
        [ 3.54801917],
        [ 3.34113908],
        [ 2.82566142],
        [ 2.17759967],
        [ 1.65191436],
        [ 1.55619645],
        [ 1.94783175],
        [ 2.74632907],
        [ 3.89091802],
        [ 5.11678171],
        [ 6.13101864],
        [ 6.67043686],
        [ 6.62354612],
        [ 6.05428839]]], dtype=float32)

In [54]:
plt.title("Testing the model", fontsize=14)
plt.plot(t_instance[:-1], time_series(t_instance[:-1]), "bo", markersize=10, label="instance")
plt.plot(t_instance[1:], time_series(t_instance[1:]), "w*", markersize=10, label="target")
plt.plot(t_instance[1:], y_pred[0,:,0], "r.", markersize=10, label="prediction")
plt.legend(loc="upper left")
plt.xlabel("Time")

save_fig("time_series_pred_plot")
plt.show()


Saving figure time_series_pred_plot

Without using an OutputProjectionWrapper


In [55]:
reset_graph()

n_steps = 20
n_inputs = 1
n_neurons = 100

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.float32, [None, n_steps, n_outputs])

In [56]:
cell = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons, activation=tf.nn.relu)
rnn_outputs, states = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)

In [57]:
n_outputs = 1
learning_rate = 0.001

In [58]:
stacked_rnn_outputs = tf.reshape(rnn_outputs, [-1, n_neurons])
stacked_outputs = tf.layers.dense(stacked_rnn_outputs, n_outputs)
outputs = tf.reshape(stacked_outputs, [-1, n_steps, n_outputs])

In [59]:
loss = tf.reduce_mean(tf.square(outputs - y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [60]:
n_iterations = 1500
batch_size = 50

with tf.Session() as sess:
    init.run()
    for iteration in range(n_iterations):
        X_batch, y_batch = next_batch(batch_size, n_steps)
        sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        if iteration % 100 == 0:
            mse = loss.eval(feed_dict={X: X_batch, y: y_batch})
            print(iteration, "\tMSE:", mse)
    
    X_new = time_series(np.array(t_instance[:-1].reshape(-1, n_steps, n_inputs)))
    y_pred = sess.run(outputs, feed_dict={X: X_new})
    
    saver.save(sess, "./my_time_series_model")


0 	MSE: 11.6768
100 	MSE: 0.51119
200 	MSE: 0.14452
300 	MSE: 0.0760974
400 	MSE: 0.063713
500 	MSE: 0.0601674
600 	MSE: 0.0531676
700 	MSE: 0.0493623
800 	MSE: 0.0519282
900 	MSE: 0.0482475
1000 	MSE: 0.048083
1100 	MSE: 0.0484352
1200 	MSE: 0.0418098
1300 	MSE: 0.0477387
1400 	MSE: 0.0419062

In [61]:
y_pred


Out[61]:
array([[[-3.42077947],
        [-2.47134852],
        [-1.14368439],
        [ 0.75839251],
        [ 2.15983796],
        [ 3.11996722],
        [ 3.52640414],
        [ 3.43011165],
        [ 2.8376286 ],
        [ 2.18515253],
        [ 1.6659894 ],
        [ 1.54036307],
        [ 1.89834416],
        [ 2.73356843],
        [ 3.9192028 ],
        [ 5.16150093],
        [ 6.10899305],
        [ 6.66055822],
        [ 6.65600348],
        [ 6.09106874]]], dtype=float32)

In [62]:
plt.title("Testing the model", fontsize=14)
plt.plot(t_instance[:-1], time_series(t_instance[:-1]), "bo", markersize=10, label="instance")
plt.plot(t_instance[1:], time_series(t_instance[1:]), "w*", markersize=10, label="target")
plt.plot(t_instance[1:], y_pred[0,:,0], "r.", markersize=10, label="prediction")
plt.legend(loc="upper left")
plt.xlabel("Time")

plt.show()


Generating a creative new sequence


In [63]:
with tf.Session() as sess:                        # not shown in the book
    saver.restore(sess, "./my_time_series_model") # not shown

    sequence = [0.] * n_steps
    for iteration in range(300):
        X_batch = np.array(sequence[-n_steps:]).reshape(1, n_steps, 1)
        y_pred = sess.run(outputs, feed_dict={X: X_batch})
        sequence.append(y_pred[0, -1, 0])


INFO:tensorflow:Restoring parameters from ./my_time_series_model

In [64]:
plt.figure(figsize=(8,4))
plt.plot(np.arange(len(sequence)), sequence, "b-")
plt.plot(t[:n_steps], sequence[:n_steps], "b-", linewidth=3)
plt.xlabel("Time")
plt.ylabel("Value")
plt.show()



In [65]:
with tf.Session() as sess:
    saver.restore(sess, "./my_time_series_model")

    sequence1 = [0. for i in range(n_steps)]
    for iteration in range(len(t) - n_steps):
        X_batch = np.array(sequence1[-n_steps:]).reshape(1, n_steps, 1)
        y_pred = sess.run(outputs, feed_dict={X: X_batch})
        sequence1.append(y_pred[0, -1, 0])

    sequence2 = [time_series(i * resolution + t_min + (t_max-t_min/3)) for i in range(n_steps)]
    for iteration in range(len(t) - n_steps):
        X_batch = np.array(sequence2[-n_steps:]).reshape(1, n_steps, 1)
        y_pred = sess.run(outputs, feed_dict={X: X_batch})
        sequence2.append(y_pred[0, -1, 0])

plt.figure(figsize=(11,4))
plt.subplot(121)
plt.plot(t, sequence1, "b-")
plt.plot(t[:n_steps], sequence1[:n_steps], "b-", linewidth=3)
plt.xlabel("Time")
plt.ylabel("Value")

plt.subplot(122)
plt.plot(t, sequence2, "b-")
plt.plot(t[:n_steps], sequence2[:n_steps], "b-", linewidth=3)
plt.xlabel("Time")
save_fig("creative_sequence_plot")
plt.show()


INFO:tensorflow:Restoring parameters from ./my_time_series_model
Saving figure creative_sequence_plot

Deep RNN

MultiRNNCell


In [66]:
reset_graph()

n_inputs = 2
n_steps = 5

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])

In [67]:
n_neurons = 100
n_layers = 3

layers = [tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
          for layer in range(n_layers)]
multi_layer_cell = tf.contrib.rnn.MultiRNNCell(layers)
outputs, states = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32)

In [68]:
init = tf.global_variables_initializer()

In [69]:
X_batch = rnd.rand(2, n_steps, n_inputs)

In [70]:
with tf.Session() as sess:
    init.run()
    outputs_val, states_val = sess.run([outputs, states], feed_dict={X: X_batch})

In [71]:
outputs_val.shape


Out[71]:
(2, 5, 100)

Distributing a Deep RNN Across Multiple GPUs

Do NOT do this:


In [72]:
with tf.device("/gpu:0"):  # BAD! This is ignored.
    layer1 = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)

with tf.device("/gpu:1"):  # BAD! Ignored again.
    layer2 = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)

Instead, you need a DeviceCellWrapper:


In [73]:
import tensorflow as tf

class DeviceCellWrapper(tf.contrib.rnn.RNNCell):
  def __init__(self, device, cell):
    self._cell = cell
    self._device = device

  @property
  def state_size(self):
    return self._cell.state_size

  @property
  def output_size(self):
    return self._cell.output_size

  def __call__(self, inputs, state, scope=None):
    with tf.device(self._device):
        return self._cell(inputs, state, scope)

In [74]:
reset_graph()

n_inputs = 5
n_steps = 20
n_neurons = 100

X = tf.placeholder(tf.float32, shape=[None, n_steps, n_inputs])

In [75]:
devices = ["/cpu:0", "/cpu:0", "/cpu:0"] # replace with ["/gpu:0", "/gpu:1", "/gpu:2"] if you have 3 GPUs
cells = [DeviceCellWrapper(dev,tf.contrib.rnn.BasicRNNCell(num_units=n_neurons))
         for dev in devices]
multi_layer_cell = tf.contrib.rnn.MultiRNNCell(cells)
outputs, states = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32)

In [76]:
init = tf.global_variables_initializer()

In [77]:
with tf.Session() as sess:
    init.run()
    print(sess.run(outputs, feed_dict={X: rnd.rand(2, n_steps, n_inputs)}))


[[[-0.06683909 -0.06814943  0.12806301 ..., -0.04951219  0.0169118
    0.09129722]
  [-0.03898398 -0.32816607  0.25709429 ..., -0.22360352 -0.00203764
    0.18901907]
  [-0.14598769 -0.03324183  0.06588719 ..., -0.36336255 -0.117153
    0.39544109]
  ..., 
  [-0.52596134  0.04002573  0.14033252 ...,  0.18522167  0.25101244
   -0.05308188]
  [-0.45618156 -0.11686647 -0.09905577 ..., -0.17943858  0.27567461
   -0.04363405]
  [-0.55723065  0.13874871 -0.14983818 ...,  0.04673974  0.10338999
   -0.03823486]]

 [[-0.0191099  -0.06458578  0.08206855 ..., -0.07772326 -0.05498064
    0.01358664]
  [-0.05150904 -0.36381066  0.0913103  ..., -0.12480559 -0.03924585
    0.06585156]
  [-0.29961377 -0.00120922  0.06789977 ..., -0.27556923 -0.15278165
    0.21452278]
  ..., 
  [-0.6460501   0.17479922  0.14066698 ..., -0.08995064 -0.03049678
    0.05738082]
  [-0.61097401 -0.17900243 -0.23193845 ..., -0.2500132   0.25146627
    0.36902413]
  [-0.25920284 -0.08149087  0.19740498 ..., -0.32611009 -0.02686078
    0.11232778]]]

Dropout


In [78]:
reset_graph()

n_inputs = 1
n_neurons = 100
n_layers = 3
n_steps = 20
n_outputs = 1

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.float32, [None, n_steps, n_outputs])

In [79]:
keep_prob = 0.5

cells = [tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
         for layer in range(n_layers)]
cells_drop = [tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=keep_prob)
              for cell in cells]
multi_layer_cell = tf.contrib.rnn.MultiRNNCell(cells_drop)
rnn_outputs, states = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32)

In [80]:
learning_rate = 0.01

stacked_rnn_outputs = tf.reshape(rnn_outputs, [-1, n_neurons])
stacked_outputs = tf.layers.dense(stacked_rnn_outputs, n_outputs)
outputs = tf.reshape(stacked_outputs, [-1, n_steps, n_outputs])

loss = tf.reduce_mean(tf.square(outputs - y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

Unfortunately, this code is only usable for training, because the DropoutWrapper class has no training parameter, so it always applies dropout, even when the model is not being trained, so we must first train the model, then create a different model for testing, without the DropoutWrapper.


In [81]:
n_iterations = 1000
batch_size = 50

with tf.Session() as sess:
    init.run()
    for iteration in range(n_iterations):
        X_batch, y_batch = next_batch(batch_size, n_steps)
        _, mse = sess.run([training_op, loss], feed_dict={X: X_batch, y: y_batch})
        if iteration % 100 == 0:
            print(iteration, "Training MSE:", mse)
    
    saver.save(sess, "./my_dropout_time_series_model")


0 Training MSE: 13.7079
100 Training MSE: 4.25301
200 Training MSE: 3.3346
300 Training MSE: 3.62894
400 Training MSE: 3.29399
500 Training MSE: 3.88701
600 Training MSE: 3.38845
700 Training MSE: 3.05871
800 Training MSE: 3.84628
900 Training MSE: 4.78431

Now that the model is trained, we need to create the model again, but without the DropoutWrapper for testing:


In [82]:
reset_graph()

n_inputs = 1
n_neurons = 100
n_layers = 3
n_steps = 20
n_outputs = 1

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.float32, [None, n_steps, n_outputs])

keep_prob = 0.5

cells = [tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
         for layer in range(n_layers)]
multi_layer_cell = tf.contrib.rnn.MultiRNNCell(cells)
rnn_outputs, states = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32)

learning_rate = 0.01

stacked_rnn_outputs = tf.reshape(rnn_outputs, [-1, n_neurons])
stacked_outputs = tf.layers.dense(stacked_rnn_outputs, n_outputs)
outputs = tf.reshape(stacked_outputs, [-1, n_steps, n_outputs])

loss = tf.reduce_mean(tf.square(outputs - y))

init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [83]:
with tf.Session() as sess:
    saver.restore(sess, "./my_dropout_time_series_model")

    X_new = time_series(np.array(t_instance[:-1].reshape(-1, n_steps, n_inputs)))
    y_pred = sess.run(outputs, feed_dict={X: X_new})

plt.title("Testing the model", fontsize=14)
plt.plot(t_instance[:-1], time_series(t_instance[:-1]), "bo", markersize=10, label="instance")
plt.plot(t_instance[1:], time_series(t_instance[1:]), "w*", markersize=10, label="target")
plt.plot(t_instance[1:], y_pred[0,:,0], "r.", markersize=10, label="prediction")
plt.legend(loc="upper left")
plt.xlabel("Time")

plt.show()


INFO:tensorflow:Restoring parameters from ./my_dropout_time_series_model

Oops, it seems that Dropout does not help at all in this particular case. :/

Another option is to write a script with a command line argument to specify whether you want to train the mode or use it for making predictions:


In [84]:
reset_graph()

import sys
training = True  # in a script, this would be (sys.argv[-1] == "train") instead

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.float32, [None, n_steps, n_outputs])

cells = [tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
         for layer in range(n_layers)]
if training:
    cells = [tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=keep_prob)
             for cell in cells]
multi_layer_cell = tf.contrib.rnn.MultiRNNCell(cells)
rnn_outputs, states = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32)

stacked_rnn_outputs = tf.reshape(rnn_outputs, [-1, n_neurons])    # not shown in the book
stacked_outputs = tf.layers.dense(stacked_rnn_outputs, n_outputs) # not shown
outputs = tf.reshape(stacked_outputs, [-1, n_steps, n_outputs])   # not shown
loss = tf.reduce_mean(tf.square(outputs - y))                     # not shown
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)   # not shown
training_op = optimizer.minimize(loss)                            # not shown
init = tf.global_variables_initializer()                          # not shown
saver = tf.train.Saver()                                          # not shown

with tf.Session() as sess:
    if training:
        init.run()
        for iteration in range(n_iterations):
            X_batch, y_batch = next_batch(batch_size, n_steps)    # not shown
            _, mse = sess.run([training_op, loss], feed_dict={X: X_batch, y: y_batch}) # not shown
            if iteration % 100 == 0:                              # not shown
                print(iteration, "Training MSE:", mse)            # not shown
        save_path = saver.save(sess, "/tmp/my_model.ckpt")
    else:
        saver.restore(sess, "/tmp/my_model.ckpt")
        X_new = time_series(np.array(t_instance[:-1].reshape(-1, n_steps, n_inputs))) # not shown
        y_pred = sess.run(outputs, feed_dict={X: X_new})                              # not shown


0 Training MSE: 13.6546
100 Training MSE: 4.41883
200 Training MSE: 3.23384
300 Training MSE: 3.71355
400 Training MSE: 2.6646
500 Training MSE: 3.77632
600 Training MSE: 3.0631
700 Training MSE: 3.56676
800 Training MSE: 3.84577
900 Training MSE: 4.79746

LSTM


In [85]:
reset_graph()

lstm_cell = tf.contrib.rnn.BasicLSTMCell(num_units=n_neurons)

In [86]:
n_steps = 28
n_inputs = 28
n_neurons = 150
n_outputs = 10
n_layers = 3

learning_rate = 0.001

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.int32, [None])

lstm_cells = [tf.contrib.rnn.BasicLSTMCell(num_units=n_neurons)
              for layer in range(n_layers)]
multi_cell = tf.contrib.rnn.MultiRNNCell(lstm_cells)
outputs, states = tf.nn.dynamic_rnn(multi_cell, X, dtype=tf.float32)
top_layer_h_state = states[-1][1]
logits = tf.layers.dense(top_layer_h_state, n_outputs, name="softmax")
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    
init = tf.global_variables_initializer()

In [87]:
states


Out[87]:
(LSTMStateTuple(c=<tf.Tensor 'rnn/while/Exit_2:0' shape=(?, 150) dtype=float32>, h=<tf.Tensor 'rnn/while/Exit_3:0' shape=(?, 150) dtype=float32>),
 LSTMStateTuple(c=<tf.Tensor 'rnn/while/Exit_4:0' shape=(?, 150) dtype=float32>, h=<tf.Tensor 'rnn/while/Exit_5:0' shape=(?, 150) dtype=float32>),
 LSTMStateTuple(c=<tf.Tensor 'rnn/while/Exit_6:0' shape=(?, 150) dtype=float32>, h=<tf.Tensor 'rnn/while/Exit_7:0' shape=(?, 150) dtype=float32>))

In [88]:
top_layer_h_state


Out[88]:
<tf.Tensor 'rnn/while/Exit_7:0' shape=(?, 150) dtype=float32>

In [89]:
n_epochs = 10
batch_size = 150

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for iteration in range(mnist.train.num_examples // batch_size):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            X_batch = X_batch.reshape((batch_size, n_steps, n_inputs))
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
        acc_test = accuracy.eval(feed_dict={X: X_test, y: y_test})
        print("Epoch", epoch, "Train accuracy =", acc_train, "Test accuracy =", acc_test)


Epoch 0 Train accuracy = 0.966667 Test accuracy = 0.9525
Epoch 1 Train accuracy = 0.993333 Test accuracy = 0.9747
Epoch 2 Train accuracy = 0.993333 Test accuracy = 0.9775
Epoch 3 Train accuracy = 0.993333 Test accuracy = 0.9813
Epoch 4 Train accuracy = 0.986667 Test accuracy = 0.9837
Epoch 5 Train accuracy = 1.0 Test accuracy = 0.9831
Epoch 6 Train accuracy = 1.0 Test accuracy = 0.9834
Epoch 7 Train accuracy = 0.993333 Test accuracy = 0.9862
Epoch 8 Train accuracy = 1.0 Test accuracy = 0.9863
Epoch 9 Train accuracy = 0.993333 Test accuracy = 0.9863

In [90]:
lstm_cell = tf.contrib.rnn.LSTMCell(num_units=n_neurons, use_peepholes=True)

In [91]:
gru_cell = tf.contrib.rnn.GRUCell(num_units=n_neurons)

Embeddings

This section is based on TensorFlow's Word2Vec tutorial.

Fetch the data


In [92]:
from six.moves import urllib

import errno
import os
import zipfile

WORDS_PATH = "datasets/words"
WORDS_URL = 'http://mattmahoney.net/dc/text8.zip'

def mkdir_p(path):
    """Create directories, ok if they already exist.
    
    This is for python 2 support. In python >=3.2, simply use:
    >>> os.makedirs(path, exist_ok=True)
    """
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise

def fetch_words_data(words_url=WORDS_URL, words_path=WORDS_PATH):
    os.makedirs(words_path, exist_ok=True)
    zip_path = os.path.join(words_path, "words.zip")
    if not os.path.exists(zip_path):
        urllib.request.urlretrieve(words_url, zip_path)
    with zipfile.ZipFile(zip_path) as f:
        data = f.read(f.namelist()[0])
    return data.decode("ascii").split()

In [93]:
words = fetch_words_data()

In [94]:
words[:5]


Out[94]:
['anarchism', 'originated', 'as', 'a', 'term']

Build the dictionary


In [95]:
from collections import Counter

vocabulary_size = 50000

vocabulary = [("UNK", None)] + Counter(words).most_common(vocabulary_size - 1)
vocabulary = np.array([word for word, _ in vocabulary])
dictionary = {word: code for code, word in enumerate(vocabulary)}
data = np.array([dictionary.get(word, 0) for word in words])

In [96]:
" ".join(words[:9]), data[:9]


Out[96]:
('anarchism originated as a term of abuse first used',
 array([5241, 3082,   12,    6,  195,    2, 3136,   46,   59]))

In [97]:
" ".join([vocabulary[word_index] for word_index in [5241, 3081, 12, 6, 195, 2, 3134, 46, 59]])


Out[97]:
'anarchism didn as a term of presidency first used'

In [98]:
words[24], data[24]


Out[98]:
('culottes', 0)

Generate batches


In [99]:
import random
from collections import deque

def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1 # [ skip_window target skip_window ]
    buffer = deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        target = skip_window  # target label at the center of the buffer
        targets_to_avoid = [ skip_window ]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels

In [100]:
data_index=0
batch, labels = generate_batch(8, 2, 1)

In [101]:
batch, [vocabulary[word] for word in batch]


Out[101]:
(array([3082, 3082,   12,   12,    6,    6,  195,  195], dtype=int32),
 ['originated', 'originated', 'as', 'as', 'a', 'a', 'term', 'term'])

In [102]:
labels, [vocabulary[word] for word in labels[:, 0]]


Out[102]:
(array([[  12],
        [5241],
        [3082],
        [   6],
        [ 195],
        [  12],
        [   2],
        [   6]], dtype=int32),
 ['as', 'anarchism', 'originated', 'a', 'term', 'as', 'of', 'a'])

Build the model


In [103]:
batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = rnd.choice(valid_window, valid_size, replace=False)
num_sampled = 64    # Number of negative examples to sample.

learning_rate = 0.01

In [104]:
reset_graph()

# Input data.
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

In [105]:
vocabulary_size = 50000
embedding_size = 150

# Look up embeddings for inputs.
init_embeds = tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)
embeddings = tf.Variable(init_embeds)

In [106]:
train_inputs = tf.placeholder(tf.int32, shape=[None])
embed = tf.nn.embedding_lookup(embeddings, train_inputs)

In [107]:
# Construct the variables for the NCE loss
nce_weights = tf.Variable(
    tf.truncated_normal([vocabulary_size, embedding_size],
                        stddev=1.0 / np.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

# Compute the average NCE loss for the batch.
# tf.nce_loss automatically draws a new sample of the negative labels each
# time we evaluate the loss.
loss = tf.reduce_mean(
    tf.nn.nce_loss(nce_weights, nce_biases, train_labels, embed,
                   num_sampled, vocabulary_size))

# Construct the Adam optimizer
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)

# Compute the cosine similarity between minibatch examples and all embeddings.
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), axis=1, keep_dims=True))
normalized_embeddings = embeddings / norm
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

# Add variable initializer.
init = tf.global_variables_initializer()

Train the model


In [108]:
num_steps = 10001

with tf.Session() as session:
    init.run()

    average_loss = 0
    for step in range(num_steps):
        print("\rIteration: {}".format(step), end="\t")
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_inputs : batch_inputs, train_labels : batch_labels}

        # We perform one update step by evaluating the training op (including it
        # in the list of returned values for session.run()
        _, loss_val = session.run([training_op, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print("Average loss at step ", step, ": ", average_loss)
            average_loss = 0

        # Note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = vocabulary[valid_examples[i]]
                top_k = 8 # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k+1]
                log_str = "Nearest to %s:" % valid_word
                for k in range(top_k):
                    close_word = vocabulary[nearest[k]]
                    log_str = "%s %s," % (log_str, close_word)
                print(log_str)

    final_embeddings = normalized_embeddings.eval()


Iteration: 0	Average loss at step  0 :  285.899108887
Nearest to would: employee, fayed, ladino, sadr, northamptonshire, epa, presidents, stiff,
Nearest to on: vigesimal, dim, mbit, conscientious, musics, molina, tarn, seminar,
Nearest to four: arches, evacuation, laser, alia, galveston, raced, latch, bandar,
Nearest to his: colloquial, mast, themes, someone, noir, streamline, value, merwara,
Nearest to often: milestone, mega, mboxx, antigen, vicki, overriding, adorno, anthony,
Nearest to in: sheltering, virtualization, petersen, appeals, weill, examine, compassion, browser,
Nearest to an: patchwork, orang, bethune, archaeological, sweat, mislead, keystroke, changeover,
Nearest to eight: notch, churchyard, mayfair, brightly, exertion, processing, monuc, reggae,
Nearest to these: aphrodite, malignancies, desired, eocene, bg, grandmother, checkpoint, nakano,
Nearest to nine: imr, blocks, lucy, learners, rett, recognising, aspects, relating,
Nearest to called: electrolyte, thompson, lojban, haken, tapestry, eutyches, mojo, plunge,
Nearest to about: gael, bravo, walsingham, octagonal, authorship, declarations, resettlement, fughetta,
Nearest to up: ifad, drives, nee, holmes, caligula, impulse, safeties, havel,
Nearest to one: indecent, egon, unequivocally, oppenheim, tla, alan, psyche, ellington,
Nearest to and: canal, berbers, secluded, leh, huac, etiquette, tajikistan, honneur,
Nearest to been: subdivided, unamended, vanes, memorandum, justifying, welwyn, linear, automation,
Iteration: 2000	Average loss at step  2000 :  130.957044463
Iteration: 4000	Average loss at step  4000 :  62.5069862733
Iteration: 6000	Average loss at step  6000 :  42.0973700013
Iteration: 8000	Average loss at step  8000 :  31.6292150426
Iteration: 10000	Average loss at step  10000 :  25.6433333195
Nearest to would: to, wrongly, floppy, was, bj, expenditure, mossad, int,
Nearest to on: in, four, seo, odessa, abscess, sqrt, satisfies, defunct,
Nearest to four: nine, zero, five, six, one, two, three, seven,
Nearest to his: the, s, chiefly, gage, botany, somali, arslan, died,
Nearest to often: cards, revolutionaries, bypasses, crm, carved, gide, mistakenly, and,
Nearest to in: of, and, on, two, accredited, the, nine, for,
Nearest to an: altaic, chlorine, achill, expedition, trilobites, columbus, depressed, a,
Nearest to eight: nine, one, seven, six, five, zero, three, four,
Nearest to these: delicate, bambaataa, appropriation, hanson, confirm, mathbb, columbus, contributes,
Nearest to nine: zero, one, six, seven, four, three, eight, five,
Nearest to called: used, insisting, handed, gallon, rematch, respondent, bind, victorious,
Nearest to about: honoria, pa, diet, finds, cosmos, holmes, ataxia, abstraction,
Nearest to up: silurian, condom, the, auld, archie, with, seo, raf,
Nearest to one: nine, three, eight, two, six, seven, five, four,
Nearest to and: astatine, in, the, of, zero, topalov, abdicated, UNK,
Nearest to been: have, archie, by, stg, pedals, was, ambients, it,

Let's save the final embeddings (of course you can use a TensorFlow Saver if you prefer):


In [109]:
np.save("./my_final_embeddings.npy", final_embeddings)

Plot the embeddings


In [110]:
def plot_with_labels(low_dim_embs, labels):
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))  #in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i,:]
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')

In [111]:
from sklearn.manifold import TSNE

tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
plot_only = 500
low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only,:])
labels = [vocabulary[i] for i in range(plot_only)]
plot_with_labels(low_dim_embs, labels)


Machine Translation

The basic_rnn_seq2seq() function creates a simple Encoder/Decoder model: it first runs an RNN to encode encoder_inputs into a state vector, then runs a decoder initialized with the last encoder state on decoder_inputs. Encoder and decoder use the same RNN cell type but they don't share parameters.


In [112]:
import tensorflow as tf
reset_graph()

n_steps = 50
n_neurons = 200
n_layers = 3
num_encoder_symbols = 20000
num_decoder_symbols = 20000
embedding_size = 150
learning_rate = 0.01

X = tf.placeholder(tf.int32, [None, n_steps]) # English sentences
Y = tf.placeholder(tf.int32, [None, n_steps]) # French translations
W = tf.placeholder(tf.float32, [None, n_steps - 1, 1])
Y_input = Y[:, :-1]
Y_target = Y[:, 1:]

encoder_inputs = tf.unstack(tf.transpose(X)) # list of 1D tensors
decoder_inputs = tf.unstack(tf.transpose(Y_input)) # list of 1D tensors

lstm_cells = [tf.contrib.rnn.BasicLSTMCell(num_units=n_neurons)
              for layer in range(n_layers)]
cell = tf.contrib.rnn.MultiRNNCell(lstm_cells)

output_seqs, states = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq(
    encoder_inputs,
    decoder_inputs,
    cell,
    num_encoder_symbols,
    num_decoder_symbols,
    embedding_size)

logits = tf.transpose(tf.unstack(output_seqs), perm=[1, 0, 2])

In [113]:
logits_flat = tf.reshape(logits, [-1, num_decoder_symbols])
Y_target_flat = tf.reshape(Y_target, [-1])
W_flat = tf.reshape(W, [-1])
xentropy = W_flat * tf.nn.sparse_softmax_cross_entropy_with_logits(labels=Y_target_flat, logits=logits_flat)
loss = tf.reduce_mean(xentropy)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()

Exercise solutions

Coming soon