In [1]:
"""
tf v0.11

Following the toy implementation in seq2seq_number_seq01.ipynb,
to test the translation power of seq2seq, I append a "magic number" to the end of the decoder sequence,
e.g. [1,2,3,4,5] -> [2,3,4,5,6, (1+2+3+4+5) mod 7]

It turns out seq2seq is able to predict the magic number at very small computational cost.
It is interesting to test how far seq2seq can go in guessing discrete-valued sequences.
"""

import numpy as np

# number of sentences fed to the model in each training step
batch_size = 64

# number of words in one sentence
seq_length_input = 5
seq_length_output = 6

# number of possible words
vocab_size = 7

# embedding dimension
embedding_dim = 50

# number of hidden units in the RNN cell
memory_dim = 100

# generate batch_size translation pairs; see the next cell for an example
def get_train_batch(batch_size):
    # each input: seq_length_input distinct symbols drawn from [0, vocab_size)
    X = np.array([np.random.choice(vocab_size, size=(seq_length_input,), replace=False)
                  for _ in range(batch_size)])
    # target: every input symbol shifted up by one (mod vocab_size) ...
    Y = np.mod(X + 1, vocab_size)
    # ... with the magic number (sum of the input, mod vocab_size) appended at the end
    Y = np.c_[Y, np.mod(np.sum(X, axis=1), vocab_size).reshape(batch_size, -1)]

    # Dimshuffle to seq_len x batch_size
    X = X.T
    Y = Y.T

    return X, Y

Example of a training batch


In [2]:
X, Y = get_train_batch(2)
print("Two data points:")
print(X)
print()
print("Two labels:")
print(Y)


Two data points:
[[0 1]
 [5 2]
 [4 4]
 [3 0]
 [6 3]]

Two labels:
[[1 2]
 [6 3]
 [5 5]
 [4 1]
 [0 4]
 [4 3]]
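
As a quick sanity check (a small numpy sketch using the definitions above, not part of the original run): the last row of Y should be the column-wise sum of X mod vocab_size, and the remaining rows should be X shifted up by one.


In [ ]:
X, Y = get_train_batch(4)
assert np.array_equal(Y[:-1], (X + 1) % vocab_size)       # shifted copy of the input
assert np.array_equal(Y[-1], X.sum(axis=0) % vocab_size)  # the appended magic number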

Define neural net and cost


In [3]:
import tensorflow as tf

# dim: seq_length_input x batch_size
encode_inputs = [tf.placeholder(tf.int32, shape=(None,),
                          name="inp%i" % t)
           for t in range(seq_length_input)]

# dim: seq_length_output x batch_size
labels = [tf.placeholder(tf.int32, shape=(None,),
                        name="labels%i" % t)
          for t in range(seq_length_output)]

# dim: seq_length_output x batch_size
weights = [tf.ones_like(labels_t, dtype=tf.float32)
           for labels_t in labels]

# decoder inputs: all zeros (the previous target symbol is not fed back during training)
decode_inputs = [tf.zeros_like(tensor) for tensor in labels]

# gru cell
cell = tf.nn.rnn_cell.GRUCell(memory_dim)

# decode_outputs: list of seq_length_output logit tensors, each of shape batch_size x vocab_size
decode_outputs, dec_memory = tf.nn.seq2seq.embedding_rnn_seq2seq(
    encode_inputs, decode_inputs, cell, vocab_size, vocab_size, embedding_dim)

# argmax over the vocab dimension gives the predicted output symbols (seq_length_output x batch_size)
prediction = tf.argmax(decode_outputs, 2)

loss = tf.nn.seq2seq.sequence_loss(decode_outputs, labels, weights, vocab_size)

# norm of (part of) the final decoder state; computed for inspection but not used below
magnitude = tf.sqrt(tf.reduce_sum(tf.square(dec_memory[1])))

learning_rate = 0.05
momentum = 0.9
train_op = tf.train.MomentumOptimizer(learning_rate, momentum).minimize(loss)
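
During training the decoder receives all-zero inputs and still has to emit the whole target sequence. For actual inference one would normally let the decoder feed its own previous prediction back in; below is a rough sketch of that, assuming TF 0.11's feed_previous flag of embedding_rnn_seq2seq and its variable-reuse pattern (not used in the runs that follow).


In [ ]:
# sketch: a second, weight-sharing copy of the model whose decoder feeds its
# previous prediction back in (greedy decoding) instead of the zero inputs
with tf.variable_scope(tf.get_variable_scope(), reuse=True):
    decode_outputs_infer, _ = tf.nn.seq2seq.embedding_rnn_seq2seq(
        encode_inputs, decode_inputs, cell, vocab_size, vocab_size, embedding_dim,
        feed_previous=True)
prediction_infer = tf.argmax(decode_outputs_infer, 2)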

Training time


In [7]:
sess = tf.Session()
sess.run(tf.initialize_all_variables())

for t in range(1001):
    
    X, Y = get_train_batch(batch_size)
    
    # feed one time step per placeholder (encoder inputs and decoder targets)
    feed_dict = {encode_inputs[i]: X[i] for i in range(seq_length_input)}
    feed_dict.update({labels[i]: Y[i] for i in range(seq_length_output)})

    _, predict_t, loss_t = sess.run([train_op, prediction, loss], feed_dict)

    if t % 100 == 0:
        print('------ iteration', t, '-------')
        print('data', X[:,0])
        print('target', Y[:,0])
        print('predict', predict_t[:,0])
        print('loss', loss_t)
        print()


------ iteration 0 -------
data [0 6 4 1 2]
target [1 0 5 2 3 6]
predict [5 5 5 5 5 5]
loss 1.97378

------ iteration 100 -------
data [0 5 4 2 6]
target [1 6 5 3 0 3]
predict [1 3 3 3 3 0]
loss 1.7558

------ iteration 200 -------
data [4 3 5 0 6]
target [5 4 6 1 0 4]
predict [6 6 6 1 1 0]
loss 1.50417

------ iteration 300 -------
data [1 0 6 3 2]
target [2 1 0 4 3 5]
predict [2 2 4 0 3 3]
loss 1.23574

------ iteration 400 -------
data [1 0 2 6 4]
target [2 1 3 0 5 6]
predict [2 1 3 5 5 6]
loss 0.683473

------ iteration 500 -------
data [0 1 5 6 4]
target [1 2 6 0 5 2]
predict [1 2 6 0 5 2]
loss 0.325934

------ iteration 600 -------
data [1 6 0 2 4]
target [2 0 1 3 5 6]
predict [2 0 1 3 5 6]
loss 0.138587

------ iteration 700 -------
data [0 1 4 5 2]
target [1 2 5 6 3 5]
predict [1 2 5 6 3 5]
loss 0.0515718

------ iteration 800 -------
data [3 2 4 0 5]
target [4 3 5 1 6 0]
predict [4 3 5 1 6 0]
loss 0.0327763

------ iteration 900 -------
data [0 2 6 1 5]
target [1 3 0 2 6 0]
predict [1 3 0 2 6 0]
loss 0.0167957

------ iteration 1000 -------
data [2 6 0 4 1]
target [3 0 1 5 2 6]
predict [3 0 1 5 2 6]
loss 0.0105417
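
To put a number on the claim that the net learns the magic number, here is a small evaluation sketch (hypothetical, not part of the original run): it reuses the prediction op and feed pattern from the training loop and measures how often the last output position matches the target on freshly generated batches.


In [ ]:
# hypothetical evaluation sketch: accuracy of the appended magic number
# (the last position of the output sequence) on fresh batches
n_correct, n_total = 0, 0
for _ in range(20):
    X, Y = get_train_batch(batch_size)
    feed_dict = {encode_inputs[i]: X[i] for i in range(seq_length_input)}
    feed_dict.update({labels[i]: Y[i] for i in range(seq_length_output)})
    predict_t = sess.run(prediction, feed_dict)
    n_correct += np.sum(predict_t[-1] == Y[-1])
    n_total += batch_size
print('magic-number accuracy: %.3f' % (n_correct / float(n_total)))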


In [ ]: