In [ ]:
# Created 2016-04-06
# Tensorflow version: 0.7

# Like sparse_softmax_cross_entropy_with_logits, 
# tf.nn.seq2seq.sequence_loss_by_example calculates the softmax cross entropy.
# (I am not sure why softmax/cross entropy is not mentioned in its name.)
#
# The difference is that the calculation is done on a sequence of logits,
# where the sequence is a Python list. 
#
# In fact, one way to understand it is that, in the simplest form, 
# it just calls sparse_softmax_cross_entropy_with_logits on every 
# element of the sequence and averages the results over time steps.
# 
# It is mainly used for Recurrent Neural Networks.
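
In [ ]:
# A rough NumPy sketch of the idea above (the function names here are made up
# for illustration; batching and weights are ignored).
# For a single example: take the softmax cross entropy at each time step and
# average the per-step losses over the sequence.
import numpy as np

def softmax_cross_entropy(logits_row, label):
    # -log(softmax(logits_row)[label]) for one example at one time step.
    logits_row = np.asarray(logits_row, dtype=np.float64)
    shifted = logits_row - np.max(logits_row)
    log_probs = shifted - np.log(np.sum(np.exp(shifted)))
    return -log_probs[label]

def sequence_loss_sketch(logits_seq, labels_seq):
    # logits_seq: list of 1-D logit vectors, one per time step.
    # labels_seq: list of integer class labels, one per time step.
    per_step = [softmax_cross_entropy(l, y) for l, y in zip(logits_seq, labels_seq)]
    return sum(per_step) / len(per_step)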

In [2]:
import numpy as np
import tensorflow as tf

In [9]:
# Snippet 1

# logits is a list of length 4, where 4 is called the sequence_length.
# Each element in the list is a matrix of shape batch_size * classes.
# In this example batch_size = 2, classes = 3.
# This means there are 3 possible integer labels: 0, 1, and 2.
logits = [np.array([[1.5, 1.5, 1.5], 
                    [0.7, 0.9, 0.2]]), 
          np.array([[0.5, 0.2, 1.7],
                    [0.1, 1.2, 0.5]]),
          np.array([[0.7, 0.3, 1.2],
                    [1.1, 0.2, 0.5]]),
          np.array([[0.2, 0.4, 0.1],
                    [0.8, 0.8, 0.5]])]
          
# labels is a list of length 4 (sequence_length), similar to logits.
# Each element in labels is an integer vector of length batch_size.
# Usually it holds the ground-truth class label for each example in the
# batch at each time step. Integer labels range from 0 to 2.
#
# This is similar to sparse_softmax_cross_entropy_with_logits.
labels = [np.array([0, 1], dtype=np.int32), 
          np.array([2, 0], dtype=np.int32),
          np.array([0, 0], dtype=np.int32),
          np.array([2, 1], dtype=np.int32)]

# weights is a list of length 4 (sequence_length).
# Each element in weights is a 1-D vector of length batch_size.
#
# It is used to weight the cross entropies; with all weights set to 1.0,
# every time step counts equally, i.e. the loss is effectively unweighted.
weights = [np.array([1.0, 1.0]), np.array([1.0, 1.0]), 
           np.array([1.0, 1.0]), np.array([1.0, 1.0])]
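
# For example, zero weights can be used to mask padded time steps; to ignore
# the last time step of the first example in the batch one could instead pass
#
#   weights = [np.array([1.0, 1.0]), np.array([1.0, 1.0]),
#              np.array([1.0, 1.0]), np.array([0.0, 1.0])]
#
# As far as I can tell, each step's cross entropy is multiplied by its weight
# before the averaging over time steps.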

# Finally call sequence_loss_by_example. Note that average_across_timesteps 
# defaults to True (the name is self-explanatory).
loss = tf.nn.seq2seq.sequence_loss_by_example(logits, labels, weights)
                                              
# ---------- Expected Value ----------
#
# Now let's do what sequence_loss_by_example does, step by step.

# First, we calculate the cross entropies for each time step and each example
# in the batch. The result is a list of length 4 (sequence_length), where each
# element is a vector of size 2 (batch_size).
expected_cross_entropies = [tf.nn.sparse_softmax_cross_entropy_with_logits(
    single_logits, single_labels) for single_logits, single_labels in zip(logits, labels)]

# We then sum the cross entropies across time steps and divide by the total
# weight, which here equals 4 (the sequence_length) because all weights are 1.0.
expected_cross_entropies_average_over_time_steps = tf.accumulate_n(expected_cross_entropies) / 4
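
# As a quick sanity check: for the first time step of the first example the
# logits are [1.5, 1.5, 1.5] and the label is 0, so the softmax is uniform and
# the cross entropy should be -log(1/3) = log(3) ~= 1.0986, which matches the
# first value printed below.
manual_first_entry = np.log(3.0)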

with tf.Session() as sess:
    print(sess.run(expected_cross_entropies))
    # We can see that they are the same
    print(sess.run(expected_cross_entropies_average_over_time_steps))
    print(sess.run(loss))


[array([ 1.09861229,  0.8395462 ]), array([ 0.42155128,  1.70401887]), array([ 1.19967598,  0.67058521]), array([ 1.23983106,  1.0082565 ])]
[ 0.98991765  1.05560169]
[ 0.98991765  1.05560169]
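
In [ ]:
# Snippet 2 (a sketch, not executed above): what non-uniform weights do.
# As far as I can tell from the seq2seq source, each step's cross entropy is
# multiplied by its weight, the products are summed over time, and (because
# average_across_timesteps defaults to True) the sum is divided by the total
# weight of each batch element rather than by the raw sequence_length.
#
# Here the last time step of the first example is masked out with weight 0,
# as one would do for a padded step.
masked_weights = [np.array([1.0, 1.0]), np.array([1.0, 1.0]),
                  np.array([1.0, 1.0]), np.array([0.0, 1.0])]

masked_loss = tf.nn.seq2seq.sequence_loss_by_example(logits, labels, masked_weights)

# Manual check, reusing the per-step cross entropies computed in Snippet 1.
weighted = [ce * w for ce, w in zip(expected_cross_entropies, masked_weights)]
total_weight = sum(masked_weights)  # numpy array: [3.0, 4.0]
expected_masked_loss = tf.accumulate_n(weighted) / total_weight

with tf.Session() as sess:
    # The two printed vectors should agree (possibly up to a negligible
    # difference if the library adds a small epsilon to the denominator).
    print(sess.run(masked_loss))
    print(sess.run(expected_masked_loss))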