TensorBoard example


In [ ]:
import time
from collections import namedtuple
import numpy as np
import tensorflow as tf

In [ ]:
with open('anna.txt', 'r') as f:
    text=f.read()
vocab = set(text)
vocab_to_int = {c: i for i, c in enumerate(vocab)}
int_to_vocab = dict(enumerate(vocab))
encoded = np.array([vocab_to_int[c] for c in text], dtype=np.int32)

In [ ]:
text[:100]

In [ ]:
encoded[:100]

Since the network works with individual characters, this is essentially a classification problem: we're trying to predict the next character from the previous text. Here's how many 'classes' our network has to pick from.


In [ ]:
len(vocab)
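
As a quick sanity check (a small addition, using only the mappings defined above), the encoding is reversible: decoding the first 100 integers should reproduce text[:100].


In [ ]:
''.join(int_to_vocab[i] for i in encoded[:100])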

In [ ]:
def get_batches(arr, n_seqs, n_steps_per_seq):
    '''Create a generator that returns batches of size
       n_seqs x n_steps_per_seq from arr.
       
       Arguments
       ---------
       arr: Array you want to make batches from
       n_seqs: Batch size, the number of sequences per batch
       n_steps_per_seq: Number of sequence steps per batch
    '''
    # Number of characters per batch
    # e.g. n_seqs = 10, n_steps_per_seq = 2 gives batch_size = 20
    batch_size = n_seqs * n_steps_per_seq
    
    # Number of full batches we can make from arr
    # e.g. len(arr) = 40 with batch_size = 20 gives 2 batches
    n_batches = len(arr) // batch_size
    
    # Keep only enough characters to make full batches; any leftover
    # characters that don't fill a complete batch are dropped
    arr = arr[ : n_batches * batch_size]
    
    # Reshape into n_seqs rows
    arr = arr.reshape((n_seqs, -1))
    
    for n in range(0, arr.shape[1], n_steps_per_seq):
        # The features
        x = arr[ :, n: n + n_steps_per_seq]
        # The targets, shifted by one
        y = np.zeros_like(x)
        y[ :, : -1], y[ : , -1] = x[ :, 1: ], x[ :, 0]
        yield x, y

In [ ]:
batches = get_batches(encoded, 10, 50)
x, y = next(batches)
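
As a quick sanity check (not in the original notebook), the targets should be the inputs shifted one step to the left, with the first column of each window wrapped around to the last position.


In [ ]:
print(np.array_equal(y[:, :-1], x[:, 1:]))   # True
print(np.array_equal(y[:, -1], x[:, 0]))     # True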

In [ ]:
def build_inputs(batch_size, num_steps):
    ''' Define placeholders for inputs, targets, and dropout 
    
        Arguments
        ---------
        batch_size: Batch size, number of sequences per batch
        num_steps: Number of sequence steps in a batch
        
    '''
    with tf.name_scope('inputs'):
        # Declare placeholders we'll feed into the graph
        inputs = tf.placeholder(tf.int32, (batch_size, num_steps), name="inputs")
        targets = tf.placeholder(tf.int32, (batch_size, num_steps), name="targets")

        # Keep probability placeholder for drop out layers
        keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    
    return inputs, targets, keep_prob
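
A quick scratch-graph check (hypothetical sizes, not part of the original notebook) confirms the placeholder shapes match the batch layout produced by get_batches.


In [ ]:
tf.reset_default_graph()
demo_inputs, demo_targets, demo_keep_prob = build_inputs(batch_size=32, num_steps=100)
print(demo_inputs.shape, demo_targets.shape)   # (32, 100) (32, 100)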

In [ ]:
def single_lstm_cell(lstm_size, keep_prob):
    ''' Build a single recurrent cell (here a NASCell) wrapped with dropout. '''
    
    with tf.name_scope("RNN_layers"):
        lstm = tf.contrib.rnn.NASCell(lstm_size, reuse=tf.get_variable_scope().reuse)
    
        # Add dropout to the cell outputs
        drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
    
    return drop

In [11]:
def build_lstm(lstm_size, num_layers, batch_size, keep_prob):
    ''' Build LSTM cell.
    
        Arguments
        ---------
        keep_prob: Scalar tensor (tf.placeholder) for the dropout keep probability
        lstm_size: Size of the hidden layers in the LSTM cells
        num_layers: Number of LSTM layers
        batch_size: Batch size

    '''
    ### Build the LSTM Cell
    
    # Stack up multiple LSTM layers, for deep learning
    
    with tf.name_scope("RNN_layers"):
        rnn_cells = tf.contrib.rnn.MultiRNNCell([single_lstm_cell(lstm_size, keep_prob) for _ in range(num_layers)], 
                                               state_is_tuple = True)
        
    with tf.name_scope("RNN_init_state"):
        initial_state = rnn_cells.zero_state(batch_size, tf.float32)
    
    return rnn_cells, initial_state
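
A similar scratch-graph check (again with hypothetical sizes) shows that the stacked cell exposes one zero state per LSTM layer.


In [ ]:
tf.reset_default_graph()
demo_keep_prob = tf.placeholder(tf.float32)
demo_cell, demo_state = build_lstm(lstm_size=128, num_layers=2, batch_size=32, keep_prob=demo_keep_prob)
print(len(demo_state))   # 2, one state tuple per layer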

In [12]:
def build_output(lstm_output, in_size, out_size):
    ''' Build a softmax layer, return the softmax output and logits.
    
        Arguments
        ---------
        lstm_output: List of output tensors from the LSTM layer
        in_size: Size of the input tensor, for example, size of the LSTM cells
        out_size: Size of this softmax layer
    
    '''

    # Reshape output so it's a bunch of rows, one row for each step for each sequence.
    
    # Concatenate lstm_output over axis 1 (the columns)
    # e.g. t1 = [[1, 2, 3], [4, 5, 6]]
    #      t2 = [[7, 8, 9], [10, 11, 12]]
    #      tf.concat([t1, t2], 1) ==> [[1, 2, 3, 7, 8, 9], [4, 5, 6, 10, 11, 12]]
    seq_output = tf.concat(lstm_output, axis=1)
    
    # Reshape seq_output to a 2D tensor with lstm_size (in_size) columns
    x = tf.reshape(seq_output, [-1, in_size])
    
    # Connect the RNN outputs to a softmax layer
    with tf.variable_scope('softmax'):
        # Create the weight and bias variables here
        softmax_w = tf.Variable(tf.truncated_normal( (in_size, out_size), stddev=0.1))
        softmax_b = tf.Variable(tf.zeros( out_size ))
        
        # TensorBoard: log the softmax weights as a histogram
        tf.summary.histogram("softmax_w", softmax_w)
    
    # Since output is a bunch of rows of RNN cell outputs, logits will be a bunch
    # of rows of logit outputs, one for each step and sequence
    logits = tf.matmul(x,  softmax_w) + softmax_b
    
    # Use softmax to get the probabilities for predicted characters
    out = tf.nn.softmax(logits, name="predictions")
    tf.summary.histogram("predictions", out)
    
    return out, logits
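
To make the reshape concrete, here's a small NumPy illustration with hypothetical sizes: tf.nn.dynamic_rnn returns outputs of shape (batch, steps, lstm_size), and flattening to (-1, lstm_size) gives one row per step per sequence.


In [ ]:
demo = np.arange(24).reshape(2, 3, 4)    # (batch=2, steps=3, lstm_size=4)
print(demo.reshape(-1, 4).shape)         # (6, 4): 2 sequences * 3 steps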

In [13]:
def build_loss(logits, targets, lstm_size, num_classes):
    ''' Calculate the loss from the logits and the targets.
    
        Arguments
        ---------
        logits: Logits from final fully connected layer
        targets: Targets for supervised learning
        lstm_size: Number of LSTM hidden units
        num_classes: Number of classes in targets
        
    '''
    
    # One-hot encode targets and reshape to match logits, one row per sequence per step
    y_one_hot = tf.one_hot(targets, num_classes)
    y_reshaped =  tf.reshape( y_one_hot, logits.get_shape() )
    
    # Softmax cross entropy loss
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_reshaped)
    loss = tf.reduce_mean(loss)
    
    # TensorBoard: log the loss as a scalar summary
    tf.summary.scalar('loss', loss)
    
    return loss
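
For reference, a scratch example (illustrative values) of what tf.one_hot does to the integer targets: a (batch, steps) tensor of class indices becomes (batch, steps, num_classes), which is then reshaped to line up with the logits.


In [ ]:
demo_targets = tf.constant([[0, 2], [1, 0]])
demo_one_hot = tf.one_hot(demo_targets, depth=3)
with tf.Session() as demo_sess:
    print(demo_sess.run(demo_one_hot).shape)   # (2, 2, 3)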

In [14]:
def build_optimizer(loss, learning_rate, grad_clip):
    ''' Build optimizer for training, using gradient clipping.
    
        Arguments:
        loss: Network loss
        learning_rate: Learning rate for optimizer
        grad_clip: Threshold for clipping the global gradient norm
    
    '''
    
    # Optimizer for training, using gradient clipping to control exploding gradients
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), grad_clip)
    train_op = tf.train.AdamOptimizer(learning_rate)
    optimizer = train_op.apply_gradients(zip(grads, tvars))
    
    return optimizer
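
To see how the global-norm clipping behaves, here's a small scratch example (values chosen for illustration): if the combined norm of all gradients exceeds grad_clip, every gradient is rescaled by the same factor grad_clip / global_norm.


In [ ]:
demo_grad = tf.constant([3.0, 4.0])   # global norm = 5
demo_clipped, demo_norm = tf.clip_by_global_norm([demo_grad], clip_norm=2.5)
with tf.Session() as demo_sess:
    print(demo_sess.run(demo_clipped))   # [1.5, 2.0] -- scaled by 2.5 / 5
    print(demo_sess.run(demo_norm))      # 5.0 (the unclipped global norm)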

In [15]:
class CharRNN:
    
    def __init__(self, num_classes, batch_size=64, num_steps=50, 
                       lstm_size=128, num_layers=2, learning_rate=0.001, 
                       grad_clip=5, sampling=False):
    
        # When we're using this network for sampling later, we'll be passing in
        # one character at a time, so provide an option for that
        if sampling:
            batch_size, num_steps = 1, 1

        tf.reset_default_graph()
        
        # Build the input placeholder tensors
        self.inputs, self.targets, self.keep_prob = build_inputs(batch_size, num_steps)
        x_one_hot = tf.one_hot(self.inputs, num_classes, name="x_one_hot")

        with tf.name_scope("RNN_layers"):
            # Build the LSTM cell
            cells, self.initial_state = build_lstm(lstm_size, num_layers, batch_size, self.keep_prob)
    

        ### Run the data through the RNN layers
        with tf.name_scope("RNN_forward"):
        # Run each sequence step through the RNN with tf.nn.dynamic_rnn 
            outputs, state = tf.nn.dynamic_rnn(cells, x_one_hot, initial_state=self.initial_state)
        
        
        self.final_state = state
        
        # Get softmax predictions and logits
        self.prediction, self.logits = build_output(outputs, lstm_size, num_classes)
        
        # Loss and optimizer (with gradient clipping)
        self.loss =  build_loss(self.logits, self.targets, lstm_size, num_classes)
        self.optimizer = build_optimizer(self.loss, learning_rate, grad_clip)

In [16]:
batch_size = 64         # Sequences per batch
num_steps = 128          # Number of sequence steps per batch
lstm_size = 512         # Size of hidden layers in LSTMs
num_layers = 2          # Number of LSTM layers
learning_rate = 0.001    # Learning rate
keep_prob = 0.5         # Dropout keep probability

In [17]:
model = CharRNN(len(vocab), batch_size=batch_size, num_steps=num_steps,
                lstm_size=lstm_size, num_layers=num_layers, 
                learning_rate=learning_rate)

In [19]:
epochs = 3
# Save every N iterations
save_every_n = 200

saver = tf.train.Saver(max_to_keep=100)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    # TensorBoard: merge all summaries once (outside the training loop)
    # and create writers for the training and test runs
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter('./logs/1/train', sess.graph)
    test_writer = tf.summary.FileWriter('./logs/1/test')
    
    # Use the line below to load a checkpoint and resume training
    #saver.restore(sess, 'checkpoints/______.ckpt')
    counter = 0
    for e in range(epochs):
        # Train network
        new_state = sess.run(model.initial_state)
        loss = 0
        for x, y in get_batches(encoded, batch_size, num_steps):
            counter += 1
            start = time.time()
            feed = {model.inputs: x,
                    model.targets: y,
                    model.keep_prob: keep_prob,
                    model.initial_state: new_state}
            
            summary, batch_loss, new_state, _ = sess.run([merged, model.loss, 
                                                          model.final_state, 
                                                          model.optimizer], 
                                                         feed_dict=feed)
            
            train_writer.add_summary(summary, counter)
            end = time.time()
            print('Epoch: {}/{}... '.format(e+1, epochs),
                  'Training Step: {}... '.format(counter),
                  'Training loss: {:.4f}... '.format(batch_loss),
                  '{:.4f} sec/batch'.format((end-start)))
        
            if (counter % save_every_n == 0):
                saver.save(sess, "checkpoints/i{}_l{}.ckpt".format(counter, lstm_size))
    
    saver.save(sess, "checkpoints/i{}_l{}.ckpt".format(counter, lstm_size))


Epoch: 1/3...  Training Step: 1...  Training loss: 4.4191...  0.7205 sec/batch
Epoch: 1/3...  Training Step: 2...  Training loss: 4.4164...  0.5005 sec/batch
Epoch: 1/3...  Training Step: 3...  Training loss: 4.4136...  0.5001 sec/batch
Epoch: 1/3...  Training Step: 4...  Training loss: 4.4104...  0.4955 sec/batch
Epoch: 1/3...  Training Step: 5...  Training loss: 4.4066...  0.4832 sec/batch
Epoch: 1/3...  Training Step: 6...  Training loss: 4.4017...  0.5049 sec/batch
Epoch: 1/3...  Training Step: 7...  Training loss: 4.3954...  0.4806 sec/batch
Epoch: 1/3...  Training Step: 8...  Training loss: 4.3867...  0.5163 sec/batch
Epoch: 1/3...  Training Step: 9...  Training loss: 4.3744...  0.5002 sec/batch
Epoch: 1/3...  Training Step: 10...  Training loss: 4.3562...  0.5011 sec/batch
Epoch: 1/3...  Training Step: 11...  Training loss: 4.3280...  0.4996 sec/batch
Epoch: 1/3...  Training Step: 12...  Training loss: 4.2707...  0.4847 sec/batch
Epoch: 1/3...  Training Step: 13...  Training loss: 4.1100...  0.5005 sec/batch
Epoch: 1/3...  Training Step: 14...  Training loss: 3.8801...  0.5065 sec/batch
Epoch: 1/3...  Training Step: 15...  Training loss: 3.7083...  0.5000 sec/batch
Epoch: 1/3...  Training Step: 16...  Training loss: 3.4019...  0.5166 sec/batch
Epoch: 1/3...  Training Step: 17...  Training loss: 3.3170...  0.5203 sec/batch
Epoch: 1/3...  Training Step: 18...  Training loss: 3.3606...  0.5186 sec/batch
Epoch: 1/3...  Training Step: 19...  Training loss: 3.3030...  0.5239 sec/batch
Epoch: 1/3...  Training Step: 20...  Training loss: 3.2861...  0.5237 sec/batch
Epoch: 1/3...  Training Step: 21...  Training loss: 3.3011...  0.5279 sec/batch
Epoch: 1/3...  Training Step: 22...  Training loss: 3.3019...  0.5222 sec/batch
Epoch: 1/3...  Training Step: 23...  Training loss: 3.3059...  0.5459 sec/batch
Epoch: 1/3...  Training Step: 24...  Training loss: 3.2711...  0.5194 sec/batch
Epoch: 1/3...  Training Step: 25...  Training loss: 3.2848...  0.4962 sec/batch
Epoch: 1/3...  Training Step: 26...  Training loss: 3.2100...  0.5005 sec/batch
Epoch: 1/3...  Training Step: 27...  Training loss: 3.2075...  0.5184 sec/batch
Epoch: 1/3...  Training Step: 28...  Training loss: 3.2106...  0.5127 sec/batch
Epoch: 1/3...  Training Step: 29...  Training loss: 3.1898...  0.5171 sec/batch
Epoch: 1/3...  Training Step: 30...  Training loss: 3.1892...  0.5084 sec/batch
Epoch: 1/3...  Training Step: 31...  Training loss: 3.2124...  0.5892 sec/batch
Epoch: 1/3...  Training Step: 32...  Training loss: 3.1842...  0.5501 sec/batch
Epoch: 1/3...  Training Step: 33...  Training loss: 3.2244...  0.5677 sec/batch
Epoch: 1/3...  Training Step: 34...  Training loss: 3.2147...  0.5401 sec/batch
Epoch: 1/3...  Training Step: 35...  Training loss: 3.1993...  0.5451 sec/batch
Epoch: 1/3...  Training Step: 36...  Training loss: 3.1708...  0.5590 sec/batch
Epoch: 1/3...  Training Step: 37...  Training loss: 3.1877...  0.5186 sec/batch
Epoch: 1/3...  Training Step: 38...  Training loss: 3.1853...  0.5243 sec/batch
Epoch: 1/3...  Training Step: 39...  Training loss: 3.2013...  0.5236 sec/batch
Epoch: 1/3...  Training Step: 40...  Training loss: 3.1533...  0.5445 sec/batch
Epoch: 1/3...  Training Step: 41...  Training loss: 3.1635...  0.5267 sec/batch
Epoch: 1/3...  Training Step: 42...  Training loss: 3.1570...  0.5181 sec/batch
Epoch: 1/3...  Training Step: 43...  Training loss: 3.1591...  0.5213 sec/batch
Epoch: 1/3...  Training Step: 44...  Training loss: 3.1567...  0.4904 sec/batch
Epoch: 1/3...  Training Step: 45...  Training loss: 3.1777...  0.5006 sec/batch
Epoch: 1/3...  Training Step: 46...  Training loss: 3.1566...  0.5003 sec/batch
Epoch: 1/3...  Training Step: 47...  Training loss: 3.1885...  0.5371 sec/batch
Epoch: 1/3...  Training Step: 48...  Training loss: 3.1526...  0.4933 sec/batch
Epoch: 1/3...  Training Step: 49...  Training loss: 3.1552...  0.4889 sec/batch
Epoch: 1/3...  Training Step: 50...  Training loss: 3.1474...  0.5120 sec/batch
Epoch: 1/3...  Training Step: 51...  Training loss: 3.1654...  0.5110 sec/batch
Epoch: 1/3...  Training Step: 52...  Training loss: 3.1557...  0.6216 sec/batch
Epoch: 1/3...  Training Step: 53...  Training loss: 3.1379...  0.5661 sec/batch
Epoch: 1/3...  Training Step: 54...  Training loss: 3.1443...  0.5576 sec/batch
Epoch: 1/3...  Training Step: 55...  Training loss: 3.1554...  0.5450 sec/batch
Epoch: 1/3...  Training Step: 56...  Training loss: 3.1562...  0.5678 sec/batch
Epoch: 1/3...  Training Step: 57...  Training loss: 3.1650...  0.5110 sec/batch
Epoch: 1/3...  Training Step: 58...  Training loss: 3.1539...  0.5172 sec/batch
Epoch: 1/3...  Training Step: 59...  Training loss: 3.1464...  0.5337 sec/batch
Epoch: 1/3...  Training Step: 60...  Training loss: 3.1779...  0.4916 sec/batch
Epoch: 1/3...  Training Step: 61...  Training loss: 3.1566...  0.5383 sec/batch
Epoch: 1/3...  Training Step: 62...  Training loss: 3.1512...  0.5486 sec/batch
Epoch: 1/3...  Training Step: 63...  Training loss: 3.1556...  0.5007 sec/batch
Epoch: 1/3...  Training Step: 64...  Training loss: 3.1490...  0.4947 sec/batch
Epoch: 1/3...  Training Step: 65...  Training loss: 3.1620...  0.5029 sec/batch
Epoch: 1/3...  Training Step: 66...  Training loss: 3.1578...  0.5005 sec/batch
Epoch: 1/3...  Training Step: 67...  Training loss: 3.1370...  0.5006 sec/batch
Epoch: 1/3...  Training Step: 68...  Training loss: 3.1490...  0.5024 sec/batch
Epoch: 1/3...  Training Step: 69...  Training loss: 3.1683...  0.5135 sec/batch
Epoch: 1/3...  Training Step: 70...  Training loss: 3.1493...  0.5114 sec/batch
Epoch: 1/3...  Training Step: 71...  Training loss: 3.1756...  0.4883 sec/batch
Epoch: 1/3...  Training Step: 72...  Training loss: 3.1229...  0.5147 sec/batch
Epoch: 1/3...  Training Step: 73...  Training loss: 3.1519...  0.4983 sec/batch
Epoch: 1/3...  Training Step: 74...  Training loss: 3.1618...  0.4996 sec/batch
Epoch: 1/3...  Training Step: 75...  Training loss: 3.1456...  0.5302 sec/batch
Epoch: 1/3...  Training Step: 76...  Training loss: 3.1426...  0.5340 sec/batch
Epoch: 1/3...  Training Step: 77...  Training loss: 3.1276...  0.5409 sec/batch
Epoch: 1/3...  Training Step: 78...  Training loss: 3.1403...  0.4985 sec/batch
Epoch: 1/3...  Training Step: 79...  Training loss: 3.1484...  0.5438 sec/batch
Epoch: 1/3...  Training Step: 80...  Training loss: 3.1362...  0.5561 sec/batch
Epoch: 1/3...  Training Step: 81...  Training loss: 3.1681...  0.5364 sec/batch
Epoch: 1/3...  Training Step: 82...  Training loss: 3.1552...  0.5297 sec/batch
Epoch: 1/3...  Training Step: 83...  Training loss: 3.1361...  0.5191 sec/batch
Epoch: 1/3...  Training Step: 84...  Training loss: 3.1269...  0.5168 sec/batch
Epoch: 1/3...  Training Step: 85...  Training loss: 3.1488...  0.5412 sec/batch
Epoch: 1/3...  Training Step: 86...  Training loss: 3.1427...  0.5024 sec/batch
Epoch: 1/3...  Training Step: 87...  Training loss: 3.1502...  0.5116 sec/batch
Epoch: 1/3...  Training Step: 88...  Training loss: 3.1386...  0.5426 sec/batch
Epoch: 1/3...  Training Step: 89...  Training loss: 3.1538...  0.6118 sec/batch
Epoch: 1/3...  Training Step: 90...  Training loss: 3.1288...  0.5395 sec/batch
Epoch: 1/3...  Training Step: 91...  Training loss: 3.1141...  0.4964 sec/batch
Epoch: 1/3...  Training Step: 92...  Training loss: 3.1599...  0.5533 sec/batch
Epoch: 1/3...  Training Step: 93...  Training loss: 3.1249...  0.5003 sec/batch
Epoch: 1/3...  Training Step: 94...  Training loss: 3.1232...  0.5325 sec/batch
Epoch: 1/3...  Training Step: 95...  Training loss: 3.1634...  0.5278 sec/batch
Epoch: 1/3...  Training Step: 96...  Training loss: 3.1254...  0.4982 sec/batch
Epoch: 1/3...  Training Step: 97...  Training loss: 3.1521...  0.5255 sec/batch
Epoch: 1/3...  Training Step: 98...  Training loss: 3.1198...  0.5004 sec/batch
Epoch: 1/3...  Training Step: 99...  Training loss: 3.1246...  0.5244 sec/batch
Epoch: 1/3...  Training Step: 100...  Training loss: 3.1324...  0.5448 sec/batch
Epoch: 1/3...  Training Step: 101...  Training loss: 3.1177...  0.5271 sec/batch
Epoch: 1/3...  Training Step: 102...  Training loss: 3.1181...  0.4925 sec/batch
Epoch: 1/3...  Training Step: 103...  Training loss: 3.1410...  0.4915 sec/batch
Epoch: 1/3...  Training Step: 104...  Training loss: 3.1219...  0.5105 sec/batch
Epoch: 1/3...  Training Step: 105...  Training loss: 3.1081...  0.5373 sec/batch
Epoch: 1/3...  Training Step: 106...  Training loss: 3.1387...  0.4995 sec/batch
Epoch: 1/3...  Training Step: 107...  Training loss: 3.1457...  0.5194 sec/batch
Epoch: 1/3...  Training Step: 108...  Training loss: 3.1274...  0.5374 sec/batch
Epoch: 1/3...  Training Step: 109...  Training loss: 3.1236...  0.5314 sec/batch
Epoch: 1/3...  Training Step: 110...  Training loss: 3.1242...  0.5183 sec/batch
Epoch: 1/3...  Training Step: 111...  Training loss: 3.1244...  0.4932 sec/batch
Epoch: 1/3...  Training Step: 112...  Training loss: 3.1236...  0.4828 sec/batch
Epoch: 1/3...  Training Step: 113...  Training loss: 3.1184...  0.5034 sec/batch
Epoch: 1/3...  Training Step: 114...  Training loss: 3.1179...  0.5300 sec/batch
Epoch: 1/3...  Training Step: 115...  Training loss: 3.1441...  0.5045 sec/batch
Epoch: 1/3...  Training Step: 116...  Training loss: 3.1340...  0.5000 sec/batch
Epoch: 1/3...  Training Step: 117...  Training loss: 3.1128...  0.5441 sec/batch
Epoch: 1/3...  Training Step: 118...  Training loss: 3.1239...  0.5536 sec/batch
Epoch: 1/3...  Training Step: 119...  Training loss: 3.1191...  0.5856 sec/batch
Epoch: 1/3...  Training Step: 120...  Training loss: 3.1280...  0.5631 sec/batch
Epoch: 1/3...  Training Step: 121...  Training loss: 3.0985...  0.5489 sec/batch
Epoch: 1/3...  Training Step: 122...  Training loss: 3.1120...  0.5451 sec/batch
Epoch: 1/3...  Training Step: 123...  Training loss: 3.1195...  0.5294 sec/batch
Epoch: 1/3...  Training Step: 124...  Training loss: 3.1286...  0.5332 sec/batch
Epoch: 1/3...  Training Step: 125...  Training loss: 3.1505...  0.5069 sec/batch
Epoch: 1/3...  Training Step: 126...  Training loss: 3.1414...  0.5822 sec/batch
Epoch: 1/3...  Training Step: 127...  Training loss: 3.1051...  0.5342 sec/batch
Epoch: 1/3...  Training Step: 128...  Training loss: 3.1259...  0.5142 sec/batch
Epoch: 1/3...  Training Step: 129...  Training loss: 3.1153...  0.5221 sec/batch
Epoch: 1/3...  Training Step: 130...  Training loss: 3.1082...  0.4885 sec/batch
Epoch: 1/3...  Training Step: 131...  Training loss: 3.1041...  0.5342 sec/batch
Epoch: 1/3...  Training Step: 132...  Training loss: 3.1056...  0.5181 sec/batch
Epoch: 1/3...  Training Step: 133...  Training loss: 3.1070...  0.5157 sec/batch
Epoch: 1/3...  Training Step: 134...  Training loss: 3.0994...  0.5300 sec/batch
Epoch: 1/3...  Training Step: 135...  Training loss: 3.1358...  0.5793 sec/batch
Epoch: 1/3...  Training Step: 136...  Training loss: 3.1339...  0.5597 sec/batch
Epoch: 1/3...  Training Step: 137...  Training loss: 3.1131...  0.5309 sec/batch
Epoch: 1/3...  Training Step: 138...  Training loss: 3.0987...  0.5214 sec/batch
Epoch: 1/3...  Training Step: 139...  Training loss: 3.1037...  0.5131 sec/batch
Epoch: 1/3...  Training Step: 140...  Training loss: 3.1314...  0.5213 sec/batch
Epoch: 1/3...  Training Step: 141...  Training loss: 3.0994...  0.5107 sec/batch
Epoch: 1/3...  Training Step: 142...  Training loss: 3.0943...  0.5004 sec/batch
Epoch: 1/3...  Training Step: 143...  Training loss: 3.1105...  0.5494 sec/batch
Epoch: 1/3...  Training Step: 144...  Training loss: 3.0922...  0.5079 sec/batch
Epoch: 1/3...  Training Step: 145...  Training loss: 3.1159...  0.5191 sec/batch
Epoch: 1/3...  Training Step: 146...  Training loss: 3.1074...  0.5063 sec/batch
Epoch: 1/3...  Training Step: 147...  Training loss: 3.1010...  0.5248 sec/batch
Epoch: 1/3...  Training Step: 148...  Training loss: 3.1088...  0.5309 sec/batch
Epoch: 1/3...  Training Step: 149...  Training loss: 3.1266...  0.5173 sec/batch
Epoch: 1/3...  Training Step: 150...  Training loss: 3.1193...  0.5284 sec/batch
Epoch: 1/3...  Training Step: 151...  Training loss: 3.1020...  0.5272 sec/batch
Epoch: 1/3...  Training Step: 152...  Training loss: 3.1155...  0.5051 sec/batch
Epoch: 1/3...  Training Step: 153...  Training loss: 3.1172...  0.5138 sec/batch
Epoch: 1/3...  Training Step: 154...  Training loss: 3.1014...  0.5028 sec/batch
Epoch: 1/3...  Training Step: 155...  Training loss: 3.0989...  0.5169 sec/batch
Epoch: 1/3...  Training Step: 156...  Training loss: 3.0677...  0.5012 sec/batch
Epoch: 1/3...  Training Step: 157...  Training loss: 3.0917...  0.5158 sec/batch
Epoch: 1/3...  Training Step: 158...  Training loss: 3.0753...  0.5143 sec/batch
Epoch: 1/3...  Training Step: 159...  Training loss: 3.0880...  0.5069 sec/batch
Epoch: 1/3...  Training Step: 160...  Training loss: 3.0859...  0.4973 sec/batch
Epoch: 1/3...  Training Step: 161...  Training loss: 3.0836...  0.5073 sec/batch
Epoch: 1/3...  Training Step: 162...  Training loss: 3.0660...  0.5096 sec/batch
Epoch: 1/3...  Training Step: 163...  Training loss: 3.0575...  0.5172 sec/batch
Epoch: 1/3...  Training Step: 164...  Training loss: 3.0516...  0.5004 sec/batch
Epoch: 1/3...  Training Step: 165...  Training loss: 3.0689...  0.5111 sec/batch
Epoch: 1/3...  Training Step: 166...  Training loss: 3.0731...  0.5052 sec/batch
Epoch: 1/3...  Training Step: 167...  Training loss: 3.0922...  0.5053 sec/batch
Epoch: 1/3...  Training Step: 168...  Training loss: 3.0509...  0.4946 sec/batch
Epoch: 1/3...  Training Step: 169...  Training loss: 3.0757...  0.4999 sec/batch
Epoch: 1/3...  Training Step: 170...  Training loss: 3.0792...  0.5000 sec/batch
Epoch: 1/3...  Training Step: 171...  Training loss: 3.0632...  0.4966 sec/batch
Epoch: 1/3...  Training Step: 172...  Training loss: 3.0606...  0.5202 sec/batch
Epoch: 1/3...  Training Step: 173...  Training loss: 3.0520...  0.5010 sec/batch
Epoch: 1/3...  Training Step: 174...  Training loss: 3.0413...  0.4990 sec/batch
Epoch: 1/3...  Training Step: 175...  Training loss: 3.0465...  0.5187 sec/batch
Epoch: 1/3...  Training Step: 176...  Training loss: 3.0115...  0.5035 sec/batch
Epoch: 1/3...  Training Step: 177...  Training loss: 3.0234...  0.4990 sec/batch
Epoch: 1/3...  Training Step: 178...  Training loss: 3.0479...  0.4998 sec/batch
Epoch: 1/3...  Training Step: 179...  Training loss: 3.0261...  0.5000 sec/batch
Epoch: 1/3...  Training Step: 180...  Training loss: 3.0382...  0.5065 sec/batch
Epoch: 1/3...  Training Step: 181...  Training loss: 3.0056...  0.5268 sec/batch
Epoch: 1/3...  Training Step: 182...  Training loss: 3.0226...  0.5443 sec/batch
Epoch: 1/3...  Training Step: 183...  Training loss: 3.0241...  0.5239 sec/batch
Epoch: 1/3...  Training Step: 184...  Training loss: 3.0322...  0.5027 sec/batch
Epoch: 1/3...  Training Step: 185...  Training loss: 3.0104...  0.5287 sec/batch
Epoch: 1/3...  Training Step: 186...  Training loss: 3.0093...  0.5373 sec/batch
Epoch: 1/3...  Training Step: 187...  Training loss: 2.9884...  0.4952 sec/batch
Epoch: 1/3...  Training Step: 188...  Training loss: 3.0011...  0.5022 sec/batch
Epoch: 1/3...  Training Step: 189...  Training loss: 3.0025...  0.5025 sec/batch
Epoch: 1/3...  Training Step: 190...  Training loss: 2.9902...  0.5002 sec/batch
Epoch: 1/3...  Training Step: 191...  Training loss: 2.9750...  0.5165 sec/batch
Epoch: 1/3...  Training Step: 192...  Training loss: 2.9817...  0.5003 sec/batch
Epoch: 1/3...  Training Step: 193...  Training loss: 2.9633...  0.5155 sec/batch
Epoch: 1/3...  Training Step: 194...  Training loss: 2.9843...  0.5109 sec/batch
Epoch: 1/3...  Training Step: 195...  Training loss: 2.9262...  0.4889 sec/batch
Epoch: 1/3...  Training Step: 196...  Training loss: 2.9407...  0.5030 sec/batch
Epoch: 1/3...  Training Step: 197...  Training loss: 2.9243...  0.4989 sec/batch
Epoch: 1/3...  Training Step: 198...  Training loss: 2.9258...  0.5188 sec/batch
Epoch: 1/3...  Training Step: 199...  Training loss: 2.9501...  0.5136 sec/batch
Epoch: 1/3...  Training Step: 200...  Training loss: 2.9362...  0.5001 sec/batch
Epoch: 1/3...  Training Step: 201...  Training loss: 2.9081...  0.5167 sec/batch
Epoch: 1/3...  Training Step: 202...  Training loss: 2.9295...  0.5158 sec/batch
Epoch: 1/3...  Training Step: 203...  Training loss: 2.9601...  0.5142 sec/batch
Epoch: 1/3...  Training Step: 204...  Training loss: 2.9409...  0.5321 sec/batch
Epoch: 1/3...  Training Step: 205...  Training loss: 2.9040...  0.5576 sec/batch
Epoch: 1/3...  Training Step: 206...  Training loss: 2.8826...  0.5224 sec/batch
Epoch: 1/3...  Training Step: 207...  Training loss: 2.8515...  0.5209 sec/batch
Epoch: 1/3...  Training Step: 208...  Training loss: 2.8641...  0.5004 sec/batch
Epoch: 1/3...  Training Step: 209...  Training loss: 2.8108...  0.5354 sec/batch
Epoch: 1/3...  Training Step: 210...  Training loss: 2.8172...  0.5330 sec/batch
Epoch: 1/3...  Training Step: 211...  Training loss: 2.8143...  0.5537 sec/batch
Epoch: 1/3...  Training Step: 212...  Training loss: 2.8492...  0.5153 sec/batch
Epoch: 1/3...  Training Step: 213...  Training loss: 2.8509...  0.5006 sec/batch
Epoch: 1/3...  Training Step: 214...  Training loss: 2.7834...  0.5235 sec/batch
Epoch: 1/3...  Training Step: 215...  Training loss: 2.7728...  0.5459 sec/batch
Epoch: 1/3...  Training Step: 216...  Training loss: 2.7916...  0.5003 sec/batch
Epoch: 1/3...  Training Step: 217...  Training loss: 2.7676...  0.5529 sec/batch
Epoch: 1/3...  Training Step: 218...  Training loss: 2.7732...  0.5198 sec/batch
Epoch: 1/3...  Training Step: 219...  Training loss: 2.7472...  0.5414 sec/batch
Epoch: 1/3...  Training Step: 220...  Training loss: 2.7525...  0.5472 sec/batch
Epoch: 1/3...  Training Step: 221...  Training loss: 2.7333...  0.5251 sec/batch
Epoch: 1/3...  Training Step: 222...  Training loss: 2.7393...  0.5273 sec/batch
Epoch: 1/3...  Training Step: 223...  Training loss: 2.7280...  0.5225 sec/batch
Epoch: 1/3...  Training Step: 224...  Training loss: 2.7421...  0.5381 sec/batch
Epoch: 1/3...  Training Step: 225...  Training loss: 2.7122...  0.5489 sec/batch
Epoch: 1/3...  Training Step: 226...  Training loss: 2.6896...  0.5211 sec/batch
Epoch: 1/3...  Training Step: 227...  Training loss: 2.6976...  0.5634 sec/batch
Epoch: 1/3...  Training Step: 228...  Training loss: 2.7139...  0.5165 sec/batch
Epoch: 1/3...  Training Step: 229...  Training loss: 2.7311...  0.5877 sec/batch
Epoch: 1/3...  Training Step: 230...  Training loss: 2.7034...  0.5263 sec/batch
Epoch: 1/3...  Training Step: 231...  Training loss: 2.6705...  0.5704 sec/batch
Epoch: 1/3...  Training Step: 232...  Training loss: 2.7066...  0.5532 sec/batch
Epoch: 1/3...  Training Step: 233...  Training loss: 2.6796...  0.5245 sec/batch
Epoch: 1/3...  Training Step: 234...  Training loss: 2.6540...  0.5021 sec/batch
Epoch: 1/3...  Training Step: 235...  Training loss: 2.6614...  0.5168 sec/batch
Epoch: 1/3...  Training Step: 236...  Training loss: 2.6449...  0.5177 sec/batch
Epoch: 1/3...  Training Step: 237...  Training loss: 2.6534...  0.5253 sec/batch
Epoch: 1/3...  Training Step: 238...  Training loss: 2.6438...  0.5362 sec/batch
Epoch: 1/3...  Training Step: 239...  Training loss: 2.6266...  0.5714 sec/batch
Epoch: 1/3...  Training Step: 240...  Training loss: 2.6088...  0.5194 sec/batch
Epoch: 1/3...  Training Step: 241...  Training loss: 2.6285...  0.5011 sec/batch
Epoch: 1/3...  Training Step: 242...  Training loss: 2.6038...  0.5217 sec/batch
Epoch: 2/3...  Training Step: 243...  Training loss: 2.6573...  0.4987 sec/batch
Epoch: 2/3...  Training Step: 244...  Training loss: 2.5916...  0.5150 sec/batch
Epoch: 2/3...  Training Step: 245...  Training loss: 2.5477...  0.4984 sec/batch
Epoch: 2/3...  Training Step: 246...  Training loss: 2.5658...  0.5082 sec/batch
Epoch: 2/3...  Training Step: 247...  Training loss: 2.5518...  0.4960 sec/batch
Epoch: 2/3...  Training Step: 248...  Training loss: 2.5825...  0.5165 sec/batch
Epoch: 2/3...  Training Step: 249...  Training loss: 2.5567...  0.5069 sec/batch
Epoch: 2/3...  Training Step: 250...  Training loss: 2.5663...  0.4926 sec/batch
Epoch: 2/3...  Training Step: 251...  Training loss: 2.5484...  0.4982 sec/batch
Epoch: 2/3...  Training Step: 252...  Training loss: 2.5474...  0.5027 sec/batch
Epoch: 2/3...  Training Step: 253...  Training loss: 2.5844...  0.5009 sec/batch
Epoch: 2/3...  Training Step: 254...  Training loss: 2.5271...  0.5787 sec/batch
Epoch: 2/3...  Training Step: 255...  Training loss: 2.5323...  0.5553 sec/batch
Epoch: 2/3...  Training Step: 256...  Training loss: 2.5198...  0.5342 sec/batch
Epoch: 2/3...  Training Step: 257...  Training loss: 2.5359...  0.5010 sec/batch
Epoch: 2/3...  Training Step: 258...  Training loss: 2.5187...  0.5044 sec/batch
Epoch: 2/3...  Training Step: 259...  Training loss: 2.5017...  0.5144 sec/batch
Epoch: 2/3...  Training Step: 260...  Training loss: 2.5158...  0.5222 sec/batch
Epoch: 2/3...  Training Step: 261...  Training loss: 2.4998...  0.5348 sec/batch
Epoch: 2/3...  Training Step: 262...  Training loss: 2.4957...  0.5350 sec/batch
Epoch: 2/3...  Training Step: 263...  Training loss: 2.4719...  0.5418 sec/batch
Epoch: 2/3...  Training Step: 264...  Training loss: 2.4987...  0.8725 sec/batch
Epoch: 2/3...  Training Step: 265...  Training loss: 2.4917...  0.6263 sec/batch
Epoch: 2/3...  Training Step: 266...  Training loss: 2.5040...  0.5268 sec/batch
Epoch: 2/3...  Training Step: 267...  Training loss: 2.5027...  0.5219 sec/batch
Epoch: 2/3...  Training Step: 268...  Training loss: 2.4636...  0.5528 sec/batch
Epoch: 2/3...  Training Step: 269...  Training loss: 2.4655...  0.5623 sec/batch
Epoch: 2/3...  Training Step: 270...  Training loss: 2.4880...  0.6635 sec/batch
Epoch: 2/3...  Training Step: 271...  Training loss: 2.5139...  0.5658 sec/batch
Epoch: 2/3...  Training Step: 272...  Training loss: 2.4816...  0.5540 sec/batch
Epoch: 2/3...  Training Step: 273...  Training loss: 2.4702...  0.5753 sec/batch
Epoch: 2/3...  Training Step: 274...  Training loss: 2.4756...  0.5352 sec/batch
Epoch: 2/3...  Training Step: 275...  Training loss: 2.4949...  0.5213 sec/batch
Epoch: 2/3...  Training Step: 276...  Training loss: 2.5050...  0.5301 sec/batch
Epoch: 2/3...  Training Step: 277...  Training loss: 2.4932...  0.5286 sec/batch
Epoch: 2/3...  Training Step: 278...  Training loss: 2.4426...  0.5850 sec/batch
Epoch: 2/3...  Training Step: 279...  Training loss: 2.4636...  0.5756 sec/batch
Epoch: 2/3...  Training Step: 280...  Training loss: 2.4408...  0.5385 sec/batch
Epoch: 2/3...  Training Step: 281...  Training loss: 2.4649...  0.5248 sec/batch
Epoch: 2/3...  Training Step: 282...  Training loss: 2.4381...  0.5635 sec/batch
Epoch: 2/3...  Training Step: 283...  Training loss: 2.4357...  0.5165 sec/batch
Epoch: 2/3...  Training Step: 284...  Training loss: 2.4177...  0.5651 sec/batch
Epoch: 2/3...  Training Step: 285...  Training loss: 2.4532...  0.5782 sec/batch
Epoch: 2/3...  Training Step: 286...  Training loss: 2.4276...  0.6456 sec/batch
Epoch: 2/3...  Training Step: 287...  Training loss: 2.4419...  0.6069 sec/batch
Epoch: 2/3...  Training Step: 288...  Training loss: 2.4172...  0.5552 sec/batch
Epoch: 2/3...  Training Step: 289...  Training loss: 2.4218...  0.5303 sec/batch
Epoch: 2/3...  Training Step: 290...  Training loss: 2.4232...  0.5356 sec/batch
Epoch: 2/3...  Training Step: 291...  Training loss: 2.4210...  0.5302 sec/batch
Epoch: 2/3...  Training Step: 292...  Training loss: 2.4193...  0.5057 sec/batch
Epoch: 2/3...  Training Step: 293...  Training loss: 2.4354...  0.5308 sec/batch
Epoch: 2/3...  Training Step: 294...  Training loss: 2.4179...  0.5288 sec/batch
Epoch: 2/3...  Training Step: 295...  Training loss: 2.4059...  0.5281 sec/batch
Epoch: 2/3...  Training Step: 296...  Training loss: 2.4532...  0.6025 sec/batch
Epoch: 2/3...  Training Step: 297...  Training loss: 2.3959...  0.5718 sec/batch
Epoch: 2/3...  Training Step: 298...  Training loss: 2.3726...  0.5160 sec/batch
Epoch: 2/3...  Training Step: 299...  Training loss: 2.4056...  0.5808 sec/batch
Epoch: 2/3...  Training Step: 300...  Training loss: 2.3562...  0.5973 sec/batch
Epoch: 2/3...  Training Step: 301...  Training loss: 2.3654...  0.5500 sec/batch
Epoch: 2/3...  Training Step: 302...  Training loss: 2.3741...  0.5429 sec/batch
Epoch: 2/3...  Training Step: 303...  Training loss: 2.3708...  0.5238 sec/batch
Epoch: 2/3...  Training Step: 304...  Training loss: 2.3708...  0.5634 sec/batch
Epoch: 2/3...  Training Step: 305...  Training loss: 2.3956...  0.5416 sec/batch
Epoch: 2/3...  Training Step: 306...  Training loss: 2.3539...  0.5331 sec/batch
Epoch: 2/3...  Training Step: 307...  Training loss: 2.3797...  0.5338 sec/batch
Epoch: 2/3...  Training Step: 308...  Training loss: 2.3650...  0.5250 sec/batch
Epoch: 2/3...  Training Step: 309...  Training loss: 2.3444...  0.5268 sec/batch
Epoch: 2/3...  Training Step: 310...  Training loss: 2.3475...  0.5318 sec/batch
Epoch: 2/3...  Training Step: 311...  Training loss: 2.3727...  0.5516 sec/batch
Epoch: 2/3...  Training Step: 312...  Training loss: 2.3526...  0.5314 sec/batch
Epoch: 2/3...  Training Step: 313...  Training loss: 2.3594...  0.5710 sec/batch
Epoch: 2/3...  Training Step: 314...  Training loss: 2.3181...  0.5563 sec/batch
Epoch: 2/3...  Training Step: 315...  Training loss: 2.3397...  0.5707 sec/batch
Epoch: 2/3...  Training Step: 316...  Training loss: 2.3278...  0.5729 sec/batch
Epoch: 2/3...  Training Step: 317...  Training loss: 2.3207...  0.5380 sec/batch
Epoch: 2/3...  Training Step: 318...  Training loss: 2.3247...  0.5191 sec/batch
Epoch: 2/3...  Training Step: 319...  Training loss: 2.3338...  0.7567 sec/batch
Epoch: 2/3...  Training Step: 320...  Training loss: 2.3206...  0.8630 sec/batch
Epoch: 2/3...  Training Step: 321...  Training loss: 2.3348...  1.3790 sec/batch
Epoch: 2/3...  Training Step: 322...  Training loss: 2.3178...  0.7209 sec/batch
Epoch: 2/3...  Training Step: 323...  Training loss: 2.3502...  0.5994 sec/batch
Epoch: 2/3...  Training Step: 324...  Training loss: 2.3673...  0.6223 sec/batch
Epoch: 2/3...  Training Step: 325...  Training loss: 2.3271...  0.6655 sec/batch
Epoch: 2/3...  Training Step: 326...  Training loss: 2.3190...  0.6613 sec/batch
Epoch: 2/3...  Training Step: 327...  Training loss: 2.3121...  0.5743 sec/batch
Epoch: 2/3...  Training Step: 328...  Training loss: 2.3016...  0.5944 sec/batch
Epoch: 2/3...  Training Step: 329...  Training loss: 2.3538...  0.5809 sec/batch
Epoch: 2/3...  Training Step: 330...  Training loss: 2.3068...  0.6070 sec/batch
Epoch: 2/3...  Training Step: 331...  Training loss: 2.3426...  0.5959 sec/batch
Epoch: 2/3...  Training Step: 332...  Training loss: 2.2900...  0.5943 sec/batch
Epoch: 2/3...  Training Step: 333...  Training loss: 2.2629...  0.5865 sec/batch
Epoch: 2/3...  Training Step: 334...  Training loss: 2.3039...  0.5832 sec/batch
Epoch: 2/3...  Training Step: 335...  Training loss: 2.2826...  0.5822 sec/batch
Epoch: 2/3...  Training Step: 336...  Training loss: 2.2933...  0.8881 sec/batch
Epoch: 2/3...  Training Step: 337...  Training loss: 2.3136...  0.5998 sec/batch
Epoch: 2/3...  Training Step: 338...  Training loss: 2.2825...  0.5505 sec/batch
Epoch: 2/3...  Training Step: 339...  Training loss: 2.3044...  0.5704 sec/batch
Epoch: 2/3...  Training Step: 340...  Training loss: 2.2478...  0.6182 sec/batch
Epoch: 2/3...  Training Step: 341...  Training loss: 2.2687...  0.6129 sec/batch
Epoch: 2/3...  Training Step: 342...  Training loss: 2.2556...  0.5623 sec/batch
Epoch: 2/3...  Training Step: 343...  Training loss: 2.2510...  0.5510 sec/batch
Epoch: 2/3...  Training Step: 344...  Training loss: 2.2570...  0.5276 sec/batch
Epoch: 2/3...  Training Step: 345...  Training loss: 2.2692...  0.5238 sec/batch
Epoch: 2/3...  Training Step: 346...  Training loss: 2.2672...  0.5633 sec/batch
Epoch: 2/3...  Training Step: 347...  Training loss: 2.2261...  0.6164 sec/batch
Epoch: 2/3...  Training Step: 348...  Training loss: 2.2586...  1.5226 sec/batch
Epoch: 2/3...  Training Step: 349...  Training loss: 2.2799...  0.7989 sec/batch
Epoch: 2/3...  Training Step: 350...  Training loss: 2.2671...  0.6831 sec/batch
Epoch: 2/3...  Training Step: 351...  Training loss: 2.2187...  0.6488 sec/batch
Epoch: 2/3...  Training Step: 352...  Training loss: 2.2438...  0.6041 sec/batch
Epoch: 2/3...  Training Step: 353...  Training loss: 2.2259...  0.5381 sec/batch
Epoch: 2/3...  Training Step: 354...  Training loss: 2.2514...  0.5396 sec/batch
Epoch: 2/3...  Training Step: 355...  Training loss: 2.2320...  0.6262 sec/batch
Epoch: 2/3...  Training Step: 356...  Training loss: 2.2345...  0.5793 sec/batch
Epoch: 2/3...  Training Step: 357...  Training loss: 2.2616...  0.5617 sec/batch
Epoch: 2/3...  Training Step: 358...  Training loss: 2.2356...  0.5320 sec/batch
Epoch: 2/3...  Training Step: 359...  Training loss: 2.2335...  0.5832 sec/batch
Epoch: 2/3...  Training Step: 360...  Training loss: 2.2128...  0.6264 sec/batch
Epoch: 2/3...  Training Step: 361...  Training loss: 2.2365...  0.6484 sec/batch
Epoch: 2/3...  Training Step: 362...  Training loss: 2.2502...  0.5969 sec/batch
Epoch: 2/3...  Training Step: 363...  Training loss: 2.2046...  0.5561 sec/batch
Epoch: 2/3...  Training Step: 364...  Training loss: 2.2006...  0.5567 sec/batch
Epoch: 2/3...  Training Step: 365...  Training loss: 2.2559...  0.5442 sec/batch
Epoch: 2/3...  Training Step: 366...  Training loss: 2.2188...  0.5630 sec/batch
Epoch: 2/3...  Training Step: 367...  Training loss: 2.2290...  0.5454 sec/batch
Epoch: 2/3...  Training Step: 368...  Training loss: 2.2385...  0.5536 sec/batch
Epoch: 2/3...  Training Step: 369...  Training loss: 2.1787...  0.5250 sec/batch
Epoch: 2/3...  Training Step: 370...  Training loss: 2.2250...  0.5592 sec/batch
Epoch: 2/3...  Training Step: 371...  Training loss: 2.1939...  0.5329 sec/batch
Epoch: 2/3...  Training Step: 372...  Training loss: 2.1905...  0.6303 sec/batch
Epoch: 2/3...  Training Step: 373...  Training loss: 2.1891...  0.5889 sec/batch
Epoch: 2/3...  Training Step: 374...  Training loss: 2.1968...  0.5413 sec/batch
Epoch: 2/3...  Training Step: 375...  Training loss: 2.1700...  0.5418 sec/batch
Epoch: 2/3...  Training Step: 376...  Training loss: 2.1881...  0.5665 sec/batch
Epoch: 2/3...  Training Step: 377...  Training loss: 2.1693...  0.6130 sec/batch
Epoch: 2/3...  Training Step: 378...  Training loss: 2.2040...  0.5311 sec/batch
Epoch: 2/3...  Training Step: 379...  Training loss: 2.1461...  0.5158 sec/batch
Epoch: 2/3...  Training Step: 380...  Training loss: 2.1436...  0.5325 sec/batch
Epoch: 2/3...  Training Step: 381...  Training loss: 2.1492...  0.5185 sec/batch
Epoch: 2/3...  Training Step: 382...  Training loss: 2.1574...  0.5402 sec/batch
Epoch: 2/3...  Training Step: 383...  Training loss: 2.1617...  0.5286 sec/batch
Epoch: 2/3...  Training Step: 384...  Training loss: 2.1457...  0.5558 sec/batch
Epoch: 2/3...  Training Step: 385...  Training loss: 2.2023...  0.5674 sec/batch
Epoch: 2/3...  Training Step: 386...  Training loss: 2.1515...  0.5774 sec/batch
Epoch: 2/3...  Training Step: 387...  Training loss: 2.1884...  0.5430 sec/batch
Epoch: 2/3...  Training Step: 388...  Training loss: 2.1541...  0.5644 sec/batch
Epoch: 2/3...  Training Step: 389...  Training loss: 2.1331...  0.5649 sec/batch
Epoch: 2/3...  Training Step: 390...  Training loss: 2.1630...  0.5706 sec/batch
Epoch: 2/3...  Training Step: 391...  Training loss: 2.1415...  0.5470 sec/batch
Epoch: 2/3...  Training Step: 392...  Training loss: 2.1857...  0.6315 sec/batch
Epoch: 2/3...  Training Step: 393...  Training loss: 2.1743...  0.6039 sec/batch
Epoch: 2/3...  Training Step: 394...  Training loss: 2.1630...  0.5713 sec/batch
Epoch: 2/3...  Training Step: 395...  Training loss: 2.1516...  0.5173 sec/batch
Epoch: 2/3...  Training Step: 396...  Training loss: 2.1406...  0.5330 sec/batch
Epoch: 2/3...  Training Step: 397...  Training loss: 2.1348...  0.5364 sec/batch
Epoch: 2/3...  Training Step: 398...  Training loss: 2.1101...  0.5166 sec/batch
Epoch: 2/3...  Training Step: 399...  Training loss: 2.1272...  0.5315 sec/batch
Epoch: 2/3...  Training Step: 400...  Training loss: 2.1313...  0.5168 sec/batch
Epoch: 2/3...  Training Step: 401...  Training loss: 2.1428...  0.5077 sec/batch
Epoch: 2/3...  Training Step: 402...  Training loss: 2.1242...  0.5076 sec/batch
Epoch: 2/3...  Training Step: 403...  Training loss: 2.1104...  0.5166 sec/batch
Epoch: 2/3...  Training Step: 404...  Training loss: 2.1432...  0.5225 sec/batch
Epoch: 2/3...  Training Step: 405...  Training loss: 2.1205...  0.5155 sec/batch
Epoch: 2/3...  Training Step: 406...  Training loss: 2.1056...  0.5007 sec/batch
Epoch: 2/3...  Training Step: 407...  Training loss: 2.1128...  0.5159 sec/batch
Epoch: 2/3...  Training Step: 408...  Training loss: 2.1303...  0.5154 sec/batch
Epoch: 2/3...  Training Step: 409...  Training loss: 2.1528...  0.5224 sec/batch
Epoch: 2/3...  Training Step: 410...  Training loss: 2.1102...  0.5323 sec/batch
Epoch: 2/3...  Training Step: 411...  Training loss: 2.1207...  0.5552 sec/batch
Epoch: 2/3...  Training Step: 412...  Training loss: 2.1493...  0.5519 sec/batch
Epoch: 2/3...  Training Step: 413...  Training loss: 2.0765...  0.5261 sec/batch
Epoch: 2/3...  Training Step: 414...  Training loss: 2.1198...  0.5295 sec/batch
Epoch: 2/3...  Training Step: 415...  Training loss: 2.1163...  0.5361 sec/batch
Epoch: 2/3...  Training Step: 416...  Training loss: 2.0924...  0.5535 sec/batch
Epoch: 2/3...  Training Step: 417...  Training loss: 2.1032...  0.5324 sec/batch
Epoch: 2/3...  Training Step: 418...  Training loss: 2.0813...  0.5695 sec/batch
Epoch: 2/3...  Training Step: 419...  Training loss: 2.1038...  0.5372 sec/batch
Epoch: 2/3...  Training Step: 420...  Training loss: 2.0747...  0.5614 sec/batch
Epoch: 2/3...  Training Step: 421...  Training loss: 2.0939...  0.5168 sec/batch
Epoch: 2/3...  Training Step: 422...  Training loss: 2.0893...  0.5299 sec/batch
Epoch: 2/3...  Training Step: 423...  Training loss: 2.0819...  0.5185 sec/batch
Epoch: 2/3...  Training Step: 424...  Training loss: 2.0860...  0.5323 sec/batch
Epoch: 2/3...  Training Step: 425...  Training loss: 2.0971...  0.5053 sec/batch
Epoch: 2/3...  Training Step: 426...  Training loss: 2.1053...  0.5287 sec/batch
Epoch: 2/3...  Training Step: 427...  Training loss: 2.1047...  0.5173 sec/batch
Epoch: 2/3...  Training Step: 428...  Training loss: 2.0757...  0.5203 sec/batch
Epoch: 2/3...  Training Step: 429...  Training loss: 2.0848...  0.5171 sec/batch
Epoch: 2/3...  Training Step: 430...  Training loss: 2.1258...  0.5576 sec/batch
Epoch: 2/3...  Training Step: 431...  Training loss: 2.0898...  0.5229 sec/batch
Epoch: 2/3...  Training Step: 432...  Training loss: 2.1237...  0.5070 sec/batch
Epoch: 2/3...  Training Step: 433...  Training loss: 2.0806...  0.5316 sec/batch
Epoch: 2/3...  Training Step: 434...  Training loss: 2.0727...  0.5376 sec/batch
Epoch: 2/3...  Training Step: 435...  Training loss: 2.0607...  0.5163 sec/batch
Epoch: 2/3...  Training Step: 436...  Training loss: 2.0716...  0.5409 sec/batch
Epoch: 2/3...  Training Step: 437...  Training loss: 2.0499...  0.5789 sec/batch
Epoch: 2/3...  Training Step: 438...  Training loss: 2.0704...  0.5536 sec/batch
Epoch: 2/3...  Training Step: 439...  Training loss: 2.0768...  0.5320 sec/batch
Epoch: 2/3...  Training Step: 440...  Training loss: 2.0657...  0.5384 sec/batch
Epoch: 2/3...  Training Step: 441...  Training loss: 2.1020...  0.5608 sec/batch
Epoch: 2/3...  Training Step: 442...  Training loss: 2.1037...  0.6280 sec/batch
Epoch: 2/3...  Training Step: 443...  Training loss: 2.0824...  0.5259 sec/batch
Epoch: 2/3...  Training Step: 444...  Training loss: 2.1136...  0.5053 sec/batch
Epoch: 2/3...  Training Step: 445...  Training loss: 2.1537...  0.5317 sec/batch
Epoch: 2/3...  Training Step: 446...  Training loss: 2.1541...  0.5186 sec/batch
Epoch: 2/3...  Training Step: 447...  Training loss: 2.1027...  0.5304 sec/batch
Epoch: 2/3...  Training Step: 448...  Training loss: 2.0515...  0.5204 sec/batch
Epoch: 2/3...  Training Step: 449...  Training loss: 2.0443...  0.5599 sec/batch
Epoch: 2/3...  Training Step: 450...  Training loss: 2.0504...  0.5807 sec/batch
Epoch: 2/3...  Training Step: 451...  Training loss: 2.0040...  0.5536 sec/batch
Epoch: 2/3...  Training Step: 452...  Training loss: 2.0325...  0.5472 sec/batch
Epoch: 2/3...  Training Step: 453...  Training loss: 2.0352...  0.5325 sec/batch
Epoch: 2/3...  Training Step: 454...  Training loss: 2.0822...  0.5271 sec/batch
Epoch: 2/3...  Training Step: 455...  Training loss: 2.0987...  0.5196 sec/batch
Epoch: 2/3...  Training Step: 456...  Training loss: 2.0028...  0.5213 sec/batch
Epoch: 2/3...  Training Step: 457...  Training loss: 2.0156...  0.5164 sec/batch
Epoch: 2/3...  Training Step: 458...  Training loss: 2.0488...  0.5323 sec/batch
Epoch: 2/3...  Training Step: 459...  Training loss: 2.0403...  0.5155 sec/batch
Epoch: 2/3...  Training Step: 460...  Training loss: 2.0284...  0.5204 sec/batch
Epoch: 2/3...  Training Step: 461...  Training loss: 2.0202...  0.5184 sec/batch
Epoch: 2/3...  Training Step: 462...  Training loss: 2.0291...  0.5315 sec/batch
Epoch: 2/3...  Training Step: 463...  Training loss: 1.9876...  0.5322 sec/batch
Epoch: 2/3...  Training Step: 464...  Training loss: 2.0663...  0.5377 sec/batch
Epoch: 2/3...  Training Step: 465...  Training loss: 2.0348...  0.5322 sec/batch
Epoch: 2/3...  Training Step: 466...  Training loss: 2.0283...  0.5161 sec/batch
Epoch: 2/3...  Training Step: 467...  Training loss: 2.0191...  0.5376 sec/batch
Epoch: 2/3...  Training Step: 468...  Training loss: 2.0212...  0.5299 sec/batch
Epoch: 2/3...  Training Step: 469...  Training loss: 2.0403...  0.5321 sec/batch
Epoch: 2/3...  Training Step: 470...  Training loss: 2.0316...  0.5206 sec/batch
Epoch: 2/3...  Training Step: 471...  Training loss: 2.0727...  0.5338 sec/batch
Epoch: 2/3...  Training Step: 472...  Training loss: 2.0291...  0.5377 sec/batch
Epoch: 2/3...  Training Step: 473...  Training loss: 2.0153...  0.5239 sec/batch
Epoch: 2/3...  Training Step: 474...  Training loss: 2.0460...  0.5332 sec/batch
Epoch: 2/3...  Training Step: 475...  Training loss: 2.0189...  0.5194 sec/batch
Epoch: 2/3...  Training Step: 476...  Training loss: 2.0089...  0.5343 sec/batch
Epoch: 2/3...  Training Step: 477...  Training loss: 2.0217...  0.5327 sec/batch
Epoch: 2/3...  Training Step: 478...  Training loss: 1.9985...  0.5251 sec/batch
Epoch: 2/3...  Training Step: 479...  Training loss: 2.0191...  0.5187 sec/batch
Epoch: 2/3...  Training Step: 480...  Training loss: 2.0477...  0.5198 sec/batch
Epoch: 2/3...  Training Step: 481...  Training loss: 2.0033...  0.5161 sec/batch
Epoch: 2/3...  Training Step: 482...  Training loss: 1.9717...  0.5219 sec/batch
Epoch: 2/3...  Training Step: 483...  Training loss: 2.0184...  0.5413 sec/batch
Epoch: 2/3...  Training Step: 484...  Training loss: 1.9607...  0.5210 sec/batch
Epoch: 3/3...  Training Step: 485...  Training loss: 2.0466...  0.5328 sec/batch
Epoch: 3/3...  Training Step: 486...  Training loss: 1.9957...  0.5208 sec/batch
Epoch: 3/3...  Training Step: 487...  Training loss: 1.9402...  0.5302 sec/batch
Epoch: 3/3...  Training Step: 488...  Training loss: 1.9465...  0.5385 sec/batch
Epoch: 3/3...  Training Step: 489...  Training loss: 1.9554...  0.5155 sec/batch
Epoch: 3/3...  Training Step: 490...  Training loss: 2.0180...  0.5133 sec/batch
Epoch: 3/3...  Training Step: 491...  Training loss: 1.9810...  0.5407 sec/batch
Epoch: 3/3...  Training Step: 492...  Training loss: 2.0053...  0.5129 sec/batch
Epoch: 3/3...  Training Step: 493...  Training loss: 1.9819...  0.5352 sec/batch
Epoch: 3/3...  Training Step: 494...  Training loss: 1.9702...  0.5774 sec/batch
Epoch: 3/3...  Training Step: 495...  Training loss: 2.0152...  0.5380 sec/batch
Epoch: 3/3...  Training Step: 496...  Training loss: 1.9679...  0.5779 sec/batch
Epoch: 3/3...  Training Step: 497...  Training loss: 1.9693...  0.5910 sec/batch
Epoch: 3/3...  Training Step: 498...  Training loss: 1.9437...  0.5742 sec/batch
Epoch: 3/3...  Training Step: 499...  Training loss: 1.9693...  0.5507 sec/batch
Epoch: 3/3...  Training Step: 500...  Training loss: 1.9879...  0.5325 sec/batch
Epoch: 3/3...  Training Step: 501...  Training loss: 1.9627...  0.5412 sec/batch
Epoch: 3/3...  Training Step: 502...  Training loss: 1.9588...  0.5390 sec/batch
Epoch: 3/3...  Training Step: 503...  Training loss: 1.9501...  0.5454 sec/batch
Epoch: 3/3...  Training Step: 504...  Training loss: 1.9396...  0.5475 sec/batch
Epoch: 3/3...  Training Step: 505...  Training loss: 1.9307...  0.5832 sec/batch
Epoch: 3/3...  Training Step: 506...  Training loss: 1.9543...  0.5487 sec/batch
Epoch: 3/3...  Training Step: 507...  Training loss: 1.9593...  0.5757 sec/batch
Epoch: 3/3...  Training Step: 508...  Training loss: 1.9705...  0.5579 sec/batch
Epoch: 3/3...  Training Step: 509...  Training loss: 1.9353...  0.5710 sec/batch
Epoch: 3/3...  Training Step: 510...  Training loss: 1.8865...  0.5340 sec/batch
Epoch: 3/3...  Training Step: 511...  Training loss: 1.9244...  0.5348 sec/batch
Epoch: 3/3...  Training Step: 512...  Training loss: 1.9651...  0.5326 sec/batch
Epoch: 3/3...  Training Step: 513...  Training loss: 1.9735...  0.5502 sec/batch
Epoch: 3/3...  Training Step: 514...  Training loss: 1.9274...  0.5316 sec/batch
Epoch: 3/3...  Training Step: 515...  Training loss: 1.9533...  0.5343 sec/batch
Epoch: 3/3...  Training Step: 516...  Training loss: 1.9619...  0.5327 sec/batch
Epoch: 3/3...  Training Step: 517...  Training loss: 1.9787...  0.5323 sec/batch
Epoch: 3/3...  Training Step: 518...  Training loss: 1.9699...  0.5605 sec/batch
Epoch: 3/3...  Training Step: 519...  Training loss: 1.9681...  0.5499 sec/batch
Epoch: 3/3...  Training Step: 520...  Training loss: 1.9334...  0.5269 sec/batch
Epoch: 3/3...  Training Step: 521...  Training loss: 1.9364...  0.5501 sec/batch
Epoch: 3/3...  Training Step: 522...  Training loss: 1.9115...  0.5677 sec/batch
Epoch: 3/3...  Training Step: 523...  Training loss: 1.9445...  0.5478 sec/batch
Epoch: 3/3...  Training Step: 524...  Training loss: 1.9407...  0.5367 sec/batch
Epoch: 3/3...  Training Step: 525...  Training loss: 1.9080...  0.5321 sec/batch
Epoch: 3/3...  Training Step: 526...  Training loss: 1.9185...  0.5516 sec/batch
Epoch: 3/3...  Training Step: 527...  Training loss: 1.9238...  0.5668 sec/batch
Epoch: 3/3...  Training Step: 528...  Training loss: 1.9223...  0.5542 sec/batch
Epoch: 3/3...  Training Step: 529...  Training loss: 1.9358...  0.5698 sec/batch
Epoch: 3/3...  Training Step: 530...  Training loss: 1.9019...  0.5600 sec/batch
Epoch: 3/3...  Training Step: 531...  Training loss: 1.9108...  0.5374 sec/batch
Epoch: 3/3...  Training Step: 532...  Training loss: 1.9062...  0.5180 sec/batch
Epoch: 3/3...  Training Step: 533...  Training loss: 1.9078...  0.5320 sec/batch
Epoch: 3/3...  Training Step: 534...  Training loss: 1.9188...  0.5507 sec/batch
Epoch: 3/3...  Training Step: 535...  Training loss: 1.9293...  0.5485 sec/batch
Epoch: 3/3...  Training Step: 536...  Training loss: 1.9148...  0.5543 sec/batch
Epoch: 3/3...  Training Step: 537...  Training loss: 1.9361...  0.5654 sec/batch
Epoch: 3/3...  Training Step: 538...  Training loss: 1.9580...  0.5802 sec/batch
Epoch: 3/3...  Training Step: 539...  Training loss: 1.9507...  0.5385 sec/batch
Epoch: 3/3...  Training Step: 540...  Training loss: 1.8964...  0.5654 sec/batch
Epoch: 3/3...  Training Step: 541...  Training loss: 1.9274...  0.5699 sec/batch
Epoch: 3/3...  Training Step: 542...  Training loss: 1.8837...  0.5470 sec/batch
Epoch: 3/3...  Training Step: 543...  Training loss: 1.9146...  0.5371 sec/batch
Epoch: 3/3...  Training Step: 544...  Training loss: 1.9144...  0.5315 sec/batch
Epoch: 3/3...  Training Step: 545...  Training loss: 1.8953...  0.5345 sec/batch
Epoch: 3/3...  Training Step: 546...  Training loss: 1.8988...  0.5513 sec/batch
Epoch: 3/3...  Training Step: 547...  Training loss: 1.9225...  0.5770 sec/batch
Epoch: 3/3...  Training Step: 548...  Training loss: 1.8931...  0.5647 sec/batch
Epoch: 3/3...  Training Step: 549...  Training loss: 1.9206...  0.5621 sec/batch
Epoch: 3/3...  Training Step: 550...  Training loss: 1.9221...  0.5896 sec/batch
Epoch: 3/3...  Training Step: 551...  Training loss: 1.9064...  0.5682 sec/batch
Epoch: 3/3...  Training Step: 552...  Training loss: 1.9229...  0.5830 sec/batch
Epoch: 3/3...  Training Step: 553...  Training loss: 1.8975...  0.6209 sec/batch
Epoch: 3/3...  Training Step: 554...  Training loss: 1.9047...  0.6185 sec/batch
Epoch: 3/3...  Training Step: 555...  Training loss: 1.8871...  0.5784 sec/batch
Epoch: 3/3...  Training Step: 556...  Training loss: 1.8622...  0.5948 sec/batch
Epoch: 3/3...  Training Step: 557...  Training loss: 1.9102...  0.5930 sec/batch
Epoch: 3/3...  Training Step: 558...  Training loss: 1.8646...  0.5835 sec/batch
Epoch: 3/3...  Training Step: 559...  Training loss: 1.8687...  0.5815 sec/batch
Epoch: 3/3...  Training Step: 560...  Training loss: 1.8826...  0.5690 sec/batch
Epoch: 3/3...  Training Step: 561...  Training loss: 1.8837...  0.5770 sec/batch
Epoch: 3/3...  Training Step: 562...  Training loss: 1.8792...  0.5800 sec/batch
Epoch: 3/3...  Training Step: 563...  Training loss: 1.8989...  0.6180 sec/batch
Epoch: 3/3...  Training Step: 564...  Training loss: 1.9063...  0.5970 sec/batch
Epoch: 3/3...  Training Step: 565...  Training loss: 1.9050...  0.5765 sec/batch
Epoch: 3/3...  Training Step: 566...  Training loss: 1.9372...  0.6180 sec/batch
Epoch: 3/3...  Training Step: 567...  Training loss: 1.8835...  0.5920 sec/batch
Epoch: 3/3...  Training Step: 568...  Training loss: 1.8927...  0.5845 sec/batch
Epoch: 3/3...  Training Step: 569...  Training loss: 1.8623...  0.6130 sec/batch
Epoch: 3/3...  Training Step: 570...  Training loss: 1.8533...  0.5725 sec/batch
Epoch: 3/3...  Training Step: 571...  Training loss: 1.9385...  0.6110 sec/batch
Epoch: 3/3...  Training Step: 572...  Training loss: 1.8603...  0.6045 sec/batch
Epoch: 3/3...  Training Step: 573...  Training loss: 1.9176...  0.5975 sec/batch
Epoch: 3/3...  Training Step: 574...  Training loss: 1.8625...  0.7265 sec/batch
Epoch: 3/3...  Training Step: 575...  Training loss: 1.8673...  0.7735 sec/batch
Epoch: 3/3...  Training Step: 576...  Training loss: 1.8929...  0.7295 sec/batch
Epoch: 3/3...  Training Step: 577...  Training loss: 1.8916...  0.8330 sec/batch
Epoch: 3/3...  Training Step: 578...  Training loss: 1.8727...  0.7815 sec/batch
Epoch: 3/3...  Training Step: 579...  Training loss: 1.9076...  0.7620 sec/batch
Epoch: 3/3...  Training Step: 580...  Training loss: 1.8770...  0.5940 sec/batch
Epoch: 3/3...  Training Step: 581...  Training loss: 1.9127...  0.5800 sec/batch
Epoch: 3/3...  Training Step: 582...  Training loss: 1.8547...  0.5745 sec/batch
Epoch: 3/3...  Training Step: 583...  Training loss: 1.8849...  0.5715 sec/batch
Epoch: 3/3...  Training Step: 584...  Training loss: 1.8392...  0.5655 sec/batch
Epoch: 3/3...  Training Step: 585...  Training loss: 1.8518...  0.5705 sec/batch
Epoch: 3/3...  Training Step: 586...  Training loss: 1.8531...  0.5770 sec/batch
Epoch: 3/3...  Training Step: 587...  Training loss: 1.8444...  0.5395 sec/batch
Epoch: 3/3...  Training Step: 588...  Training loss: 1.8727...  0.5510 sec/batch
Epoch: 3/3...  Training Step: 589...  Training loss: 1.8440...  0.5510 sec/batch
Epoch: 3/3...  Training Step: 590...  Training loss: 1.8541...  0.5405 sec/batch
Epoch: 3/3...  Training Step: 591...  Training loss: 1.9146...  0.5455 sec/batch
Epoch: 3/3...  Training Step: 592...  Training loss: 1.8878...  0.5490 sec/batch
Epoch: 3/3...  Training Step: 593...  Training loss: 1.8151...  0.5570 sec/batch
Epoch: 3/3...  Training Step: 594...  Training loss: 1.8465...  0.5540 sec/batch
Epoch: 3/3...  Training Step: 595...  Training loss: 1.8211...  0.5550 sec/batch
Epoch: 3/3...  Training Step: 596...  Training loss: 1.8825...  0.5600 sec/batch
Epoch: 3/3...  Training Step: 597...  Training loss: 1.8370...  0.5675 sec/batch
Epoch: 3/3...  Training Step: 598...  Training loss: 1.8625...  0.5407 sec/batch
Epoch: 3/3...  Training Step: 599...  Training loss: 1.8895...  0.5650 sec/batch
---------------------------------------------------------------------------
ResourceExhaustedError                    Traceback (most recent call last)
C:\anthony-ide\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _do_call(self, fn, *args)
   1038     try:
-> 1039       return fn(*args)
   1040     except errors.OpError as e:

C:\anthony-ide\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
   1020                                  feed_dict, fetch_list, target_list,
-> 1021                                  status, run_metadata)
   1022 

C:\anthony-ide\Anaconda3\lib\contextlib.py in __exit__(self, type, value, traceback)
     65             try:
---> 66                 next(self.gen)
     67             except StopIteration:

C:\anthony-ide\Anaconda3\lib\site-packages\tensorflow\python\framework\errors_impl.py in raise_exception_on_not_ok_status()
    465           compat.as_text(pywrap_tensorflow.TF_Message(status)),
--> 466           pywrap_tensorflow.TF_GetCode(status))
    467   finally:

ResourceExhaustedError: OOM when allocating tensor with shape[8192,512]
	 [[Node: gradients/MatMul_grad/MatMul = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=true, _device="/job:localhost/replica:0/task:0/gpu:0"](gradients/add_grad/Reshape, softmax/Variable/read)]]

During handling of the above exception, another exception occurred:

ResourceExhaustedError                    Traceback (most recent call last)
<ipython-input-19-185b24228ba9> in <module>()
     30                                                  model.final_state,
     31                                                  model.optimizer], 
---> 32                                                  feed_dict=feed)
     33 
     34             train_writer.add_summary(summary, counter)

C:\anthony-ide\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in run(self, fetches, feed_dict, options, run_metadata)
    776     try:
    777       result = self._run(None, fetches, feed_dict, options_ptr,
--> 778                          run_metadata_ptr)
    779       if run_metadata:
    780         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

C:\anthony-ide\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
    980     if final_fetches or final_targets:
    981       results = self._do_run(handle, final_targets, final_fetches,
--> 982                              feed_dict_string, options, run_metadata)
    983     else:
    984       results = []

C:\anthony-ide\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
   1030     if handle is None:
   1031       return self._do_call(_run_fn, self._session, feed_dict, fetch_list,
-> 1032                            target_list, options, run_metadata)
   1033     else:
   1034       return self._do_call(_prun_fn, self._session, handle, feed_dict,

C:\anthony-ide\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _do_call(self, fn, *args)
   1050         except KeyError:
   1051           pass
-> 1052       raise type(e)(node_def, op, message)
   1053 
   1054   def _extend_graph(self):

ResourceExhaustedError: OOM when allocating tensor with shape[8192,512]
	 [[Node: gradients/MatMul_grad/MatMul = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=true, _device="/job:localhost/replica:0/task:0/gpu:0"](gradients/add_grad/Reshape, softmax/Variable/read)]]

Caused by op 'gradients/MatMul_grad/MatMul', defined at:
  File "C:\anthony-ide\Anaconda3\lib\runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\anthony-ide\Anaconda3\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\anthony-ide\Anaconda3\lib\site-packages\ipykernel\__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "C:\anthony-ide\Anaconda3\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "C:\anthony-ide\Anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 474, in start
    ioloop.IOLoop.instance().start()
  File "C:\anthony-ide\Anaconda3\lib\site-packages\zmq\eventloop\ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "C:\anthony-ide\Anaconda3\lib\site-packages\tornado\ioloop.py", line 887, in start
    handler_func(fd_obj, events)
  File "C:\anthony-ide\Anaconda3\lib\site-packages\tornado\stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\anthony-ide\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "C:\anthony-ide\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "C:\anthony-ide\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "C:\anthony-ide\Anaconda3\lib\site-packages\tornado\stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\anthony-ide\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "C:\anthony-ide\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "C:\anthony-ide\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 390, in execute_request
    user_expressions, allow_stdin)
  File "C:\anthony-ide\Anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\anthony-ide\Anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 501, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\anthony-ide\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\anthony-ide\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "C:\anthony-ide\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-17-84afb0542030>", line 3, in <module>
    learning_rate=learning_rate)
  File "<ipython-input-15-64f5a1a535a5>", line 38, in __init__
    self.optimizer = build_optimizer(self.loss, learning_rate, grad_clip)
  File "<ipython-input-14-b4ac5786ae0d>", line 12, in build_optimizer
    grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), grad_clip)
  File "C:\anthony-ide\Anaconda3\lib\site-packages\tensorflow\python\ops\gradients_impl.py", line 560, in gradients
    grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
  File "C:\anthony-ide\Anaconda3\lib\site-packages\tensorflow\python\ops\gradients_impl.py", line 368, in _MaybeCompile
    return grad_fn()  # Exit early
  File "C:\anthony-ide\Anaconda3\lib\site-packages\tensorflow\python\ops\gradients_impl.py", line 560, in <lambda>
    grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
  File "C:\anthony-ide\Anaconda3\lib\site-packages\tensorflow\python\ops\math_grad.py", line 783, in _MatMulGrad
    grad_a = math_ops.matmul(grad, b, transpose_b=True)
  File "C:\anthony-ide\Anaconda3\lib\site-packages\tensorflow\python\ops\math_ops.py", line 1801, in matmul
    a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name)
  File "C:\anthony-ide\Anaconda3\lib\site-packages\tensorflow\python\ops\gen_math_ops.py", line 1263, in _mat_mul
    transpose_b=transpose_b, name=name)
  File "C:\anthony-ide\Anaconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 768, in apply_op
    op_def=op_def)
  File "C:\anthony-ide\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 2336, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "C:\anthony-ide\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 1228, in __init__
    self._traceback = _extract_stack()

...which was originally created as op 'MatMul', defined at:
  File "C:\anthony-ide\Anaconda3\lib\runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
[elided 19 identical lines from previous traceback]
  File "<ipython-input-17-84afb0542030>", line 3, in <module>
    learning_rate=learning_rate)
  File "<ipython-input-15-64f5a1a535a5>", line 34, in __init__
    self.prediction, self.logits = build_output(outputs, lstm_size, num_classes)
  File "<ipython-input-12-c84da34d6b3e>", line 34, in build_output
    logits = tf.matmul(x,  softmax_w) + softmax_b
  File "C:\anthony-ide\Anaconda3\lib\site-packages\tensorflow\python\ops\math_ops.py", line 1801, in matmul
    a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name)
  File "C:\anthony-ide\Anaconda3\lib\site-packages\tensorflow\python\ops\gen_math_ops.py", line 1263, in _mat_mul
    transpose_b=transpose_b, name=name)
  File "C:\anthony-ide\Anaconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 768, in apply_op
    op_def=op_def)
  File "C:\anthony-ide\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 2336, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "C:\anthony-ide\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 1228, in __init__
    self._traceback = _extract_stack()

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[8192,512]
	 [[Node: gradients/MatMul_grad/MatMul = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=true, _device="/job:localhost/replica:0/task:0/gpu:0"](gradients/add_grad/Reshape, softmax/Variable/read)]]
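
Note: the run above stops with a ResourceExhaustedError, which means the GPU ran out of memory. The offending tensor has shape [8192, 512], i.e. roughly (batch_size * num_steps) rows by lstm_size columns for the softmax gradient, so the usual remedy is to shrink one of those dimensions (or the number of LSTM layers) and retrain. Below is a minimal sketch, assuming the CharRNN class defined earlier accepts the standard batch_size, num_steps, lstm_size, num_layers and learning_rate keyword arguments; the values are illustrative, not tuned.


In [ ]:
# Minimal sketch (assumed CharRNN keyword arguments): smaller hyperparameters
# so the softmax gradient tensor fits in GPU memory. Tune to your hardware.
batch_size = 32      # fewer sequences per batch
num_steps = 50       # fewer characters per sequence
lstm_size = 256      # smaller hidden state
num_layers = 2
learning_rate = 0.001

model = CharRNN(len(vocab), batch_size=batch_size, num_steps=num_steps,
                lstm_size=lstm_size, num_layers=num_layers,
                learning_rate=learning_rate)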

Saved checkpoints

Read up on saving and loading checkpoints here: https://www.tensorflow.org/programmers_guide/variables


In [ ]:
tf.train.get_checkpoint_state('checkpoints')
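
For reference, tf.train.get_checkpoint_state returns a CheckpointState protocol buffer describing the checkpoints in the directory (or None if nothing has been saved yet). A minimal sketch for inspecting it:


In [ ]:
ckpt_state = tf.train.get_checkpoint_state('checkpoints')
if ckpt_state is not None:
    # Path of the most recent checkpoint
    print(ckpt_state.model_checkpoint_path)
    # Paths of all checkpoints the Saver has kept
    print(ckpt_state.all_model_checkpoint_paths)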

Sampling

Now that the network is trained, we can use it to generate new text. The idea is that we pass in a character and the network predicts the next one. We then feed that predicted character back in to get the one after it, and keep repeating this to generate arbitrarily long passages of new text. I also included some functionality to prime the network with a string: each character of the prime text is fed in first so the network builds up a hidden state before sampling begins.

The network gives us a probability for every character in the vocabulary. To reduce noise and make the output a little less random, I'm only going to choose the new character from the top N most likely candidates.


In [ ]:
def pick_top_n(preds, vocab_size, top_n=5):
    # Flatten the (1, vocab_size) prediction array
    p = np.squeeze(preds)
    # Zero out everything except the top_n most likely characters
    p[np.argsort(p)[:-top_n]] = 0
    # Renormalize so the remaining probabilities sum to 1
    p = p / np.sum(p)
    # Sample a character index from the reduced distribution
    c = np.random.choice(vocab_size, 1, p=p)[0]
    return c
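
As a quick sanity check (a hypothetical toy distribution, not part of the original run), only the top_n most likely indices can ever be returned:


In [ ]:
# Toy prediction over a 10-character vocabulary (illustrative values).
toy_preds = np.array([[0.01, 0.02, 0.30, 0.05, 0.25, 0.02, 0.20, 0.05, 0.05, 0.05]])
# With top_n=3, only indices 2, 4 and 6 (the three largest) can be sampled.
print(pick_top_n(toy_preds, vocab_size=10, top_n=3))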

In [ ]:
def sample(checkpoint, n_samples, lstm_size, vocab_size, prime="The "):
    # Start the output with the prime text
    samples = [c for c in prime]
    # Build the model in sampling mode (batch size and step size of 1)
    model = CharRNN(vocab_size, lstm_size=lstm_size, sampling=True)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        # Load the trained weights from the checkpoint
        saver.restore(sess, checkpoint)
        new_state = sess.run(model.initial_state)
        # Prime the network: feed the prime string one character at a time
        # to build up the hidden state
        for c in prime:
            x = np.zeros((1, 1))
            x[0, 0] = vocab_to_int[c]
            feed = {model.inputs: x,
                    model.keep_prob: 1.,
                    model.initial_state: new_state}
            preds, new_state = sess.run([model.prediction, model.final_state],
                                        feed_dict=feed)

        # First generated character, sampled from the top-N predictions
        c = pick_top_n(preds, vocab_size)
        samples.append(int_to_vocab[c])

        # Keep feeding the last generated character back in
        for i in range(n_samples):
            x[0, 0] = c
            feed = {model.inputs: x,
                    model.keep_prob: 1.,
                    model.initial_state: new_state}
            preds, new_state = sess.run([model.prediction, model.final_state],
                                        feed_dict=feed)

            c = pick_top_n(preds, vocab_size)
            samples.append(int_to_vocab[c])

    return ''.join(samples)

Here, pass in the path to a checkpoint and sample from the network.


In [ ]:
tf.train.latest_checkpoint('checkpoints')

In [ ]:
checkpoint = tf.train.latest_checkpoint('checkpoints')
samp = sample(checkpoint, 2000, lstm_size, len(vocab), prime="Far")
print(samp)

In [ ]:
checkpoint = 'checkpoints/i200_l512.ckpt'
samp = sample(checkpoint, 1000, lstm_size, len(vocab), prime="Far")
print(samp)

In [ ]:
checkpoint = 'checkpoints/i600_l512.ckpt'
samp = sample(checkpoint, 1000, lstm_size, len(vocab), prime="Far")
print(samp)

In [ ]:
checkpoint = 'checkpoints/i1200_l512.ckpt'
samp = sample(checkpoint, 1000, lstm_size, len(vocab), prime="Far")
print(samp)