In [1]:
from __future__ import print_function
import os
import numpy as np
import random
import string
import tensorflow as tf
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve


url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size."""
  if not os.path.exists(filename):
    filename, _ = urlretrieve(url + filename, filename)
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified %s' % filename)
  else:
    print(statinfo.st_size)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename

filename = maybe_download('text8.zip', 31344016)

def read_data(filename):
  with zipfile.ZipFile(filename) as f:
    name = f.namelist()[0]
    data = tf.compat.as_str(f.read(name))
  return data

text = read_data(filename)
print('Data size %d' % len(text))


valid_size = 1000
valid_text = text[:valid_size]
train_text = text[valid_size:]
train_size = len(train_text)
print(train_size, train_text[:64])
print(valid_size, valid_text[:64])


vocabulary_size = len(string.ascii_lowercase) + 1 # [a-z] + ' '
first_letter = ord(string.ascii_lowercase[0])

def char2id(char):
  if char in string.ascii_lowercase:
    return ord(char) - first_letter + 1
  elif char == ' ':
    return 0
  else:
    print('Unexpected character: %s' % char)
    return 0

def id2char(dictid):
  if dictid > 0:
    return chr(dictid + first_letter - 1)
  else:
    return ' '

print(char2id('a'), char2id('z'), char2id(' '), char2id('ï'))
print(id2char(1), id2char(26), id2char(0))
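
# Quick round-trip check (a sketch added for illustration, not part of the
# original notebook): encode a short string with char2id and decode it back
# with id2char, confirming that 0 is the space and 1..26 are a..z.
demo_string = 'the quick brown fox'
demo_ids = [char2id(c) for c in demo_string]
print(demo_ids)
print(''.join(id2char(i) for i in demo_ids))  # prints the original string back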


batch_size = 64
num_unrollings = 10

class BatchGenerator(object):
  def __init__(self, text, batch_size, num_unrollings):
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    segment = self._text_size // batch_size
    self._cursor = [ offset * segment for offset in range(batch_size)]
    self._last_batch = self._next_batch()

  def _next_batch(self):
    """Generate a single batch from the current cursor position in the data."""
    batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=np.float32)
    for b in range(self._batch_size):
      batch[b, char2id(self._text[self._cursor[b]])] = 1.0
      self._cursor[b] = (self._cursor[b] + 1) % self._text_size
    return batch

  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.
    """
    batches = [self._last_batch]
    for step in range(self._num_unrollings):
      batches.append(self._next_batch())
    self._last_batch = batches[-1]
    return batches

def characters(probabilities):
  """Turn a 1-hot encoding or a probability distribution over the possible
  characters back into its (most likely) character representation."""
  return [id2char(c) for c in np.argmax(probabilities, 1)]

def batches2string(batches):
  """Convert a sequence of batches back into their (most likely) string
  representation."""
  s = [''] * batches[0].shape[0]
  for b in batches:
    s = [''.join(x) for x in zip(s, characters(b))]
  return s

train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = BatchGenerator(valid_text, 1, 1)

print(batches2string(train_batches.next()))
print(batches2string(train_batches.next()))
print(batches2string(valid_batches.next()))
print(batches2string(valid_batches.next()))
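
# A minimal shape check (a sketch added for illustration; demo_gen is made up
# here and not part of the original notebook). next() returns num_unrollings + 1
# one-hot batches: batch i is the input at step i and batch i + 1 is its label,
# i.e. the same positions one character later. A fresh generator over valid_text
# is used so the train/valid generators above are not advanced.
demo_gen = BatchGenerator(valid_text, 4, num_unrollings)
demo_batches = demo_gen.next()
print(len(demo_batches), demo_batches[0].shape)  # 11 (4, 27)
for inp, lab in zip(demo_batches[:-1], demo_batches[1:]):
  assert inp.shape == lab.shape == (4, vocabulary_size)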


def logprob(predictions, labels):
  """Log-probability of the true labels in a predicted batch."""
  predictions[predictions < 1e-10] = 1e-10
  return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]

def sample_distribution(distribution):
  """Sample one element from a distribution assumed to be an array of normalized
  probabilities.
  """
  r = random.uniform(0, 1)
  s = 0
  for i in range(len(distribution)):
    s += distribution[i]
    if s >= r:
      return i
  return len(distribution) - 1

def sample(prediction):
  """Turn a (column) prediction into 1-hot encoded samples."""
  p = np.zeros(shape=[1, vocabulary_size], dtype=np.float)
  p[0, sample_distribution(prediction[0])] = 1.0
  return p

def random_distribution():
  """Generate a random column of probabilities."""
  b = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
  return b/np.sum(b, 1)[:,None]
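
# A small numeric check (a sketch added for illustration): the perplexities
# reported during training are np.exp of the value returned by logprob. For a
# uniform prediction over the 27 characters, perplexity is ~27, the random
# guessing baseline for this vocabulary.
uniform_pred = np.full((5, vocabulary_size), 1.0 / vocabulary_size)
demo_labels = np.zeros((5, vocabulary_size))
demo_labels[np.arange(5), [1, 2, 3, 0, 26]] = 1.0  # arbitrary true characters
print(np.exp(logprob(uniform_pred, demo_labels)))  # ~27.0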


num_nodes = 64

graph = tf.Graph()
with graph.as_default():

  # Parameters:
  # Input gate: input, previous output, and bias.
  ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ib = tf.Variable(tf.zeros([1, num_nodes]))
  # Forget gate: input, previous output, and bias.
  fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  fb = tf.Variable(tf.zeros([1, num_nodes]))
  # Memory cell: input, previous output, and bias.
  cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  cb = tf.Variable(tf.zeros([1, num_nodes]))
  # Output gate: input, previous output, and bias.
  ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ob = tf.Variable(tf.zeros([1, num_nodes]))
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))

  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates."""
    input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
    forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
    update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
    state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
    return output_gate * tf.tanh(state), state
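
  # For reference, the equations implemented above (a standard LSTM without
  # peephole connections), with x_t the input, h_{t-1} the previous output and
  # c_{t-1} the previous cell state:
  #   i_t = sigmoid(x_t @ ix + h_{t-1} @ im + ib)              (input gate)
  #   f_t = sigmoid(x_t @ fx + h_{t-1} @ fm + fb)              (forget gate)
  #   c_t = f_t * c_{t-1} + i_t * tanh(x_t @ cx + h_{t-1} @ cm + cb)
  #   o_t = sigmoid(x_t @ ox + h_{t-1} @ om + ob)              (output gate)
  #   h_t = o_t * tanh(c_t)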

  # Input data.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    output, state = lstm_cell(i, output, state)
    outputs.append(output)

  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.concat(train_labels, 0), logits=logits))

  # Optimizer: SGD with gradient clipping. Gradients are rescaled so that their
  # global norm is at most 1.25, and the learning rate decays from 10.0 by a
  # factor of 10 every 5000 steps (staircase exponential decay).
  global_step = tf.Variable(0, trainable=False)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)

  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]), trainable=False)
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]), trainable=False)
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(
    sample_input, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))
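
  # Note (a hedged sketch for comparison, not part of the original notebook):
  # later TF 1.x releases ship a built-in cell implementing the same equations,
  # so the explicit per-gate matmuls above could in principle be replaced with
  #   cell = tf.nn.rnn_cell.BasicLSTMCell(num_nodes)
  #   rnn_outputs, final_state = tf.nn.static_rnn(cell, train_inputs, dtype=tf.float32)
  # The built-in cell owns its weight variables and keeps a (cell, hidden) state
  # tuple, so the saved_output / saved_state bookkeeping would change as well.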


num_steps = 7001
summary_frequency = 100

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[1:])
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          feed = sample(random_distribution())
          sentence = characters(feed)[0]
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters(feed)[0]
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next()
        predictions = sample_prediction.eval({sample_input: b[0]})
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))


Found and verified text8.zip
Data size 100000000
99999000 ons anarchists advocate social relations based upon voluntary as
1000  anarchism originated as a term of abuse first used against earl
Unexpected character: ï
1 26 0 0
a z  
['ons anarchi', 'when milita', 'lleria arch', ' abbeys and', 'married urr', 'hel and ric', 'y and litur', 'ay opened f', 'tion from t', 'migration t', 'new york ot', 'he boeing s', 'e listed wi', 'eber has pr', 'o be made t', 'yer who rec', 'ore signifi', 'a fierce cr', ' two six ei', 'aristotle s', 'ity can be ', ' and intrac', 'tion of the', 'dy to pass ', 'f certain d', 'at it will ', 'e convince ', 'ent told hi', 'ampaign and', 'rver side s', 'ious texts ', 'o capitaliz', 'a duplicate', 'gh ann es d', 'ine january', 'ross zero t', 'cal theorie', 'ast instanc', ' dimensiona', 'most holy m', 't s support', 'u is still ', 'e oscillati', 'o eight sub', 'of italy la', 's the tower', 'klahoma pre', 'erprise lin', 'ws becomes ', 'et in a naz', 'the fabian ', 'etchy to re', ' sharman ne', 'ised empero', 'ting in pol', 'd neo latin', 'th risky ri', 'encyclopedi', 'fense the a', 'duating fro', 'treet grid ', 'ations more', 'appeal of d', 'si have mad']
['ists advoca', 'ary governm', 'hes nationa', 'd monasteri', 'raca prince', 'chard baer ', 'rgical lang', 'for passeng', 'the nationa', 'took place ', 'ther well k', 'seven six s', 'ith a gloss', 'robably bee', 'to recogniz', 'ceived the ', 'icant than ', 'ritic of th', 'ight in sig', 's uncaused ', ' lost as in', 'cellular ic', 'e size of t', ' him a stic', 'drugs confu', ' take to co', ' the priest', 'im to name ', 'd barred at', 'standard fo', ' such as es', 'ze on the g', 'e of the or', 'd hiver one', 'y eight mar', 'the lead ch', 'es classica', 'ce the non ', 'al analysis', 'mormons bel', 't or at lea', ' disagreed ', 'ing system ', 'btypes base', 'anguages th', 'r commissio', 'ess one nin', 'nux suse li', ' the first ', 'zi concentr', ' society ne', 'elatively s', 'etworks sha', 'or hirohito', 'litical ini', 'n most of t', 'iskerdoo ri', 'ic overview', 'air compone', 'om acnm acc', ' centerline', 'e than any ', 'devotional ', 'de such dev']
[' a']
['an']
Initialized
Average loss at step 0: 3.298272 learning rate: 10.000000
Minibatch perplexity: 27.07
================================================================================
twza nmjewitawrtmjae n c y izmieni etipyhpbyqo uptr w qsqvnhirjae  foismmcmprolc
veevnm quap iw  ksoifjyz lqsccelmtctqmxhfgfoipznpo  iffu tobudqc gtrye tcsmdo  v
ivitiazwblsphkpjcchtbzfzf acnaxvapdh geep ckj  nnpumoztltrnt tdhk dziopliil baxy
ej it  njo t  ex lapk ayfvooceitp  e ekjabedgbxi dytn p ehocptic jghmbb ms frphq
bcpmyrjt  me a tgi prfhioam otjhinhk yap ibrlahawpsej cye trq bcet njtz  ecfeee 
================================================================================
Validation set perplexity: 20.16
Average loss at step 100: 2.599593 learning rate: 10.000000
Minibatch perplexity: 11.12
Validation set perplexity: 10.55
Average loss at step 200: 2.261282 learning rate: 10.000000
Minibatch perplexity: 8.73
Validation set perplexity: 8.51
Average loss at step 300: 2.101810 learning rate: 10.000000
Minibatch perplexity: 7.24
Validation set perplexity: 7.81
Average loss at step 400: 1.999247 learning rate: 10.000000
Minibatch perplexity: 7.61
Validation set perplexity: 7.46
Average loss at step 500: 1.928977 learning rate: 10.000000
Minibatch perplexity: 6.44
Validation set perplexity: 6.76
Average loss at step 600: 1.902124 learning rate: 10.000000
Minibatch perplexity: 6.17
Validation set perplexity: 6.53
Average loss at step 700: 1.851544 learning rate: 10.000000
Minibatch perplexity: 6.46
Validation set perplexity: 6.21
Average loss at step 800: 1.812462 learning rate: 10.000000
Minibatch perplexity: 6.01
Validation set perplexity: 6.06
Average loss at step 900: 1.825770 learning rate: 10.000000
Minibatch perplexity: 6.87
Validation set perplexity: 6.04
Average loss at step 1000: 1.821244 learning rate: 10.000000
Minibatch perplexity: 5.66
================================================================================
melions of very from greed by tenmale in one nine one nine file risk the surcime
fous antuentile fre vively the with thele abt deqail vecollize it mos nate maja 
hervarly it was kreage by ripteal ten arnof comedien conne the heagn velo lite t
le shunes in contraly to rote and ise neture fehtures of a lecoquer the vlictari
x echalso of jeduted counter contreentaylol illodo in histuzustic of pertrictes 
================================================================================
Validation set perplexity: 5.76
Average loss at step 1100: 1.772919 learning rate: 10.000000
Minibatch perplexity: 5.45
Validation set perplexity: 5.76
Average loss at step 1200: 1.747313 learning rate: 10.000000
Minibatch perplexity: 5.06
Validation set perplexity: 5.49
Average loss at step 1300: 1.730694 learning rate: 10.000000
Minibatch perplexity: 5.59
Validation set perplexity: 5.60
Average loss at step 1400: 1.741744 learning rate: 10.000000
Minibatch perplexity: 5.76
Validation set perplexity: 5.36
Average loss at step 1500: 1.734485 learning rate: 10.000000
Minibatch perplexity: 4.84
Validation set perplexity: 5.39
Average loss at step 1600: 1.745835 learning rate: 10.000000
Minibatch perplexity: 5.60
Validation set perplexity: 5.41
Average loss at step 1700: 1.713409 learning rate: 10.000000
Minibatch perplexity: 5.63
Validation set perplexity: 5.39
Average loss at step 1800: 1.671108 learning rate: 10.000000
Minibatch perplexity: 5.47
Validation set perplexity: 5.15
Average loss at step 1900: 1.643473 learning rate: 10.000000
Minibatch perplexity: 5.08
Validation set perplexity: 5.09
Average loss at step 2000: 1.695999 learning rate: 10.000000
Minibatch perplexity: 5.69
================================================================================
ques and dissingiced conkectailion orygicod in english provics bock uning imsent
alding the incloddain mackimed roogh in is tht oftron of the five misiting under
el plaunt vieworge as rapunts repress workfil maduens interphe termsisian and no
s linarceable the in mann the fourded it the caronism pane one six vids one say 
ge there gean remultian tarla inthis elecound five hamn gradim was enrived four 
================================================================================
Validation set perplexity: 5.14
Average loss at step 2100: 1.681050 learning rate: 10.000000
Minibatch perplexity: 5.13
Validation set perplexity: 4.89
Average loss at step 2200: 1.682004 learning rate: 10.000000
Minibatch perplexity: 6.53
Validation set perplexity: 4.92
Average loss at step 2300: 1.637763 learning rate: 10.000000
Minibatch perplexity: 4.97
Validation set perplexity: 4.81
Average loss at step 2400: 1.658001 learning rate: 10.000000
Minibatch perplexity: 5.02
Validation set perplexity: 4.82
Average loss at step 2500: 1.674849 learning rate: 10.000000
Minibatch perplexity: 5.15
Validation set perplexity: 4.68
Average loss at step 2600: 1.651183 learning rate: 10.000000
Minibatch perplexity: 5.64
Validation set perplexity: 4.69
Average loss at step 2700: 1.654898 learning rate: 10.000000
Minibatch perplexity: 4.59
Validation set perplexity: 4.75
Average loss at step 2800: 1.646599 learning rate: 10.000000
Minibatch perplexity: 5.52
Validation set perplexity: 4.62
Average loss at step 2900: 1.646508 learning rate: 10.000000
Minibatch perplexity: 5.77
Validation set perplexity: 4.82
Average loss at step 3000: 1.648585 learning rate: 10.000000
Minibatch perplexity: 5.06
================================================================================
zer reforms and ala decessed films notiso propedian and their which and oniler e
menbe ordinitre is most issimeties its to marriated to new who canfalteinual is 
 balach fustan resentle scosce popyinish transtitle rocai of the much howeng the
ni in a throbic the purtive minear r freer secturt ark acrise corrome occussion 
wikes he hakmandaus and be in one neipends and about movonly cult p sachion offi
================================================================================
Validation set perplexity: 4.70
Average loss at step 3100: 1.627999 learning rate: 10.000000
Minibatch perplexity: 5.74
Validation set perplexity: 4.54
Average loss at step 3200: 1.645780 learning rate: 10.000000
Minibatch perplexity: 5.55
Validation set perplexity: 4.69
Average loss at step 3300: 1.638656 learning rate: 10.000000
Minibatch perplexity: 5.11
Validation set perplexity: 4.48
Average loss at step 3400: 1.664635 learning rate: 10.000000
Minibatch perplexity: 5.55
Validation set perplexity: 4.53
Average loss at step 3500: 1.652085 learning rate: 10.000000
Minibatch perplexity: 5.46
Validation set perplexity: 4.67
Average loss at step 3600: 1.665040 learning rate: 10.000000
Minibatch perplexity: 4.55
Validation set perplexity: 4.51
Average loss at step 3700: 1.643515 learning rate: 10.000000
Minibatch perplexity: 5.07
Validation set perplexity: 4.52
Average loss at step 3800: 1.639218 learning rate: 10.000000
Minibatch perplexity: 5.69
Validation set perplexity: 4.56
Average loss at step 3900: 1.633929 learning rate: 10.000000
Minibatch perplexity: 5.30
Validation set perplexity: 4.62
Average loss at step 4000: 1.650588 learning rate: 10.000000
Minibatch perplexity: 4.71
================================================================================
x powezkated inhaturatiskv to there ran one two zero zero one one six one nine s
y is his westallan japban wher of the virbity the overlative logations his drama
as of batics remiker in zerogzogiki istructive mode eviqution which reparce the 
dai fasse wibizations methins in the undia the bas beulliption two five septal t
kenllands phicts years jagance colvelly adgets frommonts suk and us buken with g
================================================================================
Validation set perplexity: 4.60
Average loss at step 4100: 1.629838 learning rate: 10.000000
Minibatch perplexity: 5.35
Validation set perplexity: 4.59
Average loss at step 4200: 1.634614 learning rate: 10.000000
Minibatch perplexity: 5.07
Validation set perplexity: 4.48
Average loss at step 4300: 1.614128 learning rate: 10.000000
Minibatch perplexity: 5.05
Validation set perplexity: 4.53
Average loss at step 4400: 1.611128 learning rate: 10.000000
Minibatch perplexity: 5.01
Validation set perplexity: 4.41
Average loss at step 4500: 1.618393 learning rate: 10.000000
Minibatch perplexity: 5.14
Validation set perplexity: 4.53
Average loss at step 4600: 1.617417 learning rate: 10.000000
Minibatch perplexity: 4.88
Validation set perplexity: 4.58
Average loss at step 4700: 1.626686 learning rate: 10.000000
Minibatch perplexity: 5.31
Validation set perplexity: 4.46
Average loss at step 4800: 1.632845 learning rate: 10.000000
Minibatch perplexity: 4.45
Validation set perplexity: 4.59
Average loss at step 4900: 1.628497 learning rate: 10.000000
Minibatch perplexity: 5.15
Validation set perplexity: 4.68
Average loss at step 5000: 1.602976 learning rate: 1.000000
Minibatch perplexity: 4.38
================================================================================
tary viytra is kanup by wores serbe alloweraly can the stele see eight supers st
zer unter were or fort belift most hadswomes discuption one nine f which deal th
be docker was or elected ropp i salk deviciol on invivis to den shulon commain o
urg irnains crassiny and a imate to because or dange terrins for the friyues to 
weal american deppered discointain depically be women on mine the imert his the 
================================================================================
Validation set perplexity: 4.74
Average loss at step 5100: 1.604320 learning rate: 1.000000
Minibatch perplexity: 4.79
Validation set perplexity: 4.50
Average loss at step 5200: 1.588789 learning rate: 1.000000
Minibatch perplexity: 4.61
Validation set perplexity: 4.42
Average loss at step 5300: 1.573977 learning rate: 1.000000
Minibatch perplexity: 4.60
Validation set perplexity: 4.40
Average loss at step 5400: 1.575741 learning rate: 1.000000
Minibatch perplexity: 5.28
Validation set perplexity: 4.39
Average loss at step 5500: 1.559429 learning rate: 1.000000
Minibatch perplexity: 4.70
Validation set perplexity: 4.33
Average loss at step 5600: 1.575676 learning rate: 1.000000
Minibatch perplexity: 4.90
Validation set perplexity: 4.33
Average loss at step 5700: 1.563475 learning rate: 1.000000
Minibatch perplexity: 4.48
Validation set perplexity: 4.32
Average loss at step 5800: 1.577050 learning rate: 1.000000
Minibatch perplexity: 4.86
Validation set perplexity: 4.30
Average loss at step 5900: 1.570550 learning rate: 1.000000
Minibatch perplexity: 5.07
Validation set perplexity: 4.30
Average loss at step 6000: 1.546433 learning rate: 1.000000
Minibatch perplexity: 4.96
================================================================================
portainated ravi and with through government that was two eight siectenc fatt if
jases and the theodings oing in the one nine three to the trixts two four since 
conted depetraline womow one two one zero offigilm be callenting the extext weak
xaed poxe writton a tall metamenced also instanlany such and forvers and oftinit
x samed his routh advienside took cornish b one four zero zero zero zero dose pe
================================================================================
Validation set perplexity: 4.29
Average loss at step 6100: 1.561474 learning rate: 1.000000
Minibatch perplexity: 5.03
Validation set perplexity: 4.27
Average loss at step 6200: 1.533745 learning rate: 1.000000
Minibatch perplexity: 4.96
Validation set perplexity: 4.27
Average loss at step 6300: 1.545656 learning rate: 1.000000
Minibatch perplexity: 5.13
Validation set perplexity: 4.25
Average loss at step 6400: 1.538726 learning rate: 1.000000
Minibatch perplexity: 4.46
Validation set perplexity: 4.23
Average loss at step 6500: 1.554911 learning rate: 1.000000
Minibatch perplexity: 4.73
Validation set perplexity: 4.25
Average loss at step 6600: 1.591178 learning rate: 1.000000
Minibatch perplexity: 4.72
Validation set perplexity: 4.24
Average loss at step 6700: 1.578251 learning rate: 1.000000
Minibatch perplexity: 5.06
Validation set perplexity: 4.27
Average loss at step 6800: 1.603085 learning rate: 1.000000
Minibatch perplexity: 4.80
Validation set perplexity: 4.27
Average loss at step 6900: 1.577870 learning rate: 1.000000
Minibatch perplexity: 4.66
Validation set perplexity: 4.26
Average loss at step 7000: 1.571186 learning rate: 1.000000
Minibatch perplexity: 5.18
================================================================================
ple of criers or this general secreth to three divided curage crick duregnisly o
doest internation of is peried of the umal resultsters ksiss ffacular abount up 
pherly len metiling in the calling from concessalism in efted are castilleturnin
h some used and three arbood action he french cituse maininge ethered by aslapho
x twte exampued in invensate on the following the agripwwed than one nine six si
================================================================================
Validation set perplexity: 4.24