This notebook demonstrates using Cloud TPUs to build a language model: a model that predicts the next character of text given the text so far. Once our model has been trained we can sample from it to generate new text that looks like the text it was trained on. In this case we're going to train our network using the combined works of Shakespeare, creating a play generating robot.
Our network outputs something Shakespeare-esque:
Loves that led me no dumbs lack her Berjoy's face with her to-day. The spirits roar'd; which shames which within his powers Which tied up remedies lending with occasion, A loud and Lancaster, stabb'd in me Upon my sword for ever: 'Agripo'er, his days let me free. Stop it of that word, be so: at Lear, When I did profess the hour-stranger for my life, When I did sink to be cried how for aught; Some beds which seeks chaste senses prove burning; But he perforces seen in her eyes so fast; And _
Let's get started on generating our own Shakespeare! We'll start off with our data generator. The training data for our model will be snippets from our text file: each target snippet is the corresponding source snippet offset by one character.
In [0]:
# !rm /content/adc.json
In [0]:
import json
import os
import pprint
import re
import time
import tensorflow as tf
# Colab form fields: whether to train on a TPU, and the GCS bucket that will
# hold the model directory.
use_tpu = True #@param {type:"boolean"}
bucket = '' #@param {type:"string"}

assert bucket, 'Must specify an existing GCS bucket name'
print('Using bucket: {}'.format(bucket))

# The TPU worker address is published through this environment variable;
# it is None when no TPU is attached to the runtime.
tpu_address = os.environ.get('COLAB_TPU_ADDR')
if use_tpu:
    assert tpu_address is not None, (
        'Missing TPU; did you request a TPU in Notebook Settings?')

# Every run gets its own timestamped model directory inside the bucket.
run_subdir = time.strftime('tpuestimator-lstm/%Y-%m-%d-%H-%M-%S')
MODEL_DIR = 'gs://{}/{}'.format(bucket, run_subdir)
print('Using model dir: {}'.format(MODEL_DIR))

from google.colab import auth
auth.authenticate_user()

if tpu_address is None:
    # No TPU attached: an empty master string is used for the sessions and
    # RunConfig below.
    TF_MASTER = ''
else:
    TF_MASTER = 'grpc://{}'.format(tpu_address)

    # Upload credentials to TPU.
    # NOTE(review): /content/adc.json is presumably written by
    # auth.authenticate_user() above -- confirm.
    with tf.Session(TF_MASTER) as sess:
        with open('/content/adc.json', 'r') as f:
            auth_info = json.load(f)
        tf.contrib.cloud.configure_gcs(sess, credentials=auth_info)
    # Now credentials are set for all future sessions on this TPU.

# Show the devices reachable through the chosen master.
with tf.Session(TF_MASTER) as session:
    pprint.pprint(session.list_devices())
We can use a `tf.data` pipeline to feed input data to our Estimator. In this case, we want our model to predict the next character, so we will feed sequences from our dataset where the source is offset from the target by 1 character.
Note that we use `tf.contrib.data.enumerate_dataset()` and `tf.contrib.stateless.stateless_random_uniform` to generate deterministic uniform samples. This, combined with the setting of `RunConfig.tf_random_seed`, guarantees that every run of the model will have the exact same behavior.
In [0]:
import numpy as np
# Fetch the corpus to local disk (IPython shell escape, not Python).
!wget --show-progress --continue -O /content/shakespeare.txt http://www.gutenberg.org/files/100/100-0.txt
# Local path of the downloaded corpus; handed to input_fn as
# params['source_file'].
SHAKESPEARE_TXT = '/content/shakespeare.txt'
# Seed shared by the input sampling in input_fn and RunConfig.tf_random_seed.
RANDOM_SEED = 42 # An arbitrary choice.
def transform(txt):
    """Encode `txt` as a 1-D int32 array holding one code point per character."""
    code_points = (ord(ch) for ch in txt)
    return np.fromiter(code_points, dtype=np.int32)
def input_fn(params):
    """Build the dataset of source/target character windows.

    Args:
      params: dict providing 'batch_size', 'seq_len' and 'source_file'
        (path of the text corpus).

    Returns:
      A repeating tf.data.Dataset whose elements are dicts with 'source' and
      'target' entries, each batched to [batch_size, seq_len]; 'target' is
      the 'source' window shifted one character to the right.
    """
    batch_size = params['batch_size']
    print('Batch size: {}'.format(batch_size))
    seq_len = params['seq_len']

    with tf.gfile.GFile(params['source_file'], 'r') as corpus_file:
        raw_text = corpus_file.read()

    # Keep only 7-bit ASCII characters.
    txt = ''.join(ch for ch in raw_text if ord(ch) < 128)
    tf.logging.info('Sample text: %s', txt[10000:10100])

    encoded = tf.constant(transform(txt), dtype=tf.int32)

    # Windows must start early enough that the shifted target window still
    # fits inside the corpus.
    max_start_offset = len(txt) - seq_len

    def _pick_window(step, corpus):
        # Seeding with the element counter makes the start position a
        # deterministic function of (RANDOM_SEED, step).
        fraction = tf.contrib.stateless.stateless_random_uniform(
            [1], seed=[RANDOM_SEED, step], dtype=tf.float32)[0]
        idx = tf.cast(fraction * max_start_offset, tf.int32)
        print(idx)
        return {
            'source': tf.reshape(corpus[idx:idx + seq_len], [seq_len]),
            'target': tf.reshape(corpus[idx + 1:idx + seq_len + 1], [seq_len])
        }

    # The whole corpus is a single element, repeated forever and numbered so
    # that each repetition samples a different window.
    ds = (tf.data.Dataset.from_tensors(encoded)
          .repeat()
          .apply(tf.contrib.data.enumerate_dataset())
          .map(_pick_window)
          .batch(batch_size, drop_remainder=True)
          .prefetch(2))
    return ds
# Smoke-test the input pipeline on a fresh graph: pull one tiny batch and
# print its source and target arrays.
tf.reset_default_graph()
tf.set_random_seed(0)

with tf.Session() as session:
    smoke_params = {
        'batch_size': 1,
        'seq_len': 10,
        'source_file': SHAKESPEARE_TXT,
    }
    ds = input_fn(smoke_params)
    next_batch = ds.make_one_shot_iterator().get_next()
    features = session.run(next_batch)
    for key in ('source', 'target'):
        print(features[key])
Now that we have some data, we can define our model. We use a simple 2 layer, forward LSTM for this notebook.
The only change to our model versus a CPU/GPU model is that we specify a static shape for the input of our model. This allows TF to infer the shape of the model and satisfy the XLA compiler's static shape requirement.
In [0]:
# Width of the character embedding; also used as the unit count of every
# LSTM layer.
EMBEDDING_DIM = 1024


def _lstm(inputs, batch_size, initial_state=None):
    """Run `inputs` through a stack of two LSTM layers.

    Args:
      inputs: input tensor forwarded to functional_rnn.
      batch_size: batch size used to build the zero state when no
        `initial_state` is supplied.
      initial_state: optional state to start from; defaults to all zeros.

    Returns:
      The (outputs, final_state) pair produced by functional_rnn.
    """
    def _make_cell(layer_idx):
        with tf.variable_scope('lstm/%d' % layer_idx,):
            return tf.nn.rnn_cell.LSTMCell(
                num_units=EMBEDDING_DIM,
                state_is_tuple=True,
                reuse=tf.AUTO_REUSE,
            )

    layers = [_make_cell(layer_idx) for layer_idx in range(2)]
    stacked_cell = tf.nn.rnn_cell.MultiRNNCell(layers)

    start_state = initial_state
    if start_state is None:
        start_state = stacked_cell.zero_state(batch_size, tf.float32)

    outputs, final_state = tf.contrib.recurrent.functional_rnn(
        stacked_cell, inputs, initial_state=start_state, use_tpu=use_tpu)
    return outputs, final_state
def lstm_model(seq, initial_state=None):
    """Embed character ids, run the LSTM stack, and project to logits.

    Args:
      seq: integer tensor of character ids; its first two static dimensions
        are read as batch size and sequence length.
      initial_state: optional LSTM state to resume from.

    Returns:
      (logits, lstm_state): logits reshaped to [-1, seq_len, 256] and the
      final LSTM state.
    """
    vocab_size = 256

    with tf.variable_scope('lstm',
                           initializer=tf.orthogonal_initializer,
                           reuse=tf.AUTO_REUSE):
        batch_size, seq_len = seq.shape[0], seq.shape[1]

        char_table = tf.get_variable(
            'char_embedding',
            initializer=tf.orthogonal_initializer(seed=0),
            shape=(vocab_size, EMBEDDING_DIM), dtype=tf.float32)
        embedded = tf.nn.embedding_lookup(char_table, seq)

        rnn_output, rnn_state = _lstm(
            embedded, batch_size, initial_state=initial_state)

        # The dense projection works on 2-D input, so collapse the leading
        # dimensions, project to per-character logits, then restore the
        # sequence dimension.
        flat_output = tf.reshape(rnn_output, [-1, EMBEDDING_DIM])
        flat_logits = tf.layers.dense(flat_output, vocab_size, name='logits',)
        logits = tf.reshape(flat_logits, [-1, seq_len, vocab_size])
        return logits, rnn_state
Since we're using TPUEstimator, we need to provide what's called a model function to train our model. This specifies how to train, evaluate and run inference (predictions) on our model.
Let's cover each part in turn. We'll first look at the training step.
We use an optimizer (`AdamOptimizer` in the code below) to optimize our network, wrapped in `CrossShardOptimizer`, which lets us use multiple TPU cores to train. Finally we return a `TPUEstimatorSpec` indicating how TPUEstimator should train our model.
In [0]:
def train_fn(source, target):
    """Build the TRAIN-mode TPUEstimatorSpec.

    Args:
      source: integer tensor of input character ids.
      target: integer tensor of next-character labels for `source`.

    Returns:
      A TPUEstimatorSpec carrying the mean cross-entropy loss and the op
      that applies one Adam update.
    """
    # Training restarts from a zero state on every batch, so the final LSTM
    # state is not needed here.
    logits, _ = lstm_model(source)
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=target, logits=logits))

    optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
    # Gate the cross-shard wrapper on the same `use_tpu` flag that _lstm and
    # the training estimator use. Keying on TF_MASTER would also wrap the
    # optimizer when a TPU is attached but use_tpu is False.
    if use_tpu:
        optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

    train_op = optimizer.minimize(loss, tf.train.get_global_step())
    return tf.contrib.tpu.TPUEstimatorSpec(
        mode=tf.estimator.ModeKeys.TRAIN,
        loss=loss,
        train_op=train_op,
    )
In [0]:
def eval_fn(source, target):
    """Build the EVAL-mode TPUEstimatorSpec.

    Args:
      source: integer tensor of input character ids.
      target: integer tensor of next-character labels for `source`.

    Returns:
      A TPUEstimatorSpec with the mean cross-entropy loss and recall@1 /
      recall@5 metrics.
    """
    logits, _ = lstm_model(source)

    # The spec below needs a loss tensor; compute it the same way train_fn
    # does. (It was previously referenced without being defined.)
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=target, logits=logits))

    def metric_fn(labels, logits):
        # The labels arrive as int32 and are cast to int64 for recall_at_k.
        labels = tf.cast(labels, tf.int64)
        return {
            'recall@1': tf.metrics.recall_at_k(labels, logits, 1),
            'recall@5': tf.metrics.recall_at_k(labels, logits, 5)
        }

    eval_metrics = (metric_fn, [target, logits])
    return tf.contrib.tpu.TPUEstimatorSpec(
        mode=tf.estimator.ModeKeys.EVAL,
        loss=loss,
        eval_metrics=eval_metrics)
In [0]:
def predict_fn(source, predict_len=500):
    """Build the PREDICT-mode TPUEstimatorSpec that samples new characters.

    The model is first run over the seed `source`; characters are then
    sampled one at a time, feeding each sample and the LSTM state back in.

    Args:
      source: integer tensor of seed character ids; its first static
        dimension is used as the batch size.
      predict_len: number of characters to generate per batch element.
        Must be a Python int >= 1. Defaults to 500.

    Returns:
      A TPUEstimatorSpec whose 'predictions' entry has shape
      [batch_size, predict_len].

    Raises:
      ValueError: if `predict_len` is smaller than 1.
    """
    if predict_len < 1:
        raise ValueError(
            'predict_len must be >= 1, got {}'.format(predict_len))

    # Seed the model with our initial array
    batch_size = source.shape[0]
    logits, lstm_state = lstm_model(source)

    def _body(i, state, preds):
        """Body of our prediction loop: predict the next character."""
        cur_preds = preds.read(i)
        next_logits, next_state = lstm_model(
            tf.cast(tf.expand_dims(cur_preds, -1), tf.int32), state)

        # pull out the last (and only) prediction.
        next_logits = next_logits[:, -1]
        next_pred = tf.multinomial(
            next_logits, num_samples=1, output_dtype=tf.int32)[:, 0]
        preds = preds.write(i + 1, next_pred)
        return (i + 1, next_state, preds)

    def _cond(i, state, preds):
        del state
        del preds

        # Loop until `predict_len - 1`: preds[0] is the initial state and we
        # write to `i + 1` on each iteration.
        return tf.less(i, predict_len - 1)

    # Sample the first generated character from the seed's final logits.
    next_pred = tf.multinomial(
        logits[:, -1], num_samples=1, output_dtype=tf.int32)[:, 0]

    i = tf.constant(0, dtype=tf.int32)

    # compute predictions as [seq_len, batch_size] to simplify indexing/updates
    pred_var = tf.TensorArray(
        dtype=tf.int32,
        size=predict_len,
        dynamic_size=False,
        clear_after_read=False,
        element_shape=(batch_size,),
        name='prediction_accumulator',
    )

    pred_var = pred_var.write(0, next_pred)
    _, _, final_predictions = tf.while_loop(_cond, _body,
                                            [i, lstm_state, pred_var])

    # Stack to [predict_len, batch_size], then transpose back to
    # [batch_size, predict_len]; the reshape pins the static shape.
    final_predictions = final_predictions.stack()
    final_predictions = tf.transpose(final_predictions, [1, 0])
    final_predictions = tf.reshape(final_predictions, (batch_size, predict_len))

    return tf.contrib.tpu.TPUEstimatorSpec(
        mode=tf.estimator.ModeKeys.PREDICT,
        predictions={'predictions': final_predictions})
In [0]:
def model_fn(features, labels, mode, params):
    """Dispatch to the train, eval or predict spec builder for `mode`.

    Args:
      features: dict with a 'source' entry and, for TRAIN/EVAL, a 'target'
        entry.
      labels: unused; the targets travel inside `features`.
      mode: one of the tf.estimator.ModeKeys values.
      params: unused.

    Returns:
      The TPUEstimatorSpec for `mode`, or None for an unrecognised mode.
    """
    mode_keys = tf.estimator.ModeKeys
    if mode == mode_keys.PREDICT:
        return predict_fn(features['source'])
    if mode in (mode_keys.TRAIN, mode_keys.EVAL):
        build_spec = train_fn if mode == mode_keys.TRAIN else eval_fn
        return build_spec(features['source'], features['target'])
    return None
In [0]:
def _make_estimator(num_shards, use_tpu=True):
    """Create a TPUEstimator for the LSTM model.

    Args:
      num_shards: number of shards passed to TPUConfig.
      use_tpu: forwarded to TPUEstimator's `use_tpu` argument.

    Returns:
      A tf.contrib.tpu.TPUEstimator that reads and writes checkpoints under
      MODEL_DIR.
    """
    tpu_settings = tf.contrib.tpu.TPUConfig(
        num_shards=num_shards, iterations_per_loop=100)
    run_config = tf.contrib.tpu.RunConfig(
        tf_random_seed=RANDOM_SEED,
        master=TF_MASTER,
        model_dir=MODEL_DIR,
        save_checkpoints_steps=5000,
        tpu_config=tpu_settings)

    # Forwarded to input_fn together with the batch size that TPUEstimator
    # adds.
    shared_params = {'seq_len': 100, 'source_file': SHAKESPEARE_TXT}

    return tf.contrib.tpu.TPUEstimator(
        use_tpu=use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=1024,
        eval_batch_size=1024,
        predict_batch_size=128,
        params=shared_params,
    )
# Train with the work split across 8 shards (one per TPU core), stopping once
# the global step reaches 2000.
estimator = _make_estimator(num_shards=8, use_tpu=use_tpu)
estimator.train(input_fn=input_fn, max_steps=2000)
Out[0]:
In [0]:
def _seed_input_fn(params):
    """Return a one-element dataset holding the seed text for prediction.

    Args:
      params: ignored.

    Returns:
      A tf.data.Dataset with a single {'source': seed} element, where seed
      is a [1, len(seed_txt)] int32 tensor of character codes.
    """
    del params
    seed_txt = 'Looks it not like the king?'
    seed_codes = transform(seed_txt).reshape([1, -1])
    seed_tensor = tf.constant(seed_codes, dtype=tf.int32)
    # Predict must return a Dataset, not a Tensor.
    return tf.data.Dataset.from_tensors({'source': seed_tensor})
# A single shard is enough for prediction: only one seed sequence is fed in.
estimator = _make_estimator(num_shards=1, use_tpu=False)

# predict() yields one result per example; take the first and decode its
# character codes back into text.
first_result = next(estimator.predict(input_fn=_seed_input_fn))
idx = first_result['predictions']
print(''.join(map(chr, idx)))
In [0]: