In [0]:
#@title Licensed under the Apache License, Version 2.0 (the "License"); { display-mode: "form" }
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
This example uses tf.keras to build a language model and train it on a Google Cloud TPU. The language model predicts the next character of text given the text so far. Once trained, the model can generate new snippets of text that read in a style similar to the training data.
We'll train the model on the combined works of William Shakespeare, then use it to compose a play in the style of The Great Bard:
Loves that led me no dumbs lack her Berjoy's face with her to-day. The spirits roar'd; which shames which within his powers Which tied up remedies lending with occasion, A loud and Lancaster, stabb'd in me Upon my sword for ever: 'Agripo'er, his days let me free. Stop it of that word, be so: at Lear, When I did profess the hour-stranger for my life, When I did sink to be cried how for aught; Some beds which seeks chaste senses prove burning; But he perforces seen in her eyes so fast; And _
Note: To enable TPUs on Google Colab, select Runtime > Change runtime type, and set Hardware acceleration to TPU.
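Before running anything, you can confirm the runtime actually has a TPU attached (an optional sanity check; it relies on the same COLAB_TPU_ADDR environment variable the code below uses):

import os
assert 'COLAB_TPU_ADDR' in os.environ, 'Did you set the runtime type to TPU?'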
Download The Complete Works of William Shakespeare as a single text file from Project Gutenberg. We'll use snippets from this file as the training data for the model. The target snippet is offset by one character from the source snippet.
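To make the one-character offset concrete, here is a tiny illustration (expository only, with made-up values; the real snippets are cut by training_generator below):

text = 'Shakespeare'
seq_len = 5
source = text[0:seq_len]       # 'Shake' -- what the model sees
target = text[1:seq_len + 1]   # 'hakes' -- what it should predict, one step later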
In [0]:
!wget --show-progress --continue -O /content/shakespeare.txt http://www.gutenberg.org/files/100/100-0.txt
In [0]:
import numpy as np
import six
import tensorflow as tf
import time
import os
# This address identifies the TPU we'll use when configuring TensorFlow.
TPU_WORKER = 'grpc://' + os.environ['COLAB_TPU_ADDR']
SHAKESPEARE_TXT = '/content/shakespeare.txt'
tf.logging.set_verbosity(tf.logging.INFO)
def transform(txt, pad_to=None):
  # Drop any non-ASCII characters.
  output = np.asarray([ord(c) for c in txt if ord(c) < 255], dtype=np.int32)
  if pad_to is not None:
    output = output[:pad_to]
    output = np.concatenate([
        # Pad relative to the filtered output; using len(txt) here would
        # mis-size the padding whenever non-ASCII characters were dropped.
        np.zeros([pad_to - len(output)], dtype=np.int32),
        output,
    ])
  return output
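# For example (illustrative values, not recorded notebook output):
#   transform('Hi!')           -> array([ 72, 105,  33], dtype=int32)
#   transform('Hi!', pad_to=5) -> array([  0,   0,  72, 105,  33], dtype=int32)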
def training_generator(seq_len=100, batch_size=1024):
  """A generator that yields (source, target) arrays for training."""
  with tf.gfile.GFile(SHAKESPEARE_TXT, 'r') as f:
    txt = f.read()

  tf.logging.info('Input text [%d] %s', len(txt), txt[:50])
  source = transform(txt)
  while True:
    offsets = np.random.randint(0, len(source) - seq_len, batch_size)

    # Our model uses sparse cross-entropy loss, but Keras requires labels
    # to have the same rank as the input logits. We add an empty final
    # dimension to account for this.
    yield (
        np.stack([source[idx:idx + seq_len] for idx in offsets]),
        np.expand_dims(
            np.stack([source[idx + 1:idx + seq_len + 1] for idx in offsets]),
            -1),
    )
six.next(training_generator(seq_len=10, batch_size=1))
Out[0]:
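Each item from the generator is a (source, target) pair of NumPy arrays. As a quick sanity check (a sketch of the expected shapes, not output recorded in the notebook), for seq_len=10 and batch_size=1:

source, target = six.next(training_generator(seq_len=10, batch_size=1))
print(source.shape)  # (1, 10): a batch of one 10-character snippet
print(target.shape)  # (1, 10, 1): the offset snippet, with the extra final dimension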
The model is defined as a two-layer, forward-LSTM, with two changes from the standard tf.keras LSTM definition:

1. Define the input shape of the model to satisfy the XLA compiler's static shape requirement.
2. Use tf.train.Optimizer instead of a standard Keras optimizer (Keras optimizer support is still experimental).
In [0]:
EMBEDDING_DIM = 512
def lstm_model(seq_len=100, batch_size=None, stateful=True):
  """Language model: predict the next character given the current character."""
  source = tf.keras.Input(
      name='seed', shape=(seq_len,), batch_size=batch_size, dtype=tf.int32)

  embedding = tf.keras.layers.Embedding(
      input_dim=256, output_dim=EMBEDDING_DIM)(source)
  lstm_1 = tf.keras.layers.LSTM(
      EMBEDDING_DIM, stateful=stateful, return_sequences=True)(embedding)
  lstm_2 = tf.keras.layers.LSTM(
      EMBEDDING_DIM, stateful=stateful, return_sequences=True)(lstm_1)
  predicted_char = tf.keras.layers.TimeDistributed(
      tf.keras.layers.Dense(256, activation='softmax'))(lstm_2)

  model = tf.keras.Model(inputs=[source], outputs=[predicted_char])
  model.compile(
      optimizer=tf.train.RMSPropOptimizer(learning_rate=0.01),
      loss='sparse_categorical_crossentropy',
      metrics=['sparse_categorical_accuracy'])
  return model
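Before wiring the model up to a TPU, it can help to inspect the layer shapes (an optional check; summary() just prints the layer-by-layer output shapes and parameter counts):

lstm_model(seq_len=100, batch_size=128, stateful=False).summary()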
In [0]:
tf.keras.backend.clear_session()

training_model = lstm_model(seq_len=100, batch_size=128, stateful=False)

tpu_model = tf.contrib.tpu.keras_to_tpu_model(
    training_model,
    strategy=tf.contrib.tpu.TPUDistributionStrategy(
        tf.contrib.cluster_resolver.TPUClusterResolver(TPU_WORKER)))

tpu_model.fit_generator(
    training_generator(seq_len=100, batch_size=1024),
    steps_per_epoch=100,
    epochs=10,
)
tpu_model.save_weights('/tmp/bard.h5', overwrite=True)
In [0]:
BATCH_SIZE = 5
PREDICT_LEN = 250

# Keras requires the batch size be specified ahead of time for stateful models.
# We use a sequence length of 1, as we will be feeding in one character at a
# time and predicting the next character.
prediction_model = lstm_model(seq_len=1, batch_size=BATCH_SIZE, stateful=True)
prediction_model.load_weights('/tmp/bard.h5')

# We seed the model with our initial string, copied BATCH_SIZE times.
seed_txt = 'Looks it not like the king? Verily, we must go! '
seed = transform(seed_txt)
seed = np.repeat(np.expand_dims(seed, 0), BATCH_SIZE, axis=0)

# First, run the seed forward to prime the state of the model.
prediction_model.reset_states()
for i in range(len(seed_txt) - 1):
  prediction_model.predict(seed[:, i:i + 1])

# Now we can accumulate predictions!
predictions = [seed[:, -1:]]
for i in range(PREDICT_LEN):
  last_word = predictions[-1]
  next_probits = prediction_model.predict(last_word)[:, 0, :]

  # Sample from our output distribution.
  next_idx = [
      np.random.choice(256, p=next_probits[i])
      for i in range(BATCH_SIZE)
  ]
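  # Note: np.random.choice requires p to sum to 1 within a small tolerance,
  # which float32 softmax outputs can occasionally miss. If it raises
  # "probabilities do not sum to 1", renormalizing first is a common fix:
  #   p = next_probits[i] / next_probits[i].sum()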
  predictions.append(np.asarray(next_idx, dtype=np.int32))

for i in range(BATCH_SIZE):
  print('PREDICTION %d\n\n' % i)
  p = [predictions[j][i] for j in range(PREDICT_LEN)]
  generated = ''.join([chr(c) for c in p])
  print(generated)
  print()
  assert len(generated) == PREDICT_LEN, 'Generated text too short'