Creating an English-language sequence generator that builds semi-coherent English sentences from scratch, character by character
Natural Language Processing
Dataset: the complete text of Sir Arthur Conan Doyle's classic The Adventures of Sherlock Holmes
Based on the RNN text-generation project from Udacity's Artificial Intelligence Nanodegree
In [1]:
%matplotlib inline
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import keras
import helper
helper.reproducible(seed=9)
sns.set()
In [2]:
text = open('data/holmes.txt').read().lower()
print('Total characters: {}'.format(len(text)))
text[:300]
Out[2]:
In [3]:
text = text[1302:] # remove title, author page, and table of contents
text = text.replace('\n', ' ')
text = text.replace('\r', ' ')
unique_characters = set(list(text))
print(unique_characters)
In [4]:
# remove rare punctuation, accented characters, and digits
import re
text = re.sub("[$%&'()*@/àâèé0123456789-]", " ", text)
text = text.replace('"', ' ')
text = re.sub(' +', ' ', text)  # collapse the runs of spaces created above
text[:300]
Out[4]:
In [5]:
chars = sorted(list(set(text)))
num_chars = len(chars)
print('Total characters: {}'.format(len(text)))
print('Unique characters: {}'.format(num_chars))
print(chars)
In [6]:
# transform the input text and window size into a set of input/output pairs
# for use with the RNN
window_size = 100
step_size = 5
input_pairs = []
output_pairs = []
for i in range(0, len(text) - window_size, step_size):
    input_pairs.append(text[i:i + window_size])
    output_pairs.append(text[i + window_size])
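As a quick sanity check (a toy illustration, not part of the original notebook), the same windowing logic on a short string shows how each input window lines up with its single target character:

# sliding window on a toy string, with window_size=5 and step_size=2
toy = 'sherlock holmes'
for i in range(0, len(toy) - 5, 2):
    print(repr(toy[i:i + 5]), '->', repr(toy[i + 5]))
# 'sherl' -> 'o'
# 'erloc' -> 'k'
# 'lock ' -> 'h'  ... and so on

Each pair asks the network to predict the character that immediately follows its 100-character window; step_size=5 simply thins the pairs to keep the dataset a manageable size.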
In [7]:
chars_to_indices = dict((c, i) for i, c in enumerate(chars))
indices_to_chars = dict((i, c) for i, c in enumerate(chars))
# create variables for one-hot encoded input/output
X = np.zeros((len(input_pairs), window_size, num_chars), dtype=bool)
y = np.zeros((len(input_pairs), num_chars), dtype=bool)
# transform character-based input_pairs/output_pairs into equivalent numerical versions
for i, sentence in enumerate(input_pairs):
    for t, char in enumerate(sentence):
        X[i, t, chars_to_indices[char]] = 1
    y[i, chars_to_indices[output_pairs[i]]] = 1
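A small round-trip check (an addition, not in the original notebook) confirms the one-hot encoding is invertible; since each timestep of X has exactly one True entry, argmax recovers the character index:

# decode the first window and its target back into characters
decoded = ''.join(indices_to_chars[idx] for idx in X[0].argmax(axis=1))
assert decoded == input_pairs[0]
assert indices_to_chars[y[0].argmax()] == output_pairs[0]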
In [8]:
from keras.models import Sequential
from keras.layers import Dense, LSTM
model = Sequential()
model.add(LSTM(200, input_shape=(window_size, num_chars)))
model.add(Dense(num_chars, activation='softmax'))
model.summary()
optimizer = keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9, epsilon=1e-08)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
# train the model
print("Training ...")
%time history = model.fit(X, y, batch_size=512, epochs=100, verbose=0)
helper.show_training(history)
model_path = os.path.join("models", "text_generator.h5")
model.save(model_path)
print("\nModel saved at", model_path)
In [9]:
model = keras.models.load_model(model_path)
print("Model loaded:", model_path)
def predict_next_chars(model, input_chars, num_to_predict):
    """ predict a number of future characters """
    predicted_chars = ''
    for i in range(num_to_predict):
        # one-hot encode the current window
        x_test = np.zeros((1, window_size, num_chars))
        for t, char in enumerate(input_chars):
            x_test[0, t, chars_to_indices[char]] = 1.
        test_predict = model.predict(x_test, verbose=0)[0]
        # translate the numerical prediction back to a character (greedy argmax)
        r = np.argmax(test_predict)
        d = indices_to_chars[r]
        # append the prediction and slide the input window forward one character
        predicted_chars += d
        input_chars += d
        input_chars = input_chars[1:]
    return predicted_chars
for start_index in range(0, 500, 100):
    input_chars = text[start_index:start_index + window_size]
    predicted_chars = predict_next_chars(model, input_chars, num_to_predict=100)
    print('------------------')
    print('input chars = ' + '\n' + '"' + input_chars + '"' + '\n')
    print('predicted chars = ' + '\n' + '"' + predicted_chars + '"' + '\n')
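Greedy argmax decoding, as used in predict_next_chars above, tends to fall into repetitive loops. A common alternative (a sketch, not part of the original notebook) is to sample the next character from the softmax output, sharpened or flattened by a temperature parameter:

def sample_with_temperature(preds, temperature=0.5):
    """ sample an index from a softmax output; lower temperature = safer choices """
    preds = np.log(np.asarray(preds, dtype=np.float64) + 1e-10) / temperature
    exp_preds = np.exp(preds)
    probs = exp_preds / np.sum(exp_preds)
    return np.random.choice(len(probs), p=probs)

# hypothetical drop-in replacement for the argmax line in predict_next_chars:
# r = sample_with_temperature(test_predict, temperature=0.5)

With a temperature below 1 the distribution concentrates on the likeliest characters; above 1 it flattens toward uniform, producing more varied but noisier text.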