English sequence generator

An English-language sequence generator that produces semi-coherent sentences from scratch, building them up character by character

Natural Language Processing

Dataset: the complete text of Sir Arthur Conan Doyle's classic The Adventures of Sherlock Holmes

Based on the RNN text-generation project from Udacity's Artificial Intelligence Nanodegree


In [1]:
%matplotlib inline

import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import keras
import helper

helper.reproducible(seed=9)
sns.set()


Using TensorFlow backend.

Load and process the data


In [2]:
text = open('data/holmes.txt').read().lower()
print('Total characters: {}'.format(len(text)))
text[:300]


Total characters: 581864
Out[2]:
"\ufeffproject gutenberg's the adventures of sherlock holmes, by arthur conan doyle\n\nthis ebook is for the use of anyone anywhere at no cost and with\nalmost no restrictions whatsoever.  you may copy it, give it away or\nre-use it under the terms of the project gutenberg license included\nwith this ebook or "

Preprocess the data


In [3]:
text = text[1302:]  # remove title, author page, and table of contents
text = text.replace('\n', ' ')
text = text.replace('\r', ' ')

unique_characters = set(list(text))
print(unique_characters)


{'!', '-', '$', 'y', 'è', 'q', 'p', '"', ')', 'n', 'f', 'm', 'x', ' ', '1', 'u', '7', "'", '(', 'i', 's', ';', 'z', '5', ':', 'v', 'k', 'j', '/', 'à', '0', '%', 'c', '3', 'g', '2', '6', '@', '9', 'e', '*', '4', 'a', 'â', '?', 'r', 'l', 'd', '.', 'w', '8', '&', 'é', 'h', 't', ',', 'o', 'b'}

In [4]:
# replace non-English characters (accented letters, digits, and symbols) with spaces
import re
text = re.sub("[$%&'()*@/àâèé0123456789-]", " ", text)
text = text.replace('"', ' ')
text = text.replace('  ', ' ')  # collapse the double spaces introduced above (single pass)
text[:300]


Out[4]:
'is eyes she eclipses and predominates the whole of her sex. it was not that he felt any emotion akin to love for irene adler. all emotions, and that one particularly, were abhorrent to his cold, precise but admirably balanced mind. he was, i take it, the most perfect reasoning and observing machine '

In [5]:
chars = sorted(list(set(text)))
num_chars = len(chars)
print('Total characters: {}'.format(len(text)))
print('Unique characters: {}'.format(num_chars))
print(chars)


Total characters: 573681
Unique characters: 33
[' ', '!', ',', '.', ':', ';', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

Split data into input/output pairs


In [6]:
# transform the input text and window size into a set of input/output pairs
# for use with the RNN

window_size = 100
step_size = 5

input_pairs = []
output_pairs = []

for i in range(0, len(text) - window_size, step_size):
    input_pairs.append(text[i:i + window_size])
    output_pairs.append(text[i + window_size])
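
As a quick sanity check (an illustrative snippet, not a cell from the original notebook), the first two pairs show how each 100-character window maps to the single character that follows it, with consecutive windows starting 5 characters apart:

print(repr(input_pairs[0][-20:]), '->', repr(output_pairs[0]))
print(repr(input_pairs[1][-20:]), '->', repr(output_pairs[1]))
print('Total pairs: {}'.format(len(input_pairs)))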

One-hot encode the characters


In [7]:
chars_to_indices = dict((c, i) for i, c in enumerate(chars))
indices_to_chars = dict((i, c) for i, c in enumerate(chars))

# create variables for one-hot encoded input/output
X = np.zeros((len(input_pairs), window_size, num_chars), dtype=bool)
y = np.zeros((len(input_pairs), num_chars), dtype=bool)

# transform character-based input_pairs/output_pairs into equivalent numerical versions
for i, sentence in enumerate(input_pairs):
    for t, char in enumerate(sentence):
        X[i, t, chars_to_indices[char]] = 1
    y[i, chars_to_indices[output_pairs[i]]] = 1
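
A round-trip decode (again an illustrative sketch, not an original cell) verifies the encoding: recovering each character from the one-hot rows of X[0] should reproduce input_pairs[0] exactly.

decoded = ''.join(indices_to_chars[idx] for idx in X[0].argmax(axis=1))
assert decoded == input_pairs[0]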

Recurrent Neural Network Model


In [8]:
from keras.models import Sequential
from keras.layers import Dense, Activation, LSTM

model = Sequential()
model.add(LSTM(200, input_shape=(window_size, num_chars)))
model.add(Dense(num_chars, activation=None))  # linear layer
model.add(Dense(num_chars, activation="softmax"))  # probability distribution over characters
model.summary()

optimizer = keras.optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)

model.compile(loss='categorical_crossentropy', optimizer=optimizer)

# train the model
print("Training ...")
%time history = model.fit(X, y, batch_size=512, epochs=100, verbose=0)
helper.show_training(history)

model_path = os.path.join("models", "text_generator.h5")
model.save(model_path)
print("\nModel saved at", model_path)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
lstm_1 (LSTM)                (None, 200)               187200    
_________________________________________________________________
dense_1 (Dense)              (None, 33)                6633      
_________________________________________________________________
dense_2 (Dense)              (None, 33)                1122      
=================================================================
Total params: 194,955
Trainable params: 194,955
Non-trainable params: 0
_________________________________________________________________
Training ...
CPU times: user 1h 5min 52s, sys: 8min 57s, total: 1h 14min 50s
Wall time: 1h 18min 14s
Training loss:  	0.3027

Model saved at models/text_generator.h5

Make predictions


In [9]:
model = keras.models.load_model(model_path)
print("Model loaded:", model_path)


def predict_next_chars(model, input_chars, num_to_predict):
    """ predict a number of future characters """

    predicted_chars = ''
    for i in range(num_to_predict):
        # one-hot encode the current input window
        x_test = np.zeros((1, window_size, num_chars))
        for t, char in enumerate(input_chars):
            x_test[0, t, chars_to_indices[char]] = 1.

        test_predict = model.predict(x_test, verbose=0)[0]

        # translate the numerical prediction back to a character (greedy argmax)
        r = np.argmax(test_predict)
        d = indices_to_chars[r]

        # append the prediction and slide the input window one character forward
        predicted_chars += d
        input_chars += d
        input_chars = input_chars[1:]
    return predicted_chars


for start_index in range(0, 500, 100):
    input_chars = text[start_index:start_index + window_size]
    predict_input = predict_next_chars(model, input_chars, num_to_predict=100)

    print('------------------')
    input_line = 'input chars = ' + '\n' + input_chars + '"' + '\n'
    print(input_line)

    line = 'predicted chars = ' + '\n' + predict_input + '"' + '\n'
    print(line)


Model loaded: models/text_generator.h5
------------------
input chars = 
is eyes she eclipses and predominates the whole of her sex. it was not that he felt any emotion akin"

predicted chars = 
 to me againsit with a sertenc.  and fas one of one of the morning in the each of seed.  lockes and "

------------------
input chars = 
 to love for irene adler. all emotions, and that one particularly, were abhorrent to his cold, preci"

predicted chars = 
ses i wast for do a thing. i was not and solited mystery, and he stook anotion behond.  that you mad"

------------------
input chars = 
se but admirably balanced mind. he was, i take it, the most perfect reasoning and observing machine "

predicted chars = 
the day of uping that whome the dang fortunly should be should mook within meservaton to s ventily, "

------------------
input chars = 
that the world has seen, but as a lover he would have placed himself in a false position. he never s"

predicted chars = 
pepts for he is ear her sending it was morth, and you are very it all, and the manter was speciall e"

------------------
input chars = 
poke of the softer passions, save with a gibe and a sneer. they were admirable things for the observ"

predicted chars = 
ed of the boxousing to brave ausulation, for the incinity of his face, said he, may be one of the bo"
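
The generation loop above decodes greedily with np.argmax, which always picks the single most likely next character and therefore tends to repeat itself. A common variant, sketched below with the same variable names (it is not part of the original notebook), samples from the softmax output using a temperature parameter: values below 1 stay close to greedy decoding, while values above 1 yield more varied but noisier text.

def sample_with_temperature(preds, temperature=0.5):
    """Sample a character index from the softmax output, reweighted by temperature."""
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds + 1e-8) / temperature  # rescale the log-probabilities
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)       # renormalize to a valid distribution
    return np.random.choice(len(preds), p=preds)

# drop-in replacement for the argmax line inside predict_next_chars:
#     r = sample_with_temperature(test_predict, temperature=0.5)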