In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.callbacks import ModelCheckpoint
from time import gmtime, strftime
import os
import re
import pickle
import random
import sys
In [2]:
# load ascii text from file
filename = "data/obama.txt"
with open(filename) as f:
    raw_text = f.read()
# get rid of any characters other than letters, numbers,
# and a few special characters
raw_text = re.sub(r'[^\nA-Za-z0-9 ,.:;?!-]+', '', raw_text)
# convert all text to lowercase
raw_text = raw_text.lower()
n_chars = len(raw_text)
print "length of text:", n_chars
print "text preview:", raw_text[:500]
In [3]:
# extract all unique characters in the text
chars = sorted(list(set(raw_text)))
n_vocab = len(chars)
print "number of unique characters found:", n_vocab
# create mapping of characters to integers and back
char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i, c) for i, c in enumerate(chars))
# test our mapping
print('a', "- maps to ->", char_to_int["a"])
print(25, "- maps to ->", int_to_char[25])
In [4]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 100
inputs = []
outputs = []
for i in range(0, n_chars - seq_length, 1):
    inputs.append(raw_text[i:i + seq_length])
    outputs.append(raw_text[i + seq_length])
n_sequences = len(inputs)
print("Total sequences:", n_sequences)
In [5]:
# shuffle the dataset by permuting a list of indices
indices = list(range(len(inputs)))
random.shuffle(indices)
inputs = [inputs[x] for x in indices]
outputs = [outputs[x] for x in indices]
In [6]:
# create two zero-filled numpy arrays with the proper dimensions
X = np.zeros((n_sequences, seq_length, n_vocab), dtype=bool)
y = np.zeros((n_sequences, n_vocab), dtype=bool)
# iterate over the data and build up the X and y data sets
# by setting the appropriate indices to 1 in each one-hot vector
for i, example in enumerate(inputs):
    for t, char in enumerate(example):
        X[i, t, char_to_int[char]] = 1
    y[i, char_to_int[outputs[i]]] = 1
print('X dims -->', X.shape)
print('y dims -->', y.shape)
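Decoding one row of X back into text is a cheap way to confirm the one-hot encoding round-trips; a sketch that inverts the encoding with np.argmax:
In [ ]:
# invert the one-hot encoding of the first example; should print True
decoded = ''.join(int_to_char[j] for j in np.argmax(X[0], axis=-1))
print(decoded == inputs[0])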
In [7]:
# define the LSTM model
model = Sequential()
model.add(LSTM(128, return_sequences=False, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.50))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
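The ModelCheckpoint, os, and strftime imports point to a training step that saves weights to disk, but that cell isn't shown here. A minimal sketch of what it might look like; the directory layout, filename pattern, batch size, and epoch count are assumptions, not values from the original notebook:
In [ ]:
# hypothetical training cell: the checkpoint path, filename pattern,
# batch size, and epoch count are illustrative guesses
checkpoint_dir = os.path.join("checkpoints", strftime("%Y%m%d-%H%M%S", gmtime()))
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)
filepath = os.path.join(checkpoint_dir, "weights-{epoch:02d}-{loss:.4f}.hdf5")
checkpoint = ModelCheckpoint(filepath, monitor='loss', save_best_only=True, mode='min')
model.fit(X, y, batch_size=128, epochs=20, callbacks=[checkpoint])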
In [8]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array,
    # re-weighted by the given temperature
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)
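Dividing the log-probabilities by the temperature before re-normalizing is equivalent to raising each probability to the power 1/temperature: temperatures below 1 sharpen the distribution toward the most likely characters, temperatures above 1 flatten it toward uniform. A toy illustration on a made-up distribution:
In [ ]:
# draw 1000 samples from an arbitrary 4-way distribution at several
# temperatures; low temperature concentrates mass on the likeliest index
toy = np.array([0.5, 0.3, 0.15, 0.05])
for temp in [0.2, 1.0, 1.5]:
    draws = [sample(toy, temp) for _ in range(1000)]
    print(temp, '->', np.bincount(draws, minlength=4))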
In [ ]:
def generate(sentence, prediction_length=50, diversity=0.35):
    print('----- diversity:', diversity)
    generated = sentence
    sys.stdout.write(generated)
    # iterate over number of characters requested
    for i in range(prediction_length):
        # build up sequence data from current sentence
        x = np.zeros((1, X.shape[1], X.shape[2]))
        for t, char in enumerate(sentence):
            x[0, t, char_to_int[char]] = 1.
        # use trained model to return probability distribution
        # for next character based on input sequence
        preds = model.predict(x, verbose=0)[0]
        # use sample() function to sample next character
        # based on probability distribution and desired diversity
        next_index = sample(preds, diversity)
        # convert integer to character
        next_char = int_to_char[next_index]
        # add new character to generated text
        generated += next_char
        # delete the first character from the beginning of the sentence,
        # and add the new character to the end. This forms the
        # input sequence for the next predicted character.
        sentence = sentence[1:] + next_char
        # print results to screen
        sys.stdout.write(next_char)
        sys.stdout.flush()
    print()
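Once the model has been trained (or weights restored with model.load_weights), generation can be seeded with a random seq_length-character slice of the corpus. A minimal usage sketch; the prediction length and diversity values here are arbitrary choices:
In [ ]:
# seed generation with a random 100-character window from the corpus
start = random.randint(0, n_chars - seq_length - 1)
seed = raw_text[start:start + seq_length]
for diversity in [0.2, 0.5, 1.0]:
    generate(seed, prediction_length=200, diversity=diversity)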