In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

from time import gmtime, strftime
import os
import re
import pickle
import random
import sys


Using TensorFlow backend.

In [2]:
# load ascii text from file; the context manager closes the handle as soon
# as the contents have been read instead of leaving it to the garbage collector
filename = "data/obama.txt"
with open(filename) as text_file:
    raw_text = text_file.read()

# get rid of any characters other than newlines, letters, numbers, spaces
# and a few punctuation marks (, . : ; ? ! -)
raw_text = re.sub('[^\nA-Za-z0-9 ,.:;?!-]+', '', raw_text)

# convert all text to lowercase
raw_text = raw_text.lower()

n_chars = len(raw_text)
# single-argument print(...) calls emit the same text on Python 2 and Python 3
print("length of text: %s" % n_chars)
print("text preview: %s" % raw_text[:500])


---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
<ipython-input-2-a8135bc164c8> in <module>()
      1 # load ascii text from file
      2 filename = "data/obama.txt"
----> 3 raw_text = open(filename).read()
      4 
      5 # get rid of any characters other than letters, numbers,

IOError: [Errno 2] No such file or directory: 'data/obama.txt'

In [3]:
# extract all unique characters in the text; sorting fixes the order of the
# vocabulary, so the same text always yields the same index for each character
chars = sorted(set(raw_text))
n_vocab = len(chars)
# single-argument print(...) calls emit the same text on Python 2 and Python 3
print("number of unique characters found: %s" % n_vocab)

# create mapping of characters to integers and back
char_to_int = {c: i for i, c in enumerate(chars)}
int_to_char = {i: c for i, c in enumerate(chars)}

# test our mapping (raises KeyError if "a" is absent from the text or the
# vocabulary holds fewer than 26 characters)
print("a - maps to -> %s" % char_to_int["a"])
print("25 - maps to -> %s" % int_to_char[25])


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-3-c7830beb99e9> in <module>()
      1 # extract all unique characters in the text
----> 2 chars = sorted(list(set(raw_text)))
      3 n_vocab = len(chars)
      4 print "number of unique characters found:", n_vocab
      5 

NameError: name 'raw_text' is not defined

In [4]:
# prepare the dataset of input to output pairs; both sides are still raw
# text here and are one-hot encoded in a later step
seq_length = 100

# slide a window over the text one character at a time: each run of
# seq_length characters is an input, and the character that immediately
# follows it is the matching output
window_starts = range(n_chars - seq_length)
inputs = [raw_text[i:i + seq_length] for i in window_starts]
outputs = [raw_text[i + seq_length] for i in window_starts]

n_sequences = len(inputs)
# single-argument print(...) emits the same text on Python 2 and Python 3
# (the double space matches the original message)
print("Total sequences:  %s" % n_sequences)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-4-252f1f23c70d> in <module>()
      5 outputs = []
      6 
----> 7 for i in range(0, n_chars - seq_length, 1):
      8     inputs.append(raw_text[i:i + seq_length])
      9     outputs.append(raw_text[i + seq_length])

NameError: name 'n_chars' is not defined

In [5]:
# shuffle inputs and outputs with one shared permutation so that every
# input sequence stays paired with its own output character
# list(...) is required: on Python 3 range() returns an immutable object
# that random.shuffle cannot reorder in place
indeces = list(range(len(inputs)))
random.shuffle(indeces)

inputs = [inputs[x] for x in indeces]
outputs = [outputs[x] for x in indeces]

In [6]:
# create two empty numpy arrays with the proper dimensions
# (builtin bool is the dtype the removed np.bool alias used to point at,
# so this also works on NumPy releases that no longer ship np.bool)
X = np.zeros((n_sequences, seq_length, n_vocab), dtype=bool)
y = np.zeros((n_sequences, n_vocab), dtype=bool)

# iterate over the data and build up the X and y data sets
# by setting the appropriate indices to 1 in each one-hot vector
for i, example in enumerate(inputs):
    for t, char in enumerate(example):
        X[i, t, char_to_int[char]] = 1
    y[i, char_to_int[outputs[i]]] = 1

# single-argument print(...) calls emit the same text on Python 2 and Python 3;
# the shape is wrapped in a 1-tuple so % formats it as a single value
print('X dims --> %s' % (X.shape,))
print('y dims --> %s' % (y.shape,))


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-6-fbec2967dcfa> in <module>()
      1 # create two empty numpy array with the proper dimensions
----> 2 X = np.zeros((n_sequences, seq_length, n_vocab), dtype=np.bool)
      3 y = np.zeros((n_sequences, n_vocab), dtype=np.bool)
      4 
      5 # iterate over the data and build up the X and y data sets

NameError: name 'n_sequences' is not defined

In [7]:
# define the LSTM model: a single LSTM layer, then dropout, then a softmax
# layer with one unit per column of y
timesteps, n_features = X.shape[1], X.shape[2]
n_classes = y.shape[1]

model = Sequential()
for layer in (
    LSTM(128, return_sequences=False, input_shape=(timesteps, n_features)),
    Dropout(0.50),
    Dense(n_classes, activation='softmax'),
):
    model.add(layer)
model.compile(optimizer='adam', loss='categorical_crossentropy')


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-7-89f3b7ecfef0> in <module>()
      1 # define the LSTM model
      2 model = Sequential()
----> 3 model.add(LSTM(128, return_sequences=False, input_shape=(X.shape[1], X.shape[2])))
      4 model.add(Dropout(0.50))
      5 model.add(Dense(y.shape[1], activation='softmax'))

NameError: name 'X' is not defined

In [8]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array after temperature scaling.

    The probabilities are raised to the power ``1 / temperature`` (done in
    log space), renormalised, and a single index is drawn from the result
    with ``np.random.multinomial``.

    Args:
        preds: array-like of probabilities.
        temperature: scaling factor applied to the log-probabilities.
            Values below 1 sharpen the distribution and values above 1
            flatten it. ``0`` selects the most probable index directly.

    Returns:
        The sampled index, as returned by ``np.argmax``.
    """
    preds = np.asarray(preds).astype('float64')

    # The zero-temperature limit is greedy decoding; handle it explicitly
    # rather than dividing by zero.
    if temperature == 0:
        return np.argmax(preds)

    # log(0) == -inf is the intended value for impossible classes, so the
    # divide-by-zero warning it triggers is just noise.
    with np.errstate(divide='ignore'):
        scaled = np.log(preds) / temperature

    # Shift so the largest entry is 0 before exponentiating. The shift
    # cancels out in the normalisation below, but without it a small
    # temperature underflows every exp() to 0 and the division yields NaN.
    scaled = scaled - scaled.max()

    exp_preds = np.exp(scaled)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [ ]:
def generate(sentence, prediction_length=50, diversity=0.35):
    """Generate text one character at a time, echoing it to stdout.

    Relies on the notebook-level names ``model``, ``X``, ``char_to_int``,
    ``int_to_char`` and ``sample``.

    Args:
        sentence: seed text. The whole seed is echoed and returned, but only
            its last ``X.shape[1]`` characters are fed to the model. A
            shorter seed leaves the trailing timesteps of the input all-zero.
        prediction_length: number of characters to generate.
        diversity: temperature handed to ``sample``.

    Returns:
        The seed followed by the generated characters.

    Raises:
        KeyError: if the seed contains a character that is not a key of
            ``char_to_int``.
    """
    # single-argument print(...) emits the same text on Python 2 and Python 3
    print('----- diversity: %s' % diversity)

    seq_len, vocab_size = X.shape[1], X.shape[2]

    generated = sentence
    sys.stdout.write(generated)

    # The input array only has seq_len timesteps, so a longer seed would
    # index past its end; keep the most recent seq_len characters.
    sentence = sentence[-seq_len:]

    # iterate over number of characters requested
    for _ in range(prediction_length):

        # build up one-hot sequence data from the current window
        x = np.zeros((1, seq_len, vocab_size))
        for t, char in enumerate(sentence):
            x[0, t, char_to_int[char]] = 1.

        # use trained model to return probability distribution
        # for next character based on input sequence
        preds = model.predict(x, verbose=0)[0]

        # use sample() function to sample next character
        # based on probability distribution and desired diversity
        next_index = sample(preds, diversity)

        # convert integer to character
        next_char = int_to_char[next_index]

        # add new character to generated text
        generated += next_char

        # drop the first character of the window and append the new
        # character; this forms the input sequence for the next prediction
        sentence = sentence[1:] + next_char

        # print results to screen
        sys.stdout.write(next_char)
        sys.stdout.flush()

    # finish the echoed line (a bare `print` does nothing on Python 3)
    print('')
    return generated