In [2]:
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils


Using TensorFlow backend.

In [3]:
# Read the whole training corpus into memory and fold it to lowercase, so
# upper- and lower-case forms of a letter become the same character.
filename = "wonderland.txt"
# The context manager closes the file handle even if the read raises.
with open(filename) as corpus_file:
    raw_text = corpus_file.read().lower()

In [6]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(raw_text))
# Lookup table from each character to its position in `chars`.
char_to_int = {symbol: index for index, symbol in enumerate(chars)}

In [17]:
# Show the character -> integer table. The parenthesised single-argument form
# prints the same thing on Python 2 and is also valid on Python 3.
print(char_to_int)


{'\n': 0, '!': 2, ' ': 1, '"': 3, "'": 4, ')': 6, '(': 5, '*': 7, '-': 9, ',': 8, '.': 10, '0': 11, '3': 12, ';': 14, ':': 13, '?': 15, '[': 16, ']': 17, '_': 18, 'a': 19, 'c': 21, 'b': 20, 'e': 23, 'd': 22, 'g': 25, 'f': 24, 'i': 27, 'h': 26, 'k': 29, 'j': 28, 'm': 31, 'l': 30, 'o': 33, 'n': 32, 'q': 35, 'p': 34, 's': 37, 'r': 36, 'u': 39, 't': 38, 'w': 41, 'v': 40, 'y': 43, 'x': 42, 'z': 44}

In [11]:
# Corpus length in characters, and the number of distinct characters in it.
n_chars, n_vocab = len(raw_text), len(chars)

In [12]:
## Each training pattern consists of 100 time steps of one character each (X),
## followed by the single character that comes next (y). The window slides
## along the whole book one character at a time, so every character except
## the first 100 is learned from the 100 characters that precede it.
## For example, with a sequence length of 5 (for simplicity) the first
## two training patterns would be:
## CHAPT -> E
## HAPTE -> R
seq_length = 100
# Encode the corpus once up front instead of looking up every character again
# in each of the overlapping windows it appears in.
encoded_text = [char_to_int[char] for char in raw_text]
# dataX[i] is the encoded window starting at offset i; dataY[i] is the encoded
# character immediately after that window. Both are empty when n_chars does
# not exceed seq_length.
dataX = [encoded_text[i:i + seq_length] for i in range(n_chars - seq_length)]
dataY = encoded_text[seq_length:n_chars]
n_patterns = len(dataX)
# A single pre-formatted string prints identically on Python 2 and Python 3.
print("Total Patterns:  %d" % n_patterns)


Total Patterns:  144331

In [19]:
# Reshape the integer windows into a 3-D array of shape
# (n_patterns, seq_length, 1): one value per time step.
X = numpy.reshape(dataX, (n_patterns, seq_length, 1))
# normalize: divide the integer codes by the vocabulary size; float() keeps
# this a true division on Python 2 as well.
X = X / float(n_vocab)
# one hot encode the output variable. The class count is passed explicitly so
# y always has n_vocab columns, rather than a width inferred from the largest
# label that happens to occur in dataY. It is passed positionally because the
# keyword name differs between Keras versions.
y = np_utils.to_categorical(dataY, n_vocab)

In [ ]: