In [2]:
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
In [3]:
# Load the whole corpus into memory and lower-case it, so upper- and
# lower-case forms of a letter are the same character from here on.
filename = "wonderland.txt"
# The context manager closes the file as soon as it has been read.
with open(filename) as text_file:
    raw_text = text_file.read().lower()
In [6]:
# Distinct characters of the corpus, in sorted order.
chars = sorted(set(raw_text))
# Lookup table: character -> its position in `chars`.
char_to_int = {ch: idx for idx, ch in enumerate(chars)}
In [17]:
# Show the character -> integer mapping. The parenthesised single-argument
# form prints the same text on Python 2 and Python 3.
print(char_to_int)
In [11]:
# n_chars: length of the corpus in characters.
# n_vocab: number of distinct characters in it.
n_chars, n_vocab = len(raw_text), len(chars)
In [12]:
## Build the training patterns with a sliding window over the text.
## Each pattern is `seq_length` consecutive characters (the input, X) followed
## by the single character that comes next (the output, y). The window moves
## along the whole book one character at a time, so every character except
## the first `seq_length` is learned from the characters that precede it.
## For example, if the sequence length were 5 (for simplicity), the first
## two training patterns would be:
##   CHAPT -> E
##   HAPTE -> R
seq_length = 100
dataX = []
dataY = []
for i in range(n_chars - seq_length):
    seq_in = raw_text[i:i + seq_length]
    seq_out = raw_text[i + seq_length]
    # Store the window and its target as integer ids from char_to_int.
    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)
# Joined by hand so the printed line is identical on Python 2 and Python 3.
print(" ".join(["Total Patterns: ", str(n_patterns)]))
In [19]:
# Arrange the encoded windows as (n_patterns, seq_length, 1) and divide by the
# vocabulary size so every id is scaled into [0, 1).
X = numpy.asarray(dataX).reshape((n_patterns, seq_length, 1)) / float(n_vocab)
# One-hot encode the target characters.
# NOTE(review): no class count is passed, so the width of y is inferred from
# dataY by to_categorical - confirm it matches n_vocab.
y = np_utils.to_categorical(dataY)
In [ ]: