In [1]:
from theano.sandbox import cuda
cuda.use('gpu1')
In [2]:
%matplotlib inline
import sys
import utils
from utils import *
from keras.layers import TimeDistributed, Activation
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from numpy.random import choice
In [3]:
path = 'text/modu.txt'
text = open(path).read()
text = text.replace(' ', '')
text = text[-200000:]
print('corpus length:', len(text))
In [4]:
!tail {path} -n10
In [5]:
chars = sorted(list(set(text)))
vocab_size = len(chars)+1
print('total chars: ', vocab_size)
Sometimes it's useful to have a zero value in the dataset, e.g. for padding
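As a quick illustration (the short sequences below are made up, not taken from the corpus), the reserved 0 index can then be used by Keras's pad_sequences as the padding value:
In [ ]:
# Toy example: pad_sequences fills short sequences with the reserved 0 index
from keras.preprocessing.sequence import pad_sequences
pad_sequences([[5, 2, 9], [7, 1]], maxlen=4, value=0)
# -> [[0, 5, 2, 9],
#     [0, 0, 7, 1]]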
In [6]:
chars.insert(0, "\0")
''.join(chars[:16])
Out[6]:
In [7]:
char_indices = dict((c, i) for i,c in enumerate(chars))
indices_char = dict((i, c) for i,c in enumerate(chars))
idx = [char_indices[c] for c in text]
In [8]:
idx[:10]
Out[8]:
In [9]:
''.join(indices_char[i] for i in idx[:20])
Out[9]:
In [10]:
seq_length = 100
dataX = []
dataY = []
for i in range(0, len(idx) - seq_length, 1):
    seq_in = idx[i:i+seq_length]
    seq_out = idx[i+seq_length]
    dataX.append(seq_in)
    dataY.append(seq_out)
n_patterns = len(dataX)
n_patterns
Out[10]:
Now that we have prepared our training data, we need to transform it so that it is suitable for use with Keras.
First, we must reshape the list of input sequences into the form [samples, time steps, features] expected by an LSTM network.
Next, we need to rescale the integers to [0, 1] to make the patterns easier to learn for the LSTM network, whose gates use sigmoid activations by default.
Finally, we need to convert the output patterns into a one-hot encoding, so that we can configure the network to predict the probability of each character in the vocabulary (an easier representation) rather than trying to force it to predict precisely the next character.
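As a toy illustration (not part of the dataset), to_categorical turns a vector of class indices into one-hot rows:
In [ ]:
# Toy example of one-hot encoding with np_utils.to_categorical
np_utils.to_categorical([0, 2, 1])
# -> [[1, 0, 0],
#     [0, 0, 1],
#     [0, 1, 0]]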
In [11]:
X = np.reshape(dataX, (n_patterns, seq_length, 1))
print(X.shape)
X = X / float(vocab_size)
y = np_utils.to_categorical(dataY)
In [12]:
print(y.shape)
We can now define our LSTM model. Here we define a single hidden LSTM layer with 512 memory units. The network uses dropout with a probability of 0.2. The output layer is a Dense layer using the softmax activation function to output a probability between 0 and 1 for each of the 3000+ characters in the vocabulary.
In [13]:
model = Sequential()
model.add(LSTM(512, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer=Adam())
The network is slow to train (about 300 seconds per epoch on an Nvidia K520 GPU). Because of the slowness, and because of our optimization requirements, we will use model checkpointing to record the network weights to a file each time an improvement in loss is observed at the end of an epoch. We will use the best set of weights (lowest loss) to instantiate our generative model in the next section.
In [14]:
# define the checkpoint
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
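Later, once training has produced checkpoint files, the lowest-loss weights can be loaded back before generating text. A minimal sketch; the filename below is a placeholder for whichever checkpoint your run actually produced:
In [ ]:
# Placeholder filename: substitute the lowest-loss checkpoint from your own run
best_weights = "weights-improvement-04-1.2345.hdf5"
model.load_weights(best_weights)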
In [15]:
model.summary()
In [16]:
model.fit(X, y, nb_epoch=4, batch_size=256, callbacks=callbacks_list)
Out[16]:
In [22]:
# pick a random seed
start = np.random.randint(0, len(dataX)-1)
# start=-1
pattern = dataX[start]
print("Seed:")
print("\"", ''.join([indices_char[value] for value in pattern]), "\"")
In [ ]:
# generate characters (greedy decoding: always pick the most likely next character)
for i in range(1000):
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_size)
    prediction = model.predict(x, verbose=0)
    index = np.argmax(prediction)
    result = indices_char[index]
    sys.stdout.write(result)
    pattern.append(index)
    pattern = pattern[1:]
print("\nDone.")
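Greedy argmax decoding tends to get stuck repeating the most likely characters. A sketch of an alternative (this is presumably why `choice` was imported above, though it is never used in the original cells): sample each next character from the predicted distribution instead.
In [ ]:
# Sketch: sample from the model's distribution instead of taking the argmax
pattern = list(dataX[start])                     # re-seed from the same starting point
for i in range(1000):
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_size)
    probs = model.predict(x, verbose=0)[0].astype('float64')
    probs = probs / probs.sum()                  # re-normalise to guard against float32 rounding
    index = choice(len(probs), p=probs)          # weighted random pick of the next character index
    sys.stdout.write(indices_char[index])
    pattern.append(index)
    pattern = pattern[1:]
print("\nDone.")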
stateful=True means that at the end of each sequence the hidden activations are not reset to 0, but are left as they are, so state carries over from one batch to the next. You must also pass shuffle=False when you train the model, so that consecutive batches really are consecutive stretches of text (a sketch of resetting this state by hand follows the training cell below).
A stateful model is easy to create (just add "stateful=True") but harder to train. We had to add batchnorm and use an LSTM to get reasonable results.
When using stateful models in Keras, you also have to pass 'batch_input_shape' to the first layer, which fixes the batch size there.
In [64]:
bs=64
In [65]:
model = Sequential([
    Embedding(vocab_size, n_fac, input_length=cs, batch_input_shape=(bs, cs)),
    BatchNormalization(),
    LSTM(n_hidden, return_sequences=True, stateful=True),
    TimeDistributed(Dense(vocab_size, activation='softmax')),
])
In [66]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
Since we're using a fixed batch shape, we have to ensure that the number of training examples is an exact multiple of the batch size.
In [67]:
mx = len(x_rnn)//bs*bs
In [68]:
model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, nb_epoch=10, shuffle=False)
Out[68]:
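As mentioned above, the hidden state is never reset automatically. A sketch (an assumption about usage, not from the original run) of training one pass at a time and resetting the carried-over state between full passes over the text:
In [ ]:
# Sketch: one pass over the text per fit call, resetting the carried state in between
for epoch in range(4):                # hypothetical number of passes
    model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, nb_epoch=1, shuffle=False)
    model.reset_states()              # start the next pass with a fresh hidden state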
In [69]:
def get_next_keras(inp):
    idxs = [char_indices[c] for c in inp]
    # np.newaxis adds a leading batch dimension of size 1
    arrs = np.array(idxs)[np.newaxis, :]
    p = model.predict(arrs)[0]
    # p has one distribution per time step; the last one predicts the next character
    return chars[np.argmax(p[-1])]
In [73]:
model.predict(x_rnn[-64:])[0]
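Because the output layer is wrapped in TimeDistributed, each prediction contains one probability distribution per time step. An illustrative sketch (not from the original run) of decoding one sequence of predictions back into characters:
In [ ]:
# Decode the per-timestep distributions of the last full batch into characters
preds = model.predict(x_rnn[-bs:], batch_size=bs)[0]     # shape: (cs, vocab_size)
''.join(chars[np.argmax(p)] for p in preds)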
In [ ]: