In [1]:
%pylab inline
import copy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import os
import re
from keras.datasets import imdb, reuters
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.optimizers import SGD, RMSprop
from keras.utils import np_utils
from keras.layers.convolutional import Convolution1D, MaxPooling1D, ZeroPadding1D, AveragePooling1D
from keras.callbacks import EarlyStopping
from keras.layers.normalization import BatchNormalization
from keras.preprocessing import sequence
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN, LSTM, GRU
from keras.layers.wrappers import TimeDistributed
from keras.preprocessing.text import Tokenizer
Load the raw IMDB movie review data, strip the HTML tags from each review, and construct the labels (1 = positive, 0 = negative) for the training and test sets.
In [2]:
path = "../../../class_data/aclImdb/"
ff = [path + "train/pos/" + x for x in os.listdir(path + "train/pos")] + \
     [path + "train/neg/" + x for x in os.listdir(path + "train/neg")] + \
     [path + "test/pos/" + x for x in os.listdir(path + "test/pos")] + \
     [path + "test/neg/" + x for x in os.listdir(path + "test/neg")]
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)
input_label = ([1] * 12500 + [0] * 12500) * 2
input_text = []
for f in ff:
    with open(f) as fin:
        input_text += [remove_tags(" ".join(fin.readlines()))]
Next, we construct a tokenizer object, initialized with the total number of terms we want to keep. I then fit it on the training data to find the most frequently used words.
In [3]:
num_words = 2000
tok = Tokenizer(num_words)
tok.fit_on_texts(input_text[:25000])
The tokenizer makes it oddly difficult to get the words themselves back out, but this will do it for us:
In [4]:
words = []
for i in range(num_words):
    words += [key for key, value in tok.word_index.items() if value == i + 1]
words[:10]
Out[4]:
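If (as in this version of Keras) tok.word_index maps each word to its 1-based frequency rank, a shorter equivalent is to sort the dictionary by rank. This is just a sketch of an alternative, not part of the original workflow:

# assumes word_index values are 1-based ranks, most frequent word first
words_alt = [w for w, rank in sorted(tok.word_index.items(), key=lambda kv: kv[1])[:num_words]]
words_alt[:10]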
We can now use the tokenizer to construct data matrices that look like the ones pre-supplied by keras.
In [5]:
X_train = tok.texts_to_sequences(input_text[:25000])
X_test = tok.texts_to_sequences(input_text[25000:])
y_train = input_label[:25000]
y_test = input_label[25000:]
X_train = sequence.pad_sequences(X_train, maxlen=100)
X_test = sequence.pad_sequences(X_test, maxlen=100)
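After padding, both matrices should have 25,000 rows (one per review) and 100 columns (token ids), with shorter reviews padded on the left with zeros. A quick sanity check:

print(X_train.shape)   # expected: (25000, 100)
print(X_test.shape)    # expected: (25000, 100)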
To reconstruct the text, which will have any words not in our vocabulary removed, we can use this function:
In [6]:
def reconstruct_text(index, words):
    text = []
    for ind in index:
        if ind != 0:
            text += [words[ind - 1]]
        else:
            text += [""]
    return text
print(input_text[100])
print("\n\n")
print(reconstruct_text(X_train[100][:40], words))
Notice that much of the original context is gone given our aggressive filtering, but the main tone (in this case, at least) remains. We would probably want to filter out a few things, like numbers, if we were being more careful.
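As a sketch of what that extra filtering could look like, here is one way to strip purely numeric tokens before fitting the tokenizer. NUM_RE and remove_numbers are illustrative helpers introduced here, not part of the workflow above:

NUM_RE = re.compile(r'\b\d+\b')

def remove_numbers(text):
    return NUM_RE.sub('', text)

cleaned_text = [remove_numbers(t) for t in input_text]
# tok.fit_on_texts(cleaned_text[:25000]) would then build a vocabulary without numbers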
In [8]:
model = Sequential()
model.add(Embedding(num_words, 32, input_length=100))
model.add(Dropout(0.25))
model.add(SimpleRNN(16, return_sequences=False))
model.add(Dense(256))
model.add(Dropout(0.25))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
I think it is incredibly important to make sure the shapes of the weights and bias make sense to you. If they do, you probably understand a large part of what is going on.
In [9]:
print(model.layers[2].get_weights()[0].shape) # W - input weights
print(model.layers[2].get_weights()[1].shape) # U - recurrent weights
print(model.layers[2].get_weights()[2].shape) # b - bias
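As a rough check of those shapes: the embedding is 32-dimensional and the SimpleRNN has 16 units, so (assuming the weight list is ordered W, U, b as in the comments above) we expect W to be (32, 16), U to be (16, 16), and b to be (16,). A minimal sketch:

W, U, b = model.layers[2].get_weights()
assert W.shape == (32, 16)   # input-to-hidden weights
assert U.shape == (16, 16)   # hidden-to-hidden (recurrent) weights
assert b.shape == (16,)      # bias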
Fitting the model works exactly the same as with CNNs or dense neural networks.
In [10]:
model.fit(X_train, y_train, batch_size=32, nb_epoch=10, verbose=1,
validation_data=(X_test, y_test))
Out[10]:
In [11]:
model = Sequential()
model.add(Embedding(num_words, 50))
model.add(Dropout(0.25))
model.add(LSTM(32))
model.add(Dense(256))
model.add(Dropout(0.25))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
The weights in the LSTM layer are quite a bit more complex, with four triples of W, U, and b, one each for the input gate, forget gate, cell update, and output gate. All four triples have the same dimensions, however.
In [12]:
print(model.layers[2].get_weights()[0].shape) # W_i input gate weights
print(model.layers[2].get_weights()[1].shape) # U_i
print(model.layers[2].get_weights()[2].shape) # b_i
print(model.layers[2].get_weights()[3].shape) # W_f forget weights
print(model.layers[2].get_weights()[4].shape) # U_f
print(model.layers[2].get_weights()[5].shape) # b_f
print(model.layers[2].get_weights()[6].shape) # W_c cell weights
print(model.layers[2].get_weights()[7].shape) # U_c
print(model.layers[2].get_weights()[8].shape) # b_c
print(model.layers[2].get_weights()[9].shape) # W_o output weights
print(model.layers[2].get_weights()[10].shape) # U_o
print(model.layers[2].get_weights()[11].shape) # b_o
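Under the assumption that each gate's W is (50, 32), U is (32, 32), and b is (32,) (a 50-dimensional embedding feeding 32 LSTM units), a quick sketch of the total parameter count in the LSTM layer:

n_params = sum(w.size for w in model.layers[2].get_weights())
print(n_params)   # 4 * (50*32 + 32*32 + 32) = 10624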
We'll train the model the same way as the SimpleRNN, but the computational time will be significantly higher. The algorithm needs to backpropagate the more complex mechanism inside the LSTM unit through the entire time series, so this is not too surprising.
In [13]:
model.fit(X_train, y_train, batch_size=1, nb_epoch=10, verbose=1,
validation_data=(X_test, y_test))
Out[13]:
In [14]:
model = Sequential()
model.add(Embedding(num_words, 32, input_length=100))
model.add(Dropout(0.25))
model.add(GRU(32,activation='relu'))
model.add(Dense(256))
model.add(Dropout(0.25))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
GRUs have one fewer set of (W, U, b) weights than LSTMs.
In [15]:
print(model.layers[2].get_weights()[0].shape) # W_z update weights
print(model.layers[2].get_weights()[1].shape) # U_z
print(model.layers[2].get_weights()[2].shape) # b_z
print(model.layers[2].get_weights()[3].shape) # W_r reset weights
print(model.layers[2].get_weights()[4].shape) # U_r
print(model.layers[2].get_weights()[5].shape) # b_r
print(model.layers[2].get_weights()[6].shape) # W_h output weights
print(model.layers[2].get_weights()[7].shape) # U_h
print(model.layers[2].get_weights()[8].shape) # b_h
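The same parameter-count sketch as before, now with three sets of (W, U, b) and a 32-dimensional embedding feeding 32 GRU units:

n_params = sum(w.size for w in model.layers[2].get_weights())
print(n_params)   # 3 * (32*32 + 32*32 + 32) = 6240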
In [16]:
model.fit(X_train, y_train, batch_size=32, nb_epoch=20, verbose=1,
validation_data=(X_test, y_test))
Out[16]:
In [17]:
model = Sequential()
model.add(Embedding(num_words, 32, input_length=100))
model.add(Dropout(0.25))
model.add(SimpleRNN(16, return_sequences=False))
model.add(Dense(256))
model.add(Dropout(0.25))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
In [18]:
model.fit(X_train, y_train, batch_size=32, nb_epoch=5, verbose=1,
validation_data=(X_test, y_test))
Out[18]:
Now that we've trained on the final output, we want to take the same weights as before, but have the SimpleRNN return the entire sequence. The output layer will then return a result of size 100 rather than size 1; entry k is the prediction of the algorithm after seeing just the first k terms. The last value will be the same as the single prediction from the original model.
To do this, as far as I can tell, one needs to create a new model from scratch and then load the weights from the old model. We have to wrap any layer with learnable weights above the SimpleRNN in the TimeDistributed wrapper, so that Keras knows to apply the same weights separately to each time step coming from the prior layer.
In [19]:
model2 = Sequential()
model2.add(Embedding(num_words, 32, input_length=100))
model2.add(Dropout(0.25))
model2.add(SimpleRNN(16, return_sequences=True))
model2.add(TimeDistributed(Dense(256)))
model2.add(Dropout(0.25))
model2.add(Activation('relu'))
model2.add(TimeDistributed(Dense(1)))
model2.add(Activation('sigmoid'))
model2.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
In [20]:
model2.set_weights(model.get_weights())
Notice that the dimensions of the weights are exactly the same; the input sizes are larger, but with weight sharing we can use the same weight matrices. This is akin to the OverFeat paper for CNNs, where a convolution is applied to a larger image and the output's dimensions simply increase.
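A minimal sketch that verifies this claim, comparing the two weight lists array by array:

for w_old, w_new in zip(model.get_weights(), model2.get_weights()):
    assert w_old.shape == w_new.shape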
Let's now predict the sequence of values for the entire training set.
In [21]:
y_hat2 = model2.predict(X_train)
y_hat2.shape
Out[21]:
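As a sanity check of the earlier claim, the last entry of each predicted sequence should match (up to floating point noise) the single prediction from the original model; a minimal sketch on the first five reviews:

print(model.predict(X_train[:5])[:, 0])   # final predictions from the original model
print(y_hat2[:5, -1, 0])                  # last time step of the sequence predictions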
In [22]:
ind = 100
tokens = reconstruct_text(X_train[ind], words)
print(input_text[ind])
plt.figure(figsize=(16, 10))
plt.plot(y_hat2[ind],alpha=0.5)
for i in range(len(tokens)):
plt.text(i,0.5,tokens[i],rotation=90)
In [23]:
ind = 22000
tokens = reconstruct_text(X_train[ind], words)
print(input_text[ind])
plt.figure(figsize=(16, 10))
plt.plot(y_hat2[ind],alpha=0.5)
for i in range(len(tokens)):
plt.text(i,0.5,tokens[i],rotation=90)
In [24]:
ind = 10000
tokens = reconstruct_text(X_train[ind], words)
print(input_text[ind])
plt.figure(figsize=(16, 10))
plt.plot(y_hat2[ind],alpha=0.5)
for i in range(len(tokens)):
plt.text(i,0.5,tokens[i],rotation=90)