In [21]:
from theano.sandbox import cuda
In [22]:
%matplotlib inline
import utils
from utils import *
In [23]:
model_path = 'data/imdb/models/'
%mkdir -p $model_path
We're going to look at the IMDB dataset, which contains movie reviews along with their sentiment labels. Keras comes with some helpers for this dataset.
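For example (an aside, not used in this notebook), Keras provides imdb.load_data, which returns reviews already converted to lists of word ids; in Keras 1 the vocabulary-size argument is nb_words (newer versions call it num_words).
In [ ]:
# Sketch of the built-in helper (not used below); the variable names are just for illustration.
from keras.datasets import imdb
(x_tr, y_tr), (x_te, y_te) = imdb.load_data(nb_words=5000)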
In [24]:
from keras.datasets import imdb
idx = imdb.get_word_index()
type(idx)
Out[24]:
In [25]:
# Let's look at the word list
"""
sorted(iterable, *, key=None, reverse=False):
built-in function; Return a new sorted list from the items in iterable.
"""
idx_list = sorted(idx, key=idx.get)
print(idx_list[:5])
from itertools import islice
def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))
print(take(5, idx.items()))
Create a mapping dict from id to word
In [26]:
idx2word = {v:k for k, v in idx.items()}
Get the reviews file
In [27]:
path = get_file('imdb_full.pkl',
                origin='https://s3.amazonaws.com/text-datasets/imdb_full.pkl',
                md5_hash='d091312047c43cf9e4e38fef92437263')
"""
get_file(fname, origin, ...):
    keras function; downloads a file from a URL if it is not already in the cache.
"""
f = open(path, 'rb')
(x_train, labels_train), (x_test, labels_test) = pickle.load(f)
In [28]:
print(type(x_train))
print(len(x_train))
# print the 1st review
', '.join(map(str, x_train[0]))
Out[28]:
In [29]:
# Let's map the idx to words
' '.join(idx2word[o] for o in x_train[0])
Out[29]:
The labels are 1 for positive, 0 for negative
In [30]:
labels_train[:10]
Out[30]:
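As a quick sanity check (an addition, not in the original notebook), we can count the labels; the IMDB training set should show roughly balanced classes.
In [ ]:
# labels_train is a plain Python list, which np.unique accepts directly
np.unique(labels_train, return_counts=True)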
Reduce the vocab size by mapping all rare words to the maximum index (vocab_size - 1)
In [31]:
vocab_size = 5000
trn = [np.array([i if i<vocab_size-1 else vocab_size-1 for i in s]) for s in x_train]
test = [np.array([i if i<vocab_size-1 else vocab_size-1 for i in s]) for s in x_test]
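After clipping we can verify (an added check, not in the original notebook) that no word id exceeds the reduced vocabulary.
In [ ]:
# every remaining word id should now be strictly below vocab_size
assert max(s.max() for s in trn) < vocab_size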
Let's look at the distribution of sentence lengths
In [32]:
lens = np.array(list(map(len, trn)))
(lens.max(), lens.min(), lens.mean())
Out[32]:
Pad or truncate each sentence so that every sequence has a consistent length of 500
In [33]:
seq_len = 500
"""
keras.preprocessing.sequence.pad_sequences(sequences, maxlen=None, dtype='int32',
padding='pre', truncating='pre', value=0.)
Transform a list of num_samples sequences (lists of scalars) into a 2D Numpy array of shape
(num_samples, num_timesteps). num_timesteps is either the maxlen argument if provided,
or the length of the longest sequence otherwise. Sequences that are shorter than
num_timesteps are padded with value at the end. Sequences longer than num_timesteps are
truncated so that it fits the desired length. Position where padding or truncation happens
is determined by padding or truncating, respectively.
"""
trn = sequence.pad_sequences(trn, maxlen=seq_len, value=0)
test = sequence.pad_sequences(test, maxlen=seq_len, value=0)
trn.shape
Out[33]:
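To make the default 'pre' behaviour concrete, here is a tiny illustration (an addition, not from the original notebook).
In [ ]:
# with the defaults padding='pre' and truncating='pre', short sequences are
# padded at the start and long sequences lose their earliest elements
sequence.pad_sequences([[1, 2, 3], [1, 2, 3, 4, 5, 6]], maxlen=5, value=0)
# -> array([[0, 0, 1, 2, 3],
#           [2, 3, 4, 5, 6]])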
The simplest model that tends to give reasonable results is a single hidden layer net. Note that we can't expect to get any useful results by feeding word ids directly into a neural net - so instead we use an embedding to replace each word id with a vector of 32 floating-point numbers.
Note here that the final sigmoid activation is equivalent to a softmax because our output is binary. Whenever we use 'binary_crossentropy' as the loss, we use 'sigmoid' as the activation.
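A quick numeric check of that claim (an addition, not in the original notebook): for a single logit z, sigmoid(z) equals the second entry of softmax([0, z]), so a 1-unit sigmoid output is equivalent to a 2-unit softmax.
In [ ]:
# sigmoid(z) and softmax([0, z])[1] give the same probability of the positive class
z = 1.7
print(1 / (1 + np.exp(-z)))                      # ~0.8455
print(np.exp([0, z])[1] / np.exp([0, z]).sum())  # ~0.8455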
In [34]:
model = Sequential([
    Embedding(vocab_size, 32, input_length=seq_len),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
model.summary()
In [35]:
model.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=2, batch_size=64)
Out[35]:
A CNN is likely to work better, since it's designed to take advantage of ordered data. We'll need to use a 1D CNN, since a sequence of words is 1D.
In [36]:
conv1 = Sequential([
    Embedding(vocab_size, 32, input_length=seq_len, dropout=0.2),
    Dropout(0.2),
    # look at 5 words at a time
    Convolution1D(64, 5, border_mode='same', activation='relu'),
    Dropout(0.2),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid')
])
conv1.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
conv1.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=4, batch_size=64)
Out[36]:
In [37]:
conv1.summary()
$10304 = 5 \times 32 \times 64 + 64$
Each of the 64 filters is a 5x32 matrix (5 words at a time, across the 32 embedding dimensions), plus one bias per filter.
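We can confirm the per-layer counts programmatically (an added check, not in the original notebook); count_params() is part of the Keras layer API.
In [ ]:
# print each layer's parameter count; the Convolution1D layer should report 10304
for layer in conv1.layers:
    print(layer.name, layer.count_params())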
In [38]:
conv1.save_weights(model_path + 'conv1.h5')
In [39]:
conv1.load_weights(model_path + 'conv1.h5')
You may want to look at wordvectors.ipynb before moving on. In this section, we replicate the previous CNN, but using pre-trained embeddings. You should always use pre-trained vectors
In [40]:
def get_glove_dataset(dataset):
    """Download the requested glove dataset from files.fast.ai
    and return a location that can be passed to load_vectors.
    """
    # see wordvectors.ipynb for info on how these files were
    # generated from the original glove data.
    md5sums = {'6B.50d': '8e1557d1228decbda7db6dfd81cd9909',
               '6B.100d': 'c92dbbeacde2b0384a43014885a60b2c',
               '6B.200d': 'af271b46c04b0b2e41a84d8cd806178d',
               '6B.300d': '30290210376887dcc6d0a5a6374d8255'}
    glove_path = os.path.abspath('data/glove/results')
    %mkdir -p $glove_path
    return get_file(dataset,
                    'http://files.fast.ai/models/glove/' + dataset + '.tgz',
                    cache_subdir=glove_path,
                    md5_hash=md5sums.get(dataset, None),
                    untar=True)
In [41]:
def load_vectors(loc):
    return (load_array(loc + '.dat'),
            pickle.load(open(loc + '_words.pkl', 'rb'), encoding='latin1'),
            pickle.load(open(loc + '_idx.pkl', 'rb'), encoding='latin1'))
In [42]:
vecs, words, wordidx = load_vectors(get_glove_dataset('6B.50d'))
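A quick look at what we loaded (an added cell, not in the original notebook): vecs holds one 50-d vector per GloVe word, words is the word list, and wordidx maps each word to its row in vecs.
In [ ]:
# basic shape/size check of the loaded GloVe data
print(vecs.shape, len(words), len(wordidx))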
The GloVe word ids and the IMDB word ids use different indexes, so we create a simple function that builds an embedding matrix using the indexes from IMDB and the embeddings from GloVe (where they exist).
In [43]:
def create_emb(vecs, vocab_size):
    n_fact = vecs.shape[1]
    emb = np.zeros((vocab_size, n_fact))
    for i in range(1, len(emb)):
        word = idx2word[i]
        # only copy the glove vector if the word looks like a normal token
        # and actually has a glove embedding
        if word and re.match(r"^[a-zA-Z0-9\-]*$", word) and word in wordidx:
            src_idx = wordidx[word]
            emb[i] = vecs[src_idx]
        else:
            # If we can't find the word in glove, randomly initialize
            emb[i] = normal(scale=0.6, size=(n_fact,))
    # This is our "rare word" id - we want to randomly initialize
    emb[-1] = normal(scale=0.6, size=(n_fact,))
    emb /= 3
    return emb
In [44]:
emb = create_emb(vecs, vocab_size)
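Before training, it can be useful to know how much of the vocabulary actually received a GloVe vector (an added check, not in the original notebook); the remaining words keep their random initialisation.
In [ ]:
# count vocab words that pass the regex above and exist in the GloVe index
covered = sum(1 for i in range(1, vocab_size)
              if idx2word[i] and re.match(r"^[a-zA-Z0-9\-]*$", idx2word[i])
              and idx2word[i] in wordidx)
print('%d of %d vocab ids have a GloVe vector' % (covered, vocab_size))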
We pass our embedding matrix to the Embedding constructor, and set it to non-trainable.
In [47]:
model = Sequential([
    Embedding(vocab_size, 50, input_length=seq_len, dropout=0.2,
              weights=[emb]),
    Dropout(0.25),
    Convolution1D(64, 5, border_mode='same', activation='relu'),
    Dropout(0.25),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid')])
# the Embedding layer is layers[0]; freeze it so the pre-trained weights are kept
model.layers[0].trainable = False
In [48]:
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
model.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=2, batch_size=64)
Out[48]:
We've already beaten our previous model! But let's fine-tune the embedding weights - especially since the words we couldn't find in GloVe just have random embeddings.
In [49]:
model.layers[0].trainable = True
In [51]:
model.optimizer.lr = 1e-4
model.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=4, batch_size=64)
Out[51]:
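One caveat (an aside, not in the original notebook): in Keras 1 the optimizer's learning rate is a backend variable, and re-binding the attribute after the training function has been compiled may not change the rate actually used. A more reliable approach, if the assignment above appears to have no effect, is to update the variable in place instead of assigning over it.
In [ ]:
# instead of `model.optimizer.lr = 1e-4`, update the existing learning-rate variable
import keras.backend as K
K.set_value(model.optimizer.lr, 1e-4)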
In [52]:
model.save_weights(model_path+'glove50.h5')
In [53]:
from keras.layers import Merge
How can we further improve?
Well, let's try using not just one size of convolution, but several convolution layers with different filter sizes.
We use the functional API to create multiple conv layers of different sizes, and then concatenate their outputs.
In [55]:
# the sub-model's input is the output of the Embedding layer: (seq_len, 50)
graph_in = Input((seq_len, 50))
convs = []
for fsz in range(3, 6):
    x = Convolution1D(64, fsz, border_mode='same', activation='relu')(graph_in)
    x = MaxPooling1D()(x)
    x = Flatten()(x)
    convs.append(x)
out = Merge(mode='concat')(convs)
graph = Model(graph_in, out)
In [57]:
emb = create_emb(vecs, vocab_size)
We then replace the conv/max-pool layer in our original CNN with the concatenated conv layers
In [61]:
model = Sequential([
    Embedding(vocab_size, 50, input_length=seq_len, dropout=0.2, weights=[emb]),
    Dropout(0.2),
    graph,
    Dropout(0.5),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid')
])
# freeze the Embedding layer (layers[0]) to start with, as before
model.layers[0].trainable = False
In [62]:
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
model.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=4, batch_size=64)
Out[62]:
In [65]:
model.layers[0].trainable = True
model.optimizer.lr = 1e-5
model.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=4, batch_size=64)
Out[65]: