In [1]:
KERAS_MODEL_FILEPATH = '../../demos/data/imdb_bidirectional_lstm/imdb_bidirectional_lstm.h5'

In [2]:
import numpy as np
np.random.seed(1337)  # for reproducibility

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Input, Bidirectional
from keras.datasets import imdb
from keras.callbacks import EarlyStopping, ModelCheckpoint

import json


Using TensorFlow backend.

In [3]:
max_features = 20000
maxlen = 200  # cut texts after this number of words (among top max_features most common words)

print('Loading data...')
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_features)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print("Pad sequences (samples x time)")
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
y_train = np.array(y_train)
y_test = np.array(y_test)


Loading data...
Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz
17465344/17464789 [==============================] - 9s 1us/step
25000 train sequences
25000 test sequences
Pad sequences (samples x time)
X_train shape: (25000, 200)
X_test shape: (25000, 200)
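
For reference, pad_sequences pads and truncates at the front by default, so shorter reviews are left-padded with zeros and longer ones keep only their last maxlen words; a toy illustration:

sequence.pad_sequences([[1, 2, 3]], maxlen=5)           # -> array([[0, 0, 1, 2, 3]])
sequence.pad_sequences([[1, 2, 3, 4, 5, 6]], maxlen=5)  # -> array([[2, 3, 4, 5, 6]])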

In [4]:
model = Sequential()
model.add(Embedding(max_features, 64, input_length=maxlen))
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
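
As a quick sanity check on model size, these layer shapes imply the parameter counts below (my arithmetic, assuming this exact architecture):

model.summary()
# expected trainable parameters:
#   Embedding:          20000 * 64                        = 1,280,000
#   Bidirectional LSTM: 2 * 4 * (64*32 + 32*32 + 32)      =    24,832
#   Dense:              64 + 1                            =        65
# total                                                   = 1,304,897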

In [5]:
# Model saving callback
checkpointer = ModelCheckpoint(filepath=KERAS_MODEL_FILEPATH, monitor='val_acc', verbose=1, save_best_only=True)

# Early stopping
early_stopping = EarlyStopping(monitor='val_acc', verbose=1, patience=2)

# train, checkpointing the best weights and stopping once val_acc stalls
batch_size = 128
epochs = 10
model.fit(X_train, y_train,
          validation_data=(X_test, y_test),
          batch_size=batch_size, epochs=epochs, verbose=2,
          callbacks=[checkpointer, early_stopping])


Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 00001: val_acc improved from -inf to 0.86916, saving model to ../../demos/data/imdb_bidirectional_lstm/imdb_bidirectional_lstm.h5
 - 77s - loss: 0.4602 - acc: 0.7830 - val_loss: 0.3180 - val_acc: 0.8692
Epoch 2/10
Epoch 00002: val_acc improved from 0.86916 to 0.87272, saving model to ../../demos/data/imdb_bidirectional_lstm/imdb_bidirectional_lstm.h5
 - 74s - loss: 0.2304 - acc: 0.9169 - val_loss: 0.3164 - val_acc: 0.8727
Epoch 3/10
Epoch 00003: val_acc did not improve
 - 74s - loss: 0.1490 - acc: 0.9520 - val_loss: 0.3412 - val_acc: 0.8637
Epoch 4/10
Epoch 00004: val_acc did not improve
 - 75s - loss: 0.1026 - acc: 0.9694 - val_loss: 0.4146 - val_acc: 0.8626
Epoch 00004: early stopping
Out[5]:
<keras.callbacks.History at 0x7f34a2714ef0>
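
ModelCheckpoint kept only the epoch-2 weights (val_acc 0.8727), so the in-memory model above has already drifted past the best point. A minimal sketch for reloading and re-scoring the saved file:

from keras.models import load_model

best_model = load_model(KERAS_MODEL_FILEPATH)
loss, acc = best_model.evaluate(X_test, y_test, batch_size=128, verbose=0)
print('checkpoint val_acc: %.4f' % acc)  # should come out near 0.8727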

Sample data


In [6]:
word_index = imdb.get_word_index()


Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json
1646592/1641221 [==============================] - 1s 0us/step
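
The index is a plain word -> rank dictionary, with rank 1 for the most frequent word (if memory serves, that's 'the'):

word_index['the']  # -> 1; ranks are 1-based, most frequent first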

In [7]:
word_dict = {idx: word for word, idx in word_index.items()}  # invert the mapping: index -> word

In [8]:
sample = []
for idx in X_train[0]:
    if idx >= 3:
        # word indices are offset by 3: 0 = padding, 1 = start of sequence, 2 = out-of-vocabulary
        sample.append(word_dict[idx - 3])
    elif idx == 2:
        sample.append('-')  # stand-in for out-of-vocabulary words
' '.join(sample)


Out[8]:
"i'll keep it short and brief the people who wrote the story lines for this show are genius the actors are just perfect for the roles they play - character is legendary and they have so much chemistry on screen which makes it what it is a very successful comedy br br when i saw first saw the new episodes which is probably going back just over 6 7 months i wondered what had happened to paul i was gutted to find out that he had died when i - google he was so funny and played his character to perfection an over protective dad who likes to keep his daughters out of the limelight and away from boys br br the comedy i think has gone from strength to strength even without paul in it br br plus i think most people would enjoy this watching it"

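To feed new text into the demo, it has to be encoded the same way imdb.load_data encodes it: 1-based frequency ranks shifted by index_from=3, with 1 as the start marker and 2 standing in for anything unknown or beyond the top 20,000 words. A hypothetical helper along those lines (encode_review is my own name, not part of any API):

def encode_review(text):
    # sketch: mirror imdb.load_data's defaults (start_char=1, oov_char=2, index_from=3)
    tokens = [1]
    for word in text.lower().split():
        idx = word_index.get(word)
        tokens.append(idx + 3 if idx is not None and idx + 3 < max_features else 2)
    return sequence.pad_sequences([tokens], maxlen=maxlen)

encode_review('this show is legendary')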
In [9]:
# export the word -> index mapping for the top 20,000 words (used by the demo to encode input text)
with open('../../demos/data/imdb_bidirectional_lstm/imdb_dataset_word_index_top20000.json', 'w') as f:
    f.write(json.dumps({word: idx for word, idx in word_index.items() if idx < max_features}))

In [10]:
# export the inverse index -> word mapping (note: json.dumps serializes the integer keys as strings)
with open('../../demos/data/imdb_bidirectional_lstm/imdb_dataset_word_dict_top20000.json', 'w') as f:
    f.write(json.dumps({idx: word for word, idx in word_index.items() if idx < max_features}))

In [11]:
# sample 1000 test examples (without replacement) and export them for the demo
sample_test_data = []
for i in np.random.choice(range(X_test.shape[0]), size=1000, replace=False):
    sample_test_data.append({'values': X_test[i].tolist(), 'label': y_test[i].tolist()})

with open('../../demos/data/imdb_bidirectional_lstm/imdb_dataset_test.json', 'w') as f:
    f.write(json.dumps(sample_test_data))
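
As a final sanity check, the exported sample can be read back and scored with the trained model; roughly:

with open('../../demos/data/imdb_bidirectional_lstm/imdb_dataset_test.json') as f:
    samples = json.load(f)

X_sample = np.array([s['values'] for s in samples])
y_sample = np.array([s['label'] for s in samples])
loss, acc = model.evaluate(X_sample, y_sample, batch_size=128, verbose=0)
print('accuracy on exported sample: %.4f' % acc)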
