In [0]:
# Based on
# https://github.com/fchollet/deep-learning-with-python-notebooks/blob/master/6.2-understanding-recurrent-neural-networks.ipynb

In [0]:
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
%pylab inline
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from tensorflow import keras


Populating the interactive namespace from numpy and matplotlib

In [0]:
# https://keras.io/datasets/#imdb-movie-reviews-sentiment-classification
max_features = 10000  # number of words to consider as features
maxlen = 500  # cut texts after this number of words (among top max_features most common words)

# each review is encoded as a sequence of word indexes
# indexed by overall frequency in the dataset
# output is 0 (negative) or 1 (positive) 
imdb = tf.keras.datasets.imdb.load_data(num_words=max_features)
(raw_input_train, y_train), (raw_input_test, y_test) = imdb

# https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/sequence/pad_sequences

input_train = tf.keras.preprocessing.sequence.pad_sequences(raw_input_train, maxlen=maxlen)
input_test = tf.keras.preprocessing.sequence.pad_sequences(raw_input_test, maxlen=maxlen)

GRU RNNs


In [0]:
# Batch Normalization:
# https://towardsdatascience.com/batch-normalization-in-neural-networks-1ac91516821c
# https://www.quora.com/Why-does-batch-normalization-help

from tensorflow.keras.layers import GRU, Embedding, Bidirectional, BatchNormalization, Dropout

embedding_dim = 32

dropout = 0.6
recurrent_dropout = 0.4

model = keras.Sequential()

# encoder
model.add(Embedding(input_dim=max_features, output_dim=embedding_dim, input_length=maxlen))

# https://arxiv.org/ftp/arxiv/papers/1701/1701.05923.pdf
# n = output dimension
# m = input dimension
# Total number of parameters for 
# RNN = n**2 + nm + n
# GRU = 3 × (n**2 + nm + n)
# LSTM = 4 × (n**2 + nm + n)

# return_sequences passes all outputs of all timesteps (not only the last one) to the next layer
model.add(GRU(name='gru1', units=32, dropout=dropout, recurrent_dropout=recurrent_dropout, return_sequences=True))

# for embedding: 32*2 (“standard deviation” parameter (gamma), “mean” parameter (beta)) trainable parameters
# and 32*2 (moving_mean and moving_variance) non-trainable parameters
model.add(BatchNormalization())  
model.add(Dropout(dropout))

# stack recurrent layers like with fc
model.add(GRU(name='gru2', units=32))
model.add(BatchNormalization())
model.add(Dropout(dropout))

# latent space
model.add(tf.keras.layers.Dense(name='fc', units=32, activation='relu'))

# binary classifier as decoder
model.add(tf.keras.layers.Dense(name='classifier', units=1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding (Embedding)        (None, 500, 32)           320000    
_________________________________________________________________
gru1 (GRU)                   (None, 500, 32)           6240      
_________________________________________________________________
batch_normalization (BatchNo (None, 500, 32)           128       
_________________________________________________________________
dropout (Dropout)            (None, 500, 32)           0         
_________________________________________________________________
gru2 (GRU)                   (None, 32)                6240      
_________________________________________________________________
batch_normalization_1 (Batch (None, 32)                128       
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
fc (Dense)                   (None, 32)                1056      
_________________________________________________________________
classifier (Dense)           (None, 1)                 33        
=================================================================
Total params: 333,825
Trainable params: 333,697
Non-trainable params: 128
_________________________________________________________________

In [0]:
batch_size = 1000

%time history = model.fit(input_train, y_train, epochs=15, batch_size=batch_size, validation_split=0.2)


Train on 20000 samples, validate on 5000 samples
Epoch 1/10
20000/20000 [==============================] - 57s 3ms/step - loss: 0.4443 - acc: 0.8048 - val_loss: 0.5092 - val_acc: 0.7696
Epoch 2/10
20000/20000 [==============================] - 58s 3ms/step - loss: 0.3962 - acc: 0.8308 - val_loss: 0.4648 - val_acc: 0.7920
Epoch 3/10
20000/20000 [==============================] - 58s 3ms/step - loss: 0.3513 - acc: 0.8560 - val_loss: 0.3721 - val_acc: 0.8462
Epoch 4/10
20000/20000 [==============================] - 58s 3ms/step - loss: 0.3166 - acc: 0.8725 - val_loss: 0.3817 - val_acc: 0.8326
Epoch 5/10
20000/20000 [==============================] - 58s 3ms/step - loss: 0.2923 - acc: 0.8846 - val_loss: 0.3642 - val_acc: 0.8456
Epoch 6/10
20000/20000 [==============================] - 58s 3ms/step - loss: 0.2558 - acc: 0.9008 - val_loss: 0.4262 - val_acc: 0.8128
Epoch 7/10
20000/20000 [==============================] - 57s 3ms/step - loss: 0.2364 - acc: 0.9083 - val_loss: 0.3648 - val_acc: 0.8496
Epoch 8/10
20000/20000 [==============================] - 58s 3ms/step - loss: 0.2287 - acc: 0.9134 - val_loss: 0.4112 - val_acc: 0.8312
Epoch 9/10
20000/20000 [==============================] - 57s 3ms/step - loss: 0.2107 - acc: 0.9214 - val_loss: 0.4497 - val_acc: 0.8238
Epoch 10/10
20000/20000 [==============================] - 57s 3ms/step - loss: 0.1925 - acc: 0.9274 - val_loss: 0.5470 - val_acc: 0.8026
CPU times: user 12min 19s, sys: 1min 34s, total: 13min 54s
Wall time: 9min 34s

In [0]:
train_loss, train_accuracy = model.evaluate(input_train, y_train, batch_size=batch_size)
train_accuracy


25000/25000 [==============================] - 25s 1ms/step
Out[0]:
0.7904000020027161

In [0]:
test_loss, test_accuracy = model.evaluate(input_test, y_test, batch_size=batch_size)
test_accuracy


25000/25000 [==============================] - 26s 1ms/step
Out[0]:
0.7442799997329712

In [0]:
def plot_history(history, samples=10, init_phase_samples=None):
    epochs = history.params['epochs']
    
    acc = history.history['acc']
    val_acc = history.history['val_acc']

    every_sample =  int(epochs / samples)
    acc = pd.DataFrame(acc).iloc[::every_sample, :]
    val_acc = pd.DataFrame(val_acc).iloc[::every_sample, :]

    fig, ax = plt.subplots(figsize=(20,5))

    ax.plot(acc, 'bo', label='Training acc')
    ax.plot(val_acc, 'b', label='Validation acc')
    ax.set_title('Training and validation accuracy')
    ax.legend()

plot_history(history)



In [0]:
# precition
model.predict(input_test[0:5])


Out[0]:
array([[0.5241935],
       [0.5922162],
       [0.5541962],
       [0.4392001],
       [0.6804852]], dtype=float32)

In [0]:
# ground truth
y_test[0:5]


Out[0]:
array([0, 1, 1, 0, 1])

How does this work on anything that is not a real movie review?


In [0]:
word_to_id = keras.datasets.imdb.get_word_index()

def encode_text(text):
    input_words = text.lower().split()
    input_tokens = np.array([word_to_id[word] for word in input_words])
    padded_input_tokens = keras.preprocessing.sequence.pad_sequences([input_tokens], maxlen=maxlen)
    return padded_input_tokens

def predict_text(model, text):
    input_sequence = encode_text(text)
    embeddings = model.predict(input_sequence)
    return embeddings

In [0]:
predict_text(model, "don't watch this movie")


Out[0]:
array([[0.5275438]], dtype=float32)

In [0]:
predict_text(model, "lovely")


Out[0]:
array([[0.68455935]], dtype=float32)

In [0]:
predict_text(model, "pathetic shit")


Out[0]:
array([[0.7576071]], dtype=float32)

In [0]:
predict_text(model, "this is not a shit movie")


Out[0]:
array([[0.5005187]], dtype=float32)

In [0]:
predict_text(model, "such a bad movie")


Out[0]:
array([[0.32621145]], dtype=float32)

In [0]: