Text Classification Using Bidirectional LSTMs


In [1]:
import pandas as pd
import numpy as np
import os

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

Getting the data

We load the IMDB movie-review dataset through Keras' built-in loader, keeping only the max_features = 20,000 most frequent words. Each review arrives pre-encoded as a sequence of word indices, and the labels are binary (positive/negative sentiment).


In [3]:
from keras.datasets import imdb

max_features = 20000
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')


Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz
17465344/17464789 [==============================] - 21s 1us/step
25000 train sequences
25000 test sequences
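
A quick, optional sanity check is to decode one of the encoded reviews back into words. The sketch below assumes the default load_data offset (index_from=3), which reserves indices 0, 1 and 2 for padding, start-of-sequence and unknown tokens.

In [ ]:
# Decode the first training review back to words (illustrative check only).
word_index = imdb.get_word_index()  # word -> frequency rank (1 = most frequent)
index_to_word = {i + 3: w for w, i in word_index.items()}  # undo the index_from=3 shift
index_to_word.update({0: '<pad>', 1: '<start>', 2: '<unk>'})
print(' '.join(index_to_word.get(i, '<unk>') for i in x_train[0][:30]))
print('label:', y_train[0])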

Preprocessing and Padding

Reviews vary in length, so we truncate or pad every sequence to a fixed length of maxlen = 80 tokens; pad_sequences pads (and truncates) at the front by default, using 0 as the padding value.


In [5]:
# cut texts after this number of words (among top max_features most common words)
from keras.preprocessing import sequence

maxlen = 80 
batch_size = 32
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)


Pad sequences (samples x time)
x_train shape: (25000, 80)
x_test shape: (25000, 80)
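
Since pad_sequences above has already truncated x_train in place, checking how aggressive maxlen = 80 is requires reloading the raw sequences. The sketch below plots the raw length distribution; it is purely diagnostic and not needed for training.

In [ ]:
# Optional: distribution of raw review lengths, to gauge how much maxlen=80 cuts off.
(raw_train, _), _ = imdb.load_data(num_words=max_features)
lengths = [len(seq) for seq in raw_train]
plt.hist(lengths, bins=50)
plt.xlabel('review length (tokens)')
plt.ylabel('number of reviews')
plt.title('IMDB review lengths before padding')
plt.show()
print('median length:', int(np.median(lengths)))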

Model Construction

The model maps each word index to a 150-dimensional embedding, runs a bidirectional LSTM with 100 units per direction over the sequence, and feeds the concatenated final states (200 values) into a single sigmoid unit for binary classification.


In [22]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, Bidirectional, LSTM

In [27]:
embedding_size = 150
n_lstm_units = 100

model = Sequential()
# mask_zero=True makes downstream layers ignore the zero padding added by pad_sequences
model.add(Embedding(max_features+1, embedding_size, mask_zero=True))
# forward and backward LSTMs; their final states are concatenated (2 * 100 = 200 units)
model.add(
    Bidirectional(LSTM(n_lstm_units, dropout=0.2, recurrent_dropout=0.2))
)
# single sigmoid unit for binary sentiment prediction
model.add(Dense(1, activation='sigmoid'))
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_10 (Embedding)     (None, None, 150)         3000150   
_________________________________________________________________
bidirectional_6 (Bidirection (None, 200)               200800    
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 201       
=================================================================
Total params: 3,201,151
Trainable params: 3,201,151
Non-trainable params: 0
_________________________________________________________________
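
As a sanity check on the summary, the bidirectional layer's parameter count can be reproduced by hand: each LSTM direction has four gates, and each gate holds (input_dim + units) * units weights plus units biases.

In [ ]:
# 2 directions * 4 gates * ((150 + 100) * 100 weights + 100 biases) = 200,800
per_direction = 4 * ((embedding_size + n_lstm_units) * n_lstm_units + n_lstm_units)
print('bidirectional params:', 2 * per_direction)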

Training the Model

We compile with binary cross-entropy loss and the Adam optimizer, then train for 4 epochs, holding out 20% of the training data for validation.


In [28]:
# try using different optimizers and different optimizer configs

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

history = model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=4,
          validation_split=0.2)


Train on 20000 samples, validate on 5000 samples
Epoch 1/4
20000/20000 [==============================] - 300s 15ms/step - loss: 0.4501 - acc: 0.7883 - val_loss: 0.3640 - val_acc: 0.8366
Epoch 2/4
20000/20000 [==============================] - 276s 14ms/step - loss: 0.2613 - acc: 0.8955 - val_loss: 0.3749 - val_acc: 0.8438
Epoch 3/4
20000/20000 [==============================] - 296s 15ms/step - loss: 0.1645 - acc: 0.9382 - val_loss: 0.4213 - val_acc: 0.8270
Epoch 4/4
20000/20000 [==============================] - 287s 14ms/step - loss: 0.0901 - acc: 0.9683 - val_loss: 0.5708 - val_acc: 0.8198
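
The fit call captures per-epoch metrics in history but never inspects them. A minimal follow-up, assuming the same Keras version as above (which logs accuracy under the key 'acc'), evaluates the model on the held-out test set and plots the training curves; the widening gap between training and validation accuracy in the log suggests overfitting after the first epoch or two.

In [ ]:
# Evaluate on the test set and plot the curves captured in `history`.
score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test loss:', score)
print('Test accuracy:', acc)

plt.plot(history.history['acc'], label='train')
plt.plot(history.history['val_acc'], label='validation')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()
plt.show()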
