Train a simple LSTM


In [1]:
import os
import re
import csv
import codecs
import gc
import sys
import pickle
from time import time
from itertools import chain
from collections import Counter
from string import punctuation

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from gensim.models import KeyedVectors, word2vec

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU
from keras.layers import Concatenate, concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


Using TensorFlow backend.

In [15]:
test_df = pd.read_csv('test_snello_pulito.csv', encoding='latin1')
train_df = pd.read_csv('train_snello_pulito.csv', encoding='latin1')
train_df.fillna("ciccia", inplace=True)  # replace missing questions with a placeholder token
q1, q2 = 'question1_final', 'question2_final'

In [2]:
MAX_SEQUENCE_LENGTH = 30
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 135
VALIDATION_SPLIT = 0.13

In [ ]:
def build_corpus(data, q1, q2):
    "Creates a list of lists containing words from each sentence"
    corpus = []
    for col in [q1, q2]:
        for sentence in data[col].iteritems():
            word_list = sentence[1].split(" ")
            corpus.append(word_list)

    return corpus


corpus = build_corpus(train_df, q1, q2)
corpus.extend(build_corpus(test_df, q1, q2))

In [24]:
model = word2vec.Word2Vec(corpus, size=EMBEDDING_DIM, window=5, min_count=5, workers=4)

In [25]:
model.save('model.bin')

In [13]:
BASE_DIR = ''
EMBEDDING_FILE = BASE_DIR + 'model_best.bin'

In [14]:
w2v = KeyedVectors.load(EMBEDDING_FILE)  # avoids shadowing the gensim word2vec module imported above

In [ ]:
texts_1 = list(train_df.question1_final.values)
texts_2 = list(train_df.question2_final.values)
test_texts_1 = list(test_df.question1_final.values)
test_texts_2 = list(test_df.question2_final.values)

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts_1 + texts_2 + test_texts_1 + test_texts_2)

sequences_1 = tokenizer.texts_to_sequences(texts_1)
sequences_2 = tokenizer.texts_to_sequences(texts_2)
test_sequences_1 = tokenizer.texts_to_sequences(test_texts_1)
test_sequences_2 = tokenizer.texts_to_sequences(test_texts_2)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
data_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(train_df.is_duplicate.values)
print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels.shape)

test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
test_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH)

In [22]:
nb_words = min(MAX_NB_WORDS, len(word_index))+1

embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if word in w2v.wv.vocab:
        embedding_matrix[i] = w2v.wv[word]

In [32]:
np.savetxt('labels.gz', labels)
np.savetxt('data_1.gz', data_1)
np.savetxt('data_2.gz', data_2)
np.savetxt('embedding_matrix.gz', embedding_matrix)

In [3]:
labels = np.loadtxt('labels.gz')
data_1 = np.loadtxt('data_1.gz')
data_2 = np.loadtxt('data_2.gz')
embedding_matrix = np.loadtxt('embedding_matrix.gz')

In [10]:
np.random.seed(1234)
perm = np.random.permutation(len(data_1))
idx_train = perm[:int(len(data_1)*(1-VALIDATION_SPLIT))]
idx_val = perm[int(len(data_1)*(1-VALIDATION_SPLIT)):]

In [11]:
data_1_train = np.vstack((data_1[idx_train], data_2[idx_train]))
data_2_train = np.vstack((data_2[idx_train], data_1[idx_train]))
labels_train = np.concatenate((labels[idx_train], labels[idx_train]))

data_1_val = np.vstack((data_1[idx_val], data_2[idx_val]))
data_2_val = np.vstack((data_2[idx_val], data_1[idx_val]))
labels_val = np.concatenate((labels[idx_val], labels[idx_val]))

In [12]:
# Per-sample weights for the validation loss (same values as the class_weight dict used below)
weight_val = np.ones(len(labels_val))
weight_val *= 0.472001959                   # weight for positive (duplicate) pairs
weight_val[labels_val == 0] = 1.309028344   # weight for negative pairs

The GRU is related to the LSTM: both use gating to control the flow of information and to mitigate the vanishing gradient problem.

A few points on GRU vs. LSTM (a small sketch comparing the two follows this list):

  • The GRU unit controls the flow of information like the LSTM unit, but without a separate memory cell; it simply exposes its full hidden state at every step.
  • The GRU is newer and, in my experience, performs on par with the LSTM while being computationally cheaper (its structure is simpler, as noted above), so it is being used more and more.
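
As a quick illustration of that last point (not part of the original pipeline), here is a minimal sketch comparing the parameter counts of an LSTM and a GRU with the same number of units; the sizes mirror the model in this notebook (210 units, 135-dimensional embeddings, sequence length 30):

from keras.models import Sequential
from keras.layers import LSTM, GRU

units, emb_dim, seq_len = 210, 135, 30  # same sizes as the model below

lstm_net = Sequential([LSTM(units, input_shape=(seq_len, emb_dim))])
gru_net = Sequential([GRU(units, input_shape=(seq_len, emb_dim))])

# The LSTM has four gate weight sets, the GRU only three, so the GRU ends up
# with roughly three quarters of the LSTM's parameters for the same unit count.
print('LSTM parameters:', lstm_net.count_params())
print('GRU parameters: ', gru_net.count_params())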

In [ ]:
# Hyperparameters (not defined in the original cell; the values below match the
# tuned configuration used later in this notebook)
num_lstm = 210
num_dense = 135
rate_drop_lstm = 0.3
rate_drop_dense = 0.3
act = 'relu'
STAMP = 'lstm_%d_%d_%.2f_%.2f' % (num_lstm, num_dense, rate_drop_lstm, rate_drop_dense)

embedding_layer = Embedding(nb_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm, return_sequences=True)
gru_layer = GRU(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)  # defined but not used in this model

sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)
x2 = LSTM(num_lstm//2, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)(x1)

sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)
y2 = LSTM(num_lstm//2, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)(y1)

merged = concatenate([x2, y2])
merged = Dropout(rate_drop_dense)(merged)
merged = Activation('sigmoid')(merged)
merged = BatchNormalization()(merged)

merged = Dense(num_dense, activation=act)(merged)
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)


preds = Dense(1, activation='sigmoid')(merged)

class_weight = {0: 1.309028344, 1: 0.472001959}

model = Model(inputs=[sequence_1_input, sequence_2_input], outputs=preds)
model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])
print(model.summary())
print(STAMP)

early_stopping = EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(
    bst_model_path, save_best_only=True, save_weights_only=True)

In [41]:
hist = model.fit(
    [data_1_train, data_2_train],
    labels_train,
    validation_data=([data_1_val, data_2_val], labels_val, weight_val),
    epochs=10,
    batch_size=4092,
    shuffle=True,
    class_weight=class_weight,
    callbacks=[early_stopping, model_checkpoint])

model.load_weights(bst_model_path)
bst_val_score = min(hist.history['val_loss'])


Train on 646864 samples, validate on 161716 samples
Epoch 1/10
646864/646864 [==============================] - 237s - loss: 0.4576 - acc: 0.6584 - val_loss: 0.4331 - val_acc: 0.6382
Epoch 2/10
646864/646864 [==============================] - 235s - loss: 0.3939 - acc: 0.6755 - val_loss: 0.3930 - val_acc: 0.6499
Epoch 3/10
646864/646864 [==============================] - 235s - loss: 0.3805 - acc: 0.6855 - val_loss: 0.5614 - val_acc: 0.6304
Epoch 4/10
646864/646864 [==============================] - 235s - loss: 0.3733 - acc: 0.6908 - val_loss: 0.3663 - val_acc: 0.6906
Epoch 5/10
646864/646864 [==============================] - 235s - loss: 0.3638 - acc: 0.7006 - val_loss: 0.3632 - val_acc: 0.6913
Epoch 6/10
646864/646864 [==============================] - 235s - loss: 0.3595 - acc: 0.7049 - val_loss: 0.4135 - val_acc: 0.7355
Epoch 7/10
646864/646864 [==============================] - 235s - loss: 0.3540 - acc: 0.7109 - val_loss: 0.3456 - val_acc: 0.7189
Epoch 8/10
646864/646864 [==============================] - 235s - loss: 0.3486 - acc: 0.7160 - val_loss: 0.3519 - val_acc: 0.6946
Epoch 9/10
646864/646864 [==============================] - 235s - loss: 0.3444 - acc: 0.7206 - val_loss: 0.3358 - val_acc: 0.7338
Epoch 10/10
646864/646864 [==============================] - 235s - loss: 0.3400 - acc: 0.7250 - val_loss: 0.3419 - val_acc: 0.7371

Load the final weights of the tuned LSTM


In [ ]:
num_lstm = 210          # tuned from int(np.random.randint(175, 230))
rate_drop_lstm = 0.3    # tuned from 0.15 + np.random.rand() * 0.25
num_dense = 135         # tuned from np.random.randint(100, 150)
class_weight = {0: 1, 1: 0.282}  # alternative: {0: 1.309028344, 1: 0.472001959}
first_lstm = {'num_lstm': num_lstm, 'rate_drop_lstm': rate_drop_lstm}
activation = 'relu'
drop_rate = 0.3
dropout = 0.3
recurrent_dropout = 0.3
rate_drop_lstm = first_lstm['rate_drop_lstm']


embedding_layer = Embedding(nb_words,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)
lstm_layer = LSTM(first_lstm['num_lstm'], dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm,
                  return_sequences=True)

sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
    
x = lstm_layer(embedded_sequences_1)
y = lstm_layer(embedded_sequences_2)
x = LSTM(num_lstm//2, dropout=dropout, recurrent_dropout=recurrent_dropout)(x)
y = LSTM(num_lstm//2, dropout=dropout, recurrent_dropout=recurrent_dropout)(y)

x = Dropout(drop_rate)(x)
y = Dropout(drop_rate)(y)
merged = concatenate([x, y])
merged = Dropout(drop_rate)(merged)
merged = BatchNormalization()(merged)


merged = Dense(num_dense, activation=activation)(merged)
merged = BatchNormalization()(merged)
merged = Dropout(drop_rate)(merged)        
        
merged = Dense(num_dense, activation=activation)(merged)
merged = Dropout(drop_rate)(merged)
merged = BatchNormalization()(merged)    
preds = Dense(1, activation='sigmoid')(merged)
    
model = Model(inputs=[sequence_1_input, sequence_2_input], outputs=preds)
print(model.summary())

In [13]:
# load weights
model.load_weights("lstm_dense_dropout_rmsprop_True_relu_2048_0.3_210_0.3.h5")
# Compile the model (required before training or evaluating)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print("Created model and loaded weights from file")


Created model and loaded weights from file

In [53]:
from keras.models import load_model
model = load_model('./lstm_dense_dropout_rmsprop_True_relu_2048_0.3_210_0.3.h5')

In [ ]:
test_ids = test_df['test_id'].values

In [ ]:
# Average predictions over both question orderings, since the network is not symmetric
preds = model.predict([test_data_1, test_data_2], batch_size=8192, verbose=1)
preds += model.predict([test_data_2, test_data_1], batch_size=8192, verbose=1)
preds /= 2

In [ ]:
submission = pd.DataFrame({'test_id':test_ids, 'is_duplicate':preds.ravel()})
submission.to_csv('submissionNN.csv', index=False)
