Quora question pairs: training

Import packages


In [1]:
%matplotlib inline
from __future__ import print_function
import numpy as np
import pandas as pd
import datetime, time, json
from keras.models import Model
from keras.layers import Input, TimeDistributed, Dense, Lambda, concatenate, Dropout, BatchNormalization
from keras.layers.embeddings import Embedding
from keras.regularizers import l2
from keras.callbacks import Callback, ModelCheckpoint
from keras import backend as K
from sklearn.model_selection import train_test_split


Using TensorFlow backend.

Initialize global variables


In [2]:
Q1_TRAINING_DATA_FILE = 'q1_train.npy'
Q2_TRAINING_DATA_FILE = 'q2_train.npy'
LABEL_TRAINING_DATA_FILE = 'label_train.npy'
WORD_EMBEDDING_MATRIX_FILE = 'word_embedding_matrix.npy'
NB_WORDS_DATA_FILE = 'nb_words.json'
MODEL_WEIGHTS_FILE = 'question_pairs_weights.h5'
MAX_SEQUENCE_LENGTH = 25
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.1
TEST_SPLIT = 0.1
RNG_SEED = 13371447
NB_EPOCHS = 25
DROPOUT = 0.1
BATCH_SIZE = 32

Load the dataset, embedding matrix, and word count


In [3]:
q1_data = np.load(Q1_TRAINING_DATA_FILE)
q2_data = np.load(Q2_TRAINING_DATA_FILE)
labels = np.load(LABEL_TRAINING_DATA_FILE)
word_embedding_matrix = np.load(WORD_EMBEDDING_MATRIX_FILE)
with open(NB_WORDS_DATA_FILE, 'r') as f:
    nb_words = json.load(f)['nb_words']
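
As a quick sanity check (a sketch; the expected values are inferred from the training log and parameter counts below, assuming the companion preprocessing notebook produced these files):

print(q1_data.shape, q2_data.shape, labels.shape)  # expected: (404290, 25) (404290, 25) (404290,)
print(word_embedding_matrix.shape, nb_words)       # expected: (95597, 300) 95596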

Partition the dataset into train and test sets


In [4]:
X = np.stack((q1_data, q2_data), axis=1)
y = labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SPLIT, random_state=RNG_SEED)
Q1_train = X_train[:,0]
Q2_train = X_train[:,1]
Q1_test = X_test[:,0]
Q2_test = X_test[:,1]
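
Stacking the two question matrices before splitting keeps each pair and its label aligned through train_test_split. A quick shape check (a sketch; the counts follow from a 10% test split over 404,290 pairs):

print(Q1_train.shape, Q2_train.shape, y_train.shape)  # expected: (363861, 25) (363861, 25) (363861,)
print(Q1_test.shape, y_test.shape)                    # expected: (40429, 25) (40429,)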

Define the model


In [5]:
question1 = Input(shape=(MAX_SEQUENCE_LENGTH,))
question2 = Input(shape=(MAX_SEQUENCE_LENGTH,))

q1 = Embedding(nb_words + 1,
               EMBEDDING_DIM,
               weights=[word_embedding_matrix],
               input_length=MAX_SEQUENCE_LENGTH,
               trainable=False)(question1)
q1 = TimeDistributed(Dense(EMBEDDING_DIM, activation='relu'))(q1)
q1 = Lambda(lambda x: K.max(x, axis=1), output_shape=(EMBEDDING_DIM,))(q1)

q2 = Embedding(nb_words + 1,
               EMBEDDING_DIM,
               weights=[word_embedding_matrix],
               input_length=MAX_SEQUENCE_LENGTH,
               trainable=False)(question2)
q2 = TimeDistributed(Dense(EMBEDDING_DIM, activation='relu'))(q2)
q2 = Lambda(lambda x: K.max(x, axis=1), output_shape=(EMBEDDING_DIM,))(q2)

merged = concatenate([q1, q2])
merged = Dense(200, activation='relu')(merged)
merged = Dropout(DROPOUT)(merged)
merged = BatchNormalization()(merged)
merged = Dense(200, activation='relu')(merged)
merged = Dropout(DROPOUT)(merged)
merged = BatchNormalization()(merged)
merged = Dense(200, activation='relu')(merged)
merged = Dropout(DROPOUT)(merged)
merged = BatchNormalization()(merged)
merged = Dense(200, activation='relu')(merged)
merged = Dropout(DROPOUT)(merged)
merged = BatchNormalization()(merged)

is_duplicate = Dense(1, activation='sigmoid')(merged)

model = Model(inputs=[question1, question2], outputs=is_duplicate)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
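
Each branch runs a frozen pretrained embedding, a per-timestep Dense via TimeDistributed, and then max-over-time pooling in the Lambda layer, which keeps the strongest activation of each of the 300 features across the 25 timesteps. A minimal NumPy sketch of the same reduction (illustrative only, with random data standing in for the TimeDistributed output):

import numpy as np
x = np.random.rand(2, 25, 300)  # (batch, timesteps, features)
pooled = x.max(axis=1)          # max over the time axis, mirroring K.max(x, axis=1)
print(pooled.shape)             # (2, 300)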

In [6]:
model.summary()


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
====================================================================================================
input_1 (InputLayer)             (None, 25)            0                                            
____________________________________________________________________________________________________
input_2 (InputLayer)             (None, 25)            0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 25, 300)       28679100    input_1[0][0]                    
____________________________________________________________________________________________________
embedding_2 (Embedding)          (None, 25, 300)       28679100    input_2[0][0]                    
____________________________________________________________________________________________________
time_distributed_1 (TimeDistribu (None, 25, 300)       90300       embedding_1[0][0]                
____________________________________________________________________________________________________
time_distributed_2 (TimeDistribu (None, 25, 300)       90300       embedding_2[0][0]                
____________________________________________________________________________________________________
lambda_1 (Lambda)                (None, 300)           0           time_distributed_1[0][0]         
____________________________________________________________________________________________________
lambda_2 (Lambda)                (None, 300)           0           time_distributed_2[0][0]         
____________________________________________________________________________________________________
concatenate_1 (Concatenate)      (None, 600)           0           lambda_1[0][0]                   
                                                                   lambda_2[0][0]                   
____________________________________________________________________________________________________
dense_3 (Dense)                  (None, 200)           120200      concatenate_1[0][0]              
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 200)           0           dense_3[0][0]                    
____________________________________________________________________________________________________
batch_normalization_1 (BatchNorm (None, 200)           800         dropout_1[0][0]                  
____________________________________________________________________________________________________
dense_4 (Dense)                  (None, 200)           40200       batch_normalization_1[0][0]      
____________________________________________________________________________________________________
dropout_2 (Dropout)              (None, 200)           0           dense_4[0][0]                    
____________________________________________________________________________________________________
batch_normalization_2 (BatchNorm (None, 200)           800         dropout_2[0][0]                  
____________________________________________________________________________________________________
dense_5 (Dense)                  (None, 200)           40200       batch_normalization_2[0][0]      
____________________________________________________________________________________________________
dropout_3 (Dropout)              (None, 200)           0           dense_5[0][0]                    
____________________________________________________________________________________________________
batch_normalization_3 (BatchNorm (None, 200)           800         dropout_3[0][0]                  
____________________________________________________________________________________________________
dense_6 (Dense)                  (None, 200)           40200       batch_normalization_3[0][0]      
____________________________________________________________________________________________________
dropout_4 (Dropout)              (None, 200)           0           dense_6[0][0]                    
____________________________________________________________________________________________________
batch_normalization_4 (BatchNorm (None, 200)           800         dropout_4[0][0]                  
____________________________________________________________________________________________________
dense_7 (Dense)                  (None, 1)             201         batch_normalization_4[0][0]      
====================================================================================================
Total params: 57,783,001
Trainable params: 423,201
Non-trainable params: 57,359,800
____________________________________________________________________________________________________
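
The parameter counts follow directly from the layer sizes; a short arithmetic check (nb_words = 95,596 is inferred from the printed embedding count):

embedding = (95596 + 1) * 300  # 28,679,100 frozen weights in each embedding layer
time_dist = 300 * 300 + 300    # 90,300: Dense(300) applied at every timestep
dense_3 = 600 * 200 + 200      # 120,200: the concatenated 2 x 300 vector feeds 200 units
batchnorm = 4 * 200            # 800: gamma, beta, moving mean and variance (half non-trainable)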

Train the model, checkpointing the weights with the best validation accuracy


In [7]:
print("Starting training at", datetime.datetime.now())
t0 = time.time()
callbacks = [ModelCheckpoint(MODEL_WEIGHTS_FILE, monitor='val_acc', save_best_only=True)]
history = model.fit([Q1_train, Q2_train],
                    y_train,
                    epochs=NB_EPOCHS,
                    validation_split=VALIDATION_SPLIT,
                    verbose=2,
                    batch_size=BATCH_SIZE,
                    callbacks=callbacks)
t1 = time.time()
print("Training ended at", datetime.datetime.now())
print("Minutes elapsed: %f" % ((t1 - t0) / 60.))


Starting training at 2017-06-01 19:36:54.250699
Train on 327474 samples, validate on 36387 samples
Epoch 1/25
142s - loss: 0.5421 - acc: 0.7264 - val_loss: 0.4948 - val_acc: 0.7613
Epoch 2/25
142s - loss: 0.4894 - acc: 0.7606 - val_loss: 0.4681 - val_acc: 0.7718
Epoch 3/25
141s - loss: 0.4564 - acc: 0.7797 - val_loss: 0.4600 - val_acc: 0.7798
Epoch 4/25
141s - loss: 0.4345 - acc: 0.7936 - val_loss: 0.4401 - val_acc: 0.7857
Epoch 5/25
141s - loss: 0.4170 - acc: 0.8058 - val_loss: 0.4334 - val_acc: 0.7945
Epoch 6/25
142s - loss: 0.4012 - acc: 0.8154 - val_loss: 0.4218 - val_acc: 0.7974
Epoch 7/25
141s - loss: 0.3862 - acc: 0.8243 - val_loss: 0.4179 - val_acc: 0.8037
Epoch 8/25
143s - loss: 0.3802 - acc: 0.8279 - val_loss: 0.4135 - val_acc: 0.8038
Epoch 9/25
142s - loss: 0.3671 - acc: 0.8357 - val_loss: 0.4099 - val_acc: 0.8046
Epoch 10/25
141s - loss: 0.3528 - acc: 0.8431 - val_loss: 0.4145 - val_acc: 0.8005
Epoch 11/25
140s - loss: 0.3431 - acc: 0.8484 - val_loss: 0.4152 - val_acc: 0.8011
Epoch 12/25
141s - loss: 0.3334 - acc: 0.8539 - val_loss: 0.4197 - val_acc: 0.7997
Epoch 13/25
139s - loss: 0.3267 - acc: 0.8583 - val_loss: 0.4078 - val_acc: 0.8069
Epoch 14/25
137s - loss: 0.3181 - acc: 0.8623 - val_loss: 0.4204 - val_acc: 0.8005
Epoch 15/25
137s - loss: 0.3132 - acc: 0.8646 - val_loss: 0.4248 - val_acc: 0.7983
Epoch 16/25
139s - loss: 0.3062 - acc: 0.8688 - val_loss: 0.4139 - val_acc: 0.8083
Epoch 17/25
138s - loss: 0.3028 - acc: 0.8706 - val_loss: 0.4086 - val_acc: 0.8066
Epoch 18/25
137s - loss: 0.2997 - acc: 0.8714 - val_loss: 0.4211 - val_acc: 0.8080
Epoch 19/25
138s - loss: 0.2948 - acc: 0.8740 - val_loss: 0.4561 - val_acc: 0.7823
Epoch 20/25
138s - loss: 0.2940 - acc: 0.8745 - val_loss: 0.4214 - val_acc: 0.8064
Epoch 21/25
138s - loss: 0.2983 - acc: 0.8727 - val_loss: 0.4150 - val_acc: 0.8078
Epoch 22/25
137s - loss: 0.2963 - acc: 0.8741 - val_loss: 0.4318 - val_acc: 0.7995
Epoch 23/25
138s - loss: 0.2865 - acc: 0.8785 - val_loss: 0.4405 - val_acc: 0.7981
Epoch 24/25
137s - loss: 0.2826 - acc: 0.8810 - val_loss: 0.4314 - val_acc: 0.8067
Epoch 25/25
136s - loss: 0.2755 - acc: 0.8840 - val_loss: 0.4200 - val_acc: 0.8042
Training ended at 2017-06-01 20:35:11.907055
Minutes elapsed: 58.294262
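
Validation accuracy peaks around epoch 16 and drifts afterward while training accuracy keeps climbing, a sign of overfitting. The checkpoint already keeps the best weights, but one optional refinement (a sketch, not part of the run above) is to stop once val_acc stalls:

from keras.callbacks import EarlyStopping

callbacks = [ModelCheckpoint(MODEL_WEIGHTS_FILE, monitor='val_acc', save_best_only=True),
             EarlyStopping(monitor='val_acc', patience=5)]  # halt after 5 epochs without improvement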

Plot training and validation accuracy


In [8]:
acc = pd.DataFrame({'epoch': [i + 1 for i in history.epoch],
                    'training': history.history['acc'],
                    'validation': history.history['val_acc']})
ax = acc.plot(x='epoch', figsize=(8, 5), grid=True)  # figsize must be a tuple; {5,8} is an unordered set
ax.set_ylabel("accuracy")
ax.set_ylim([0.0, 1.0]);



In [9]:
max_val_acc, idx = max((val, idx) for (idx, val) in enumerate(history.history['val_acc']))
print('Maximum validation accuracy at epoch', '{:d}'.format(idx+1), '=', '{:.4f}'.format(max_val_acc))


Maximum validation accuracy at epoch 16 = 0.8083

Evaluate the model with the best validation accuracy on the test partition


In [10]:
model.load_weights(MODEL_WEIGHTS_FILE)
loss, accuracy = model.evaluate([Q1_test, Q2_test], y_test, verbose=0)
print('loss = {0:.4f}, accuracy = {1:.4f}'.format(loss, accuracy))


loss = 0.4102, accuracy = 0.8110
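
To inspect individual predictions with the restored weights (a minimal sketch; the 0.5 threshold matches the sigmoid output):

probs = model.predict([Q1_test, Q2_test], batch_size=BATCH_SIZE, verbose=0)
preds = (probs.ravel() > 0.5).astype(int)  # 1 = duplicate, 0 = not duplicate
print(preds[:10])
print(y_test[:10])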