Baseline
MAX_LEN = 40
EMBEDDING_DIM = 300
BATCH_SIZE = 128
VALID_SPLIT = 0.05
RE_WEIGHT = True
LSTM(256)
parameters of LSTM: 570368
parameters of Dense: 513
weights.001-0.3333.hdf5
475s - loss: 0.3051 - acc: 0.7766 - val_loss: 0.3333 - val_acc: 0.7743
LSTM(128)
parameters of LSTM: 219648
parameters of Dense: 257
weights.001-0.3358.hdf5
389s - loss: 0.3218 - acc: 0.7598 - val_loss: 0.3358 - val_acc: 0.7611
LSTM(64)
parameters of LSTM: 93440
parameters of Dense: 129
weights.002-0.3393.hdf5
356s - loss: 0.3138 - acc: 0.7675 - val_loss: 0.3393 - val_acc: 0.7641
LSTM(64)*2
parameters of LSTM: 126464 = 93440+33024
parameters of Dense: 129
weights.003-0.3385.hdf5
400s - loss: 0.2892 - acc: 0.7953 - val_loss: 0.3385 - val_acc: 0.7695
LSTM(64)*2
BATCH_SIZE: 256 ==> 128
parameters of LSTM: 126464 = 93440+33024
parameters of Dense: 129
weights.002-0.3367.hdf5
767s - loss: 0.2986 - acc: 0.7842 - val_loss: 0.3367 - val_acc: 0.7680
LSTM(64)*3
parameters of LSTM: 159488 = 93440+33024+33024
parameters of Dense: 129
weights.002-0.3395.hdf5
604s - loss: 0.3081 - acc: 0.7750 - val_loss: 0.3395 - val_acc: 0.7612
BIDIRECT(LSTM(64))
parameters of LSTM: 186880
parameters of Dense: 257
weights.006-0.3417.hdf5
193s - loss: 0.3007 - acc: 0.7826 - val_loss: 0.3417 - val_acc: 0.7639
LSTM(64)
BatchNormalization()
parameters of LSTM: 93440
parameters of Dense: 129
weights.003-0.3443.hdf5
205s - loss: 0.2958 - acc: 0.7875 - val_loss: 0.3443 - val_acc: 0.7702
LSTM(64,dropout=0.5)
Dropout(0.5)
parameters of LSTM: 93440
parameters of Dense: 129
weights.004-0.3344.hdf5
198s - loss: 0.3097 - acc: 0.7715 - val_loss: 0.3344 - val_acc: 0.7508
LSTM(64)
DENSE(128)
parameters of LSTM: 93440
parameters of Dense: 8256+129
weights.002-0.2749.hdf5
199s - loss: 0.2354 - acc: 0.8314 - val_loss: 0.2749 - val_acc: 0.8093
LSTM(64)
DENSE(64)
parameters of LSTM: 93440
parameters of Dense: 8256+65
weights.002-0.2769.hdf5
196s - loss: 0.2383 - acc: 0.8285 - val_loss: 0.2769 - val_acc: 0.8106
LSTM(64)
DENSE(64)
Dropout(0.5)
weights.008-0.3096.hdf5
200s - loss: 0.2776 - acc: 0.7945 - val_loss: 0.3096 - val_acc: 0.7730
LSTM(64,dropout=0.5)
DENSE(64)
weights.002-0.2788.hdf5
197s - loss: 0.2391 - acc: 0.8271 - val_loss: 0.2788 - val_acc: 0.8087
LSTM(64)
DENSE(64)*2
parameters of LSTM: 93440
parameters of Dense: 8256+4160+65
weights.002-0.2733.hdf5
198s - loss: 0.2340 - acc: 0.8354 - val_loss: 0.2733 - val_acc: 0.8190
LSTM(64)
DENSE(64)*3
parameters of LSTM: 93440
parameters of Dense: 8256+4160+4160+65
weights.002-0.2723.hdf5
203s - loss: 0.2364 - acc: 0.8329 - val_loss: 0.2723 - val_acc: 0.8171
LSTM(64)
DENSE(64)
lr 1e-3 ==> 1e-2
weights.005-0.2899.hdf5
198s - loss: 0.2458 - acc: 0.8240 - val_loss: 0.2899 - val_acc: 0.7940
LSTM(64)
DENSE(64)
lr 1e-3 ==> 1e-4
weights.021-0.2894.hdf5
194s - loss: 0.2383 - acc: 0.8289 - val_loss: 0.2894 - val_acc: 0.7917
LSTM(128)*3
DENSE(128)*3
weights.001-0.2713.hdf5
709s - loss: 0.2539 - acc: 0.8190 - val_loss: 0.2713 - val_acc: 0.8094
LSTM(64)
DENSE(64)
EMBEDDING_TRAINABLE = True
weights.000-0.2829.hdf5
345s - loss: 0.2980 - acc: 0.7705 - val_loss: 0.2829 - val_acc: 0.8035
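For reference, a minimal sketch of the best configuration from the log above (shared LSTM(64) encoder, Dense(64) head). The layer sizes come from the log; the dropout rate, the ReLU activation, and the Adam optimizer are assumptions, and nb_words / word_embedding_matrix are defined later in this notebook.

from keras.models import Model, Sequential
from keras.layers import Input, Embedding, LSTM, Dense, Dropout, concatenate

# Shared encoder: frozen GloVe embeddings followed by a single LSTM(64).
encoder = Sequential()
encoder.add(Embedding(nb_words + 1, EMBEDDING_DIM, weights=[word_embedding_matrix],
                      input_length=MAX_LEN, trainable=False))
encoder.add(LSTM(64))

# Siamese wiring: the same encoder reads both questions.
q1 = Input(shape=(MAX_LEN,), name='q1')
q2 = Input(shape=(MAX_LEN,), name='q2')
x = concatenate([encoder(q1), encoder(q2)])
x = Dropout(0.5)(x)                      # rate assumed
x = Dense(64, activation='relu')(x)      # activation assumed
x = Dropout(0.5)(x)
out = Dense(1, activation='sigmoid')(x)

model = Model(inputs=[q1, q2], outputs=out)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])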
Using TensorFlow backend.
# Check for any null values
# inds = pd.isnull(trainval_df).any(1).nonzero()[0]
# trainval_df.loc[inds]
# inds = pd.isnull(test_df).any(1).nonzero()[0]
# test_df.loc[inds]
# # Add the string 'empty' to empty strings
# trainval_df = trainval_df.fillna('empty')
# test_df = test_df.fillna('empty')
# Token counts per question; the percentiles below motivate the MAX_LEN cutoff.
trainval_df['len1'] = trainval_df.apply(lambda row: len(row['question1_WL'].split()), axis=1)
trainval_df['len2'] = trainval_df.apply(lambda row: len(row['question2_WL'].split()), axis=1)
test_df['len1'] = test_df.apply(lambda row: len(row['question1_WL'].split()), axis=1)
test_df['len2'] = test_df.apply(lambda row: len(row['question2_WL'].split()), axis=1)
lengths = pd.concat([trainval_df['len1'],trainval_df['len2']], axis=0)
print(lengths.describe())
print(np.percentile(lengths, 99.0))
print(np.percentile(lengths, 99.4))
print(np.percentile(lengths, 99.5))
print(np.percentile(lengths, 99.9))
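These percentiles are what motivate MAX_LEN = 40, which sits beyond the 99.9th percentile of question lengths. A minimal sketch of the padding step, assuming tokenizer is the fitted Keras Tokenizer used below:

from keras.preprocessing.sequence import pad_sequences

# Map the cleaned questions to integer sequences and pad/truncate to MAX_LEN.
train_q1 = pad_sequences(tokenizer.texts_to_sequences(trainval_df['question1_WL']), maxlen=MAX_LEN)
train_q2 = pad_sequences(tokenizer.texts_to_sequences(trainval_df['question2_WL']), maxlen=MAX_LEN)
test_q1 = pad_sequences(tokenizer.texts_to_sequences(test_df['question1_WL']), maxlen=MAX_LEN)
test_q2 = pad_sequences(tokenizer.texts_to_sequences(test_df['question2_WL']), maxlen=MAX_LEN)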
Loading from file trainval_df.pickle
Loading from file test_df.pickle
Loading from file glove.840B.300d.word_embedding_matrix.pickle
word_counts = tokenizer.word_counts
null_embedding_word_counts = { word: word_counts[word] for word in null_embedding_words }
print(sum(null_embedding_word_counts.values())) #454210
word_docs = tokenizer.word_docs
null_embedding_word_docs = { word: word_docs[word] for word in null_embedding_words }
print(sum(null_embedding_word_docs.values())) #446584
# 446584/(404290+2345796)/2 = 0.08119
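null_embedding_words is not defined in this excerpt; one plausible construction, assuming a word counts as "null" when its row of word_embedding_matrix was never filled from GloVe (i.e. is all zeros):

# Words in the tokenizer vocabulary whose embedding row is still all zeros,
# i.e. words missing from the pretrained GloVe vectors.
null_embedding_words = [word for word, idx in tokenizer.word_index.items()
                        if idx <= nb_words and not word_embedding_matrix[idx].any()]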
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
embedding_2 (Embedding)      (None, 40, 300)           37906800
_________________________________________________________________
lstm_4 (LSTM)                (None, 64)                93440
=================================================================
Total params: 38,000,240
Trainable params: 38,000,240
Non-trainable params: 0
_________________________________________________________________
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to
====================================================================================================
q1 (InputLayer)                  (None, 40)            0
____________________________________________________________________________________________________
q2 (InputLayer)                  (None, 40)            0
____________________________________________________________________________________________________
sequential_2 (Sequential)        (None, 64)            38000240
____________________________________________________________________________________________________
concatenate_2 (Concatenate)      (None, 128)           0
____________________________________________________________________________________________________
dropout_5 (Dropout)              (None, 128)           0
____________________________________________________________________________________________________
dense_5 (Dense)                  (None, 64)            8256
____________________________________________________________________________________________________
dropout_6 (Dropout)              (None, 64)            0
____________________________________________________________________________________________________
dense_6 (Dense)                  (None, 1)             65
====================================================================================================
Total params: 38,008,561
Trainable params: 38,008,561
Non-trainable params: 0
____________________________________________________________________________________________________
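The parameter counts in these summaries can be checked by hand: an LSTM has four gates, each with input weights, recurrent weights, and a bias, so its parameter count is 4 * (input_dim * units + units^2 + units).

# LSTM(64) over 300-d embeddings: 4 gates * (input + recurrent weights + bias)
assert 4 * (300 * 64 + 64 * 64 + 64) == 93440
# Dense(64) on the 128-d concatenated pair, and the final Dense(1)
assert 128 * 64 + 64 == 8256
assert 64 * 1 + 1 == 65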
BATCH_SIZE: 256
Train on 770048 samples, validate on 38532 samples
Epoch 1/100
Epoch 00000: val_loss improved from inf to 0.28294, saving model to ./checkpoint/weights.000-0.2829.hdf5
345s - loss: 0.2980 - acc: 0.7705 - val_loss: 0.2829 - val_acc: 0.8035
Epoch 2/100
Epoch 00001: val_loss did not improve
341s - loss: 0.1793 - acc: 0.8807 - val_loss: 0.3162 - val_acc: 0.8340
Epoch 3/100
Epoch 00002: val_loss did not improve
341s - loss: 0.1099 - acc: 0.9337 - val_loss: 0.3861 - val_acc: 0.8396
Epoch 4/100
Epoch 00003: val_loss did not improve
340s - loss: 0.0677 - acc: 0.9616 - val_loss: 0.4722 - val_acc: 0.8408
Epoch 5/100
---------------------------------------------------------------------------
KeyboardInterrupt: training stopped manually during epoch 5/100
(raised inside model.fit, in the TensorFlow session.run call; full stack trace omitted)
---------------------------------------------------------------------------
# Resume training from the best checkpoint so far.
model, model_name = get_best_model()
# model = load_model(CHECKPOINT_DIR + 'weights.025-0.4508.hdf5')
# model_name = 'weights.025-0.4508.hdf5'
# print('model_name', model_name)
# # try increasing the learning rate
# optimizer = Adam(lr=1e-4)
# model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
# callbacks = [ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, verbose=1),
# EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1),
# ModelCheckpoint(filepath=CHECKPOINT_DIR+'weights.{epoch:03d}-{val_loss:.4f}.hdf5', monitor='val_loss', verbose=1, save_best_only=True),
# TensorBoard(log_dir=LOG_DIR, histogram_freq=0, write_graph=False, write_images=True)]
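get_best_model() is defined elsewhere in the notebook. A plausible sketch, assuming it simply picks the checkpoint whose filename encodes the lowest val_loss (matching the weights.{epoch:03d}-{val_loss:.4f}.hdf5 pattern used by the ModelCheckpoint above):

import os
from keras.models import load_model

def get_best_model(checkpoint_dir=CHECKPOINT_DIR):
    # Filenames look like weights.002-0.2769.hdf5; sort on the val_loss part.
    names = [f for f in os.listdir(checkpoint_dir) if f.endswith('.hdf5')]
    best = min(names, key=lambda f: float(f.split('-')[-1][:-len('.hdf5')]))
    return load_model(checkpoint_dir + best), best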
print('BATCH_SIZE:', BATCH_SIZE)
model.fit({'q1': train_q1_Double, 'q2': train_q2_Double}, y_train_Double,
          batch_size=BATCH_SIZE, epochs=100, verbose=2, callbacks=callbacks,
          validation_data=({'q1': valid_q1_Double, 'q2': valid_q2_Double}, y_valid_Double, val_sample_weights),
          shuffle=True, class_weight=class_weight, initial_epoch=4)  # epochs 0-3 finished before the interrupt
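class_weight and val_sample_weights implement the RE_WEIGHT = True setting from the config. A sketch with the reweighting constants commonly used for this dataset (treat the exact values as an assumption, not an excerpt from this notebook); they rescale the loss so that training's ~37% positive rate mimics the test set's ~16.5%:

import numpy as np

if RE_WEIGHT:
    # Down-weight positives / up-weight negatives to match the test distribution.
    class_weight = {0: 1.309028344, 1: 0.472001959}
    val_sample_weights = np.ones(len(y_valid_Double)) * 0.472001959
    val_sample_weights[y_valid_Double == 0] = 1.309028344
else:
    class_weight = None
    val_sample_weights = None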
model_name weights.002-0.2769.hdf5
# evaluate() on the validation set -> [val_loss, val_acc]
[0.27685219645314557, 0.81057303010965698]
Baseline_weights.002-0.2769.hdf5_LSTM64*1_DENSE64*1_valloss0.2769.csv
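The CSV name above records how the submission was produced; a minimal sketch of the export step, assuming test_q1/test_q2 are the padded test sequences, test_df carries a test_id column, and the two question orderings are averaged (the model head is not symmetric in its inputs):

import pandas as pd

preds = model.predict({'q1': test_q1, 'q2': test_q2}, batch_size=BATCH_SIZE, verbose=1)
preds += model.predict({'q1': test_q2, 'q2': test_q1}, batch_size=BATCH_SIZE, verbose=1)
preds /= 2.0

submission = pd.DataFrame({'test_id': test_df['test_id'], 'is_duplicate': preds.ravel()})
submission.to_csv(OUTPUT_DIR + 'submission.csv', index=False)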
# Append the epoch logs to a file instead of the notebook output, then restore stdout.
sys.stdout = open(OUTPUT_DIR+'training_output.txt', 'a')
history = model.fit({'q1': train_q1, 'q2': train_q2}, y_train, batch_size=BATCH_SIZE, epochs=3, verbose=2, callbacks=callbacks,
                    validation_data=({'q1': valid_q1, 'q2': valid_q2}, y_valid), shuffle=True, initial_epoch=0)
sys.stdout = sys.__stdout__
summary_stats = pd.DataFrame({'epoch': [ i + 1 for i in history.epoch ],
'train_acc': history.history['acc'],
'valid_acc': history.history['val_acc'],
'train_loss': history.history['loss'],
'valid_loss': history.history['val_loss']})
summary_stats
plt.plot(summary_stats.train_loss) # blue
plt.plot(summary_stats.valid_loss) # green
plt.show()
units = 128 # Number of nodes in the Dense layers
dropout = 0.25 # Fraction of units to drop
nb_filter = 32 # Number of filters to use in Convolution1D
filter_length = 3 # Length of filter for Convolution1D
# Initialize weights and biases for the Dense layers
weights = initializers.TruncatedNormal(mean=0.0, stddev=0.05, seed=2)
bias = 'zeros' # Initializer name passed to bias_initializer below
model1 = Sequential()
model1.add(Embedding(nb_words + 1, EMBEDDING_DIM, weights=[word_embedding_matrix], input_length = MAX_LEN, trainable = False))
model1.add(Convolution1D(filters=nb_filter, kernel_size=filter_length, padding='same'))
model1.add(BatchNormalization())
model1.add(Activation('relu'))
model1.add(Dropout(dropout))
model1.add(Convolution1D(filters=nb_filter, kernel_size=filter_length, padding='same'))
model1.add(BatchNormalization())
model1.add(Activation('relu'))
model1.add(Dropout(dropout))
model1.add(Flatten())
model2 = Sequential()
model2.add(Embedding(nb_words + 1, EMBEDDING_DIM, weights=[word_embedding_matrix], input_length = MAX_LEN, trainable = False))
model2.add(Convolution1D(filters=nb_filter, kernel_size=filter_length, padding='same'))
model2.add(BatchNormalization())
model2.add(Activation('relu'))
model2.add(Dropout(dropout))
model2.add(Convolution1D(filters=nb_filter, kernel_size=filter_length, padding='same'))
model2.add(BatchNormalization())
model2.add(Activation('relu'))
model2.add(Dropout(dropout))
model2.add(Flatten())
model3 = Sequential()
model3.add(Embedding(nb_words + 1, EMBEDDING_DIM, weights=[word_embedding_matrix], input_length = MAX_LEN, trainable = False))
model3.add(TimeDistributed(Dense(EMBEDDING_DIM)))
model3.add(BatchNormalization())
model3.add(Activation('relu'))
model3.add(Dropout(dropout))
model3.add(Lambda(lambda x: K.max(x, axis=1), output_shape=(EMBEDDING_DIM, )))
model4 = Sequential()
model4.add(Embedding(nb_words + 1, EMBEDDING_DIM, weights=[word_embedding_matrix], input_length = MAX_LEN, trainable = False))
model4.add(TimeDistributed(Dense(EMBEDDING_DIM)))
model4.add(BatchNormalization())
model4.add(Activation('relu'))
model4.add(Dropout(dropout))
model4.add(Lambda(lambda x: K.max(x, axis=1), output_shape=(EMBEDDING_DIM, )))
modela = Sequential()
modela.add(Merge([model1, model2], mode='concat'))
modela.add(Dense(units*2, kernel_initializer=weights, bias_initializer=bias))
modela.add(BatchNormalization())
modela.add(Activation('relu'))
modela.add(Dropout(dropout))
modela.add(Dense(units, kernel_initializer=weights, bias_initializer=bias))
modela.add(BatchNormalization())
modela.add(Activation('relu'))
modela.add(Dropout(dropout))
modelb = Sequential()
modelb.add(Merge([model3, model4], mode='concat'))
modelb.add(Dense(units*2, kernel_initializer=weights, bias_initializer=bias))
modelb.add(BatchNormalization())
modelb.add(Activation('relu'))
modelb.add(Dropout(dropout))
modelb.add(Dense(units, kernel_initializer=weights, bias_initializer=bias))
modelb.add(BatchNormalization())
modelb.add(Activation('relu'))
modelb.add(Dropout(dropout))
model = Sequential()
model.add(Merge([modela, modelb], mode='concat'))
model.add(Dense(units*2, kernel_initializer=weights, bias_initializer=bias))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(dropout))
model.add(Dense(units, kernel_initializer=weights, bias_initializer=bias))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(dropout))
model.add(Dense(units, kernel_initializer=weights, bias_initializer=bias))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(dropout))
model.add(Dense(1, kernel_initializer=weights, bias_initializer=bias))
model.add(BatchNormalization())
model.add(Activation('sigmoid'))
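A sketch of how this merged model would be compiled and trained; since the legacy Merge layers give each of the four branches its own Embedding input, the padded question arrays are passed twice (the optimizer and epoch count are assumptions):

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Branch order is [model1, model2, model3, model4]: the two CNN branches first,
# then the two TimeDistributed-Dense branches, each fed q1 or q2.
history = model.fit([train_q1, train_q2, train_q1, train_q2], y_train,
                    batch_size=BATCH_SIZE, epochs=10, verbose=2,
                    validation_data=([valid_q1, valid_q2, valid_q1, valid_q2], y_valid),
                    shuffle=True)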