Use the data cleaning from script_LSTM.py
LSTM(64)
DENSE(64)
BATCH_SIZE = 256
weights.002-0.2777.hdf5
212s - loss: 0.2390 - acc: 0.8283 - val_loss: 0.2777 - val_acc: 0.8053
LSTM(128,0.5,0.5)
DENSE(128,0.5)
BatchNormalization()
BATCH_SIZE = 2048
weights.022-0.2778.hdf5
111s - loss: 0.2682 - acc: 0.7932 - val_loss: 0.2778 - val_acc: 0.7855
LSTM(128,0.5,0.5)
DENSE(128,0.5)
BATCH_SIZE = 2048
weights.025-0.2798.hdf5
110s - loss: 0.2660 - acc: 0.7969 - val_loss: 0.2798 - val_acc: 0.7826
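In the notes above, LSTM(128,0.5,0.5) is presumably shorthand for units / dropout / recurrent_dropout, and DENSE(128,0.5) for a Dense layer followed by Dropout. A minimal sketch of that reading (the relu activation is an assumption, not recorded in the notes):

from keras.layers import LSTM, Dense, Dropout, BatchNormalization

lstm_layer = LSTM(128, dropout=0.5, recurrent_dropout=0.5)    # "LSTM(128,0.5,0.5)"
dense_block = [Dense(128, activation='relu'),                 # "DENSE(128,0.5)" -- activation assumed
               Dropout(0.5)]
# The second configuration also inserts BatchNormalization() after the Dense layer.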
Using TensorFlow backend.
# Check for any null values
# inds = pd.isnull(trainval_df).any(axis=1).nonzero()[0]
# trainval_df.loc[inds]
# inds = pd.isnull(test_df).any(axis=1).nonzero()[0]
# test_df.loc[inds]
# # Replace the null (missing) questions with the string 'empty'
# trainval_df = trainval_df.fillna('empty')
# test_df = test_df.fillna('empty')
Generating file trainval_df.pickle
Generating file test_df.pickle
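The two messages above presumably come from caching the cleaned dataframes to disk; a hypothetical reconstruction of that step:

print('Generating file trainval_df.pickle')
trainval_df.to_pickle('trainval_df.pickle')
print('Generating file test_df.pickle')
test_df.to_pickle('test_df.pickle')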
# trainval_df['len1'] = trainval_df.apply(lambda row: len(row['question1_WL'].split()), axis=1)
# trainval_df['len2'] = trainval_df.apply(lambda row: len(row['question2_WL'].split()), axis=1)
test_df['len1'] = test_df.apply(lambda row: len(row['question1_WL'].split()), axis=1)
test_df['len2'] = test_df.apply(lambda row: len(row['question2_WL'].split()), axis=1)
lengths = pd.concat([test_df['len1'],test_df['len2']], axis=0)
print(lengths.describe())
print(np.percentile(lengths, 99.0))
print(np.percentile(lengths, 99.4))
print(np.percentile(lengths, 99.5))
print(np.percentile(lengths, 99.9))
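These percentiles presumably inform the padded sequence length; the model summaries further down use MAX_LEN = 40. A minimal padding sketch, where q1_sequences / q2_sequences are hypothetical names for the tokenizer output:

from keras.preprocessing.sequence import pad_sequences

MAX_LEN = 40  # assumption: chosen from the length percentiles printed above

train_q1 = pad_sequences(q1_sequences, maxlen=MAX_LEN)  # q1_sequences: tokenizer.texts_to_sequences(...) output
train_q2 = pad_sequences(q2_sequences, maxlen=MAX_LEN)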
Generating file glove.840B.300d.txt.word_embedding_matrix.pickle
Word embeddings: 1505774
Null word embeddings: 37412
word_counts = tokenizer.word_counts
null_embedding_word_counts = { word: word_counts[word] for word in null_embedding_words }
print(sum(null_embedding_word_counts.values())) #454210
word_docs = tokenizer.word_docs
null_embedding_word_docs = { word: word_docs[word] for word in null_embedding_words }
print(sum(null_embedding_word_docs.values())) #446584
# 446584 / (404290 + 2345796) / 2 = 0.08119 -> roughly 8% of all train+test questions contain a word with no embedding
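The null-embedding bookkeeping above suggests the embedding matrix is built roughly as follows (a sketch; embeddings_index is a hypothetical dict of GloVe vectors loaded from glove.840B.300d.txt):

import numpy as np

EMBEDDING_DIM = 300
nb_words = len(tokenizer.word_index)

# Words without a GloVe vector keep a zero row ("null word embeddings").
word_embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
null_embedding_words = []
for word, i in tokenizer.word_index.items():
    vector = embeddings_index.get(word)
    if vector is not None:
        word_embedding_matrix[i] = vector
    else:
        null_embedding_words.append(word)  # 37412 such words, per the log above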
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_6 (Embedding) (None, 40, 300) 36178500
_________________________________________________________________
lstm_6 (LSTM) (None, 128) 219648
=================================================================
Total params: 36,398,148.0
Trainable params: 219,648.0
Non-trainable params: 36,178,500.0
_________________________________________________________________
____________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
====================================================================================================
q1 (InputLayer) (None, 40) 0
____________________________________________________________________________________________________
q2 (InputLayer) (None, 40) 0
____________________________________________________________________________________________________
sequential_9 (Sequential) (None, 128) 36398148
____________________________________________________________________________________________________
concatenate_6 (Concatenate) (None, 256) 0
____________________________________________________________________________________________________
dropout_12 (Dropout) (None, 256) 0
____________________________________________________________________________________________________
dense_12 (Dense) (None, 128) 32896
____________________________________________________________________________________________________
dropout_13 (Dropout) (None, 128) 0
____________________________________________________________________________________________________
dense_13 (Dense) (None, 1) 129
====================================================================================================
Total params: 36,431,173.0
Trainable params: 252,673.0
Non-trainable params: 36,178,500.0
____________________________________________________________________________________________________
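The summary above corresponds to a shared ("Siamese") encoder applied to both questions. A sketch that reproduces the layer and parameter counts, assuming nb_words and word_embedding_matrix from the preprocessing step, the dropout rates from the LSTM(128,0.5,0.5)/DENSE(128,0.5) notes, and Adam as in the resume-training snippet further down (the relu activation is an assumption):

from keras.models import Sequential, Model
from keras.layers import Input, Embedding, LSTM, Dense, Dropout, concatenate

MAX_LEN = 40
EMBEDDING_DIM = 300

# Shared encoder (sequential_9): frozen Embedding (36,178,500 params) + LSTM (219,648 params)
encoder = Sequential()
encoder.add(Embedding(nb_words + 1, EMBEDDING_DIM, weights=[word_embedding_matrix],
                      input_length=MAX_LEN, trainable=False))
encoder.add(LSTM(128, dropout=0.5, recurrent_dropout=0.5))

q1 = Input(shape=(MAX_LEN,), name='q1')
q2 = Input(shape=(MAX_LEN,), name='q2')
merged = concatenate([encoder(q1), encoder(q2)])    # (None, 256)
merged = Dropout(0.5)(merged)
merged = Dense(128, activation='relu')(merged)      # 256*128 + 128 = 32,896 params
merged = Dropout(0.5)(merged)
out = Dense(1, activation='sigmoid')(merged)        # 128 + 1 = 129 params

model = Model(inputs=[q1, q2], outputs=out)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])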
BATCH_SIZE: 2048
Train on 770048 samples, validate on 38532 samples
Epoch 1/100
Epoch 00000: val_loss improved from inf to 0.36693, saving model to ./checkpoint/weights.000-0.3669.hdf5
120s - loss: 0.4086 - acc: 0.6682 - val_loss: 0.3669 - val_acc: 0.6945
Epoch 2/100
Epoch 00001: val_loss improved from 0.36693 to 0.34885, saving model to ./checkpoint/weights.001-0.3489.hdf5
110s - loss: 0.3662 - acc: 0.7073 - val_loss: 0.3489 - val_acc: 0.7372
Epoch 3/100
Epoch 00002: val_loss improved from 0.34885 to 0.33290, saving model to ./checkpoint/weights.002-0.3329.hdf5
110s - loss: 0.3488 - acc: 0.7225 - val_loss: 0.3329 - val_acc: 0.7380
Epoch 4/100
Epoch 00003: val_loss improved from 0.33290 to 0.32665, saving model to ./checkpoint/weights.003-0.3267.hdf5
110s - loss: 0.3375 - acc: 0.7317 - val_loss: 0.3267 - val_acc: 0.7467
Epoch 5/100
Epoch 00004: val_loss improved from 0.32665 to 0.31697, saving model to ./checkpoint/weights.004-0.3170.hdf5
110s - loss: 0.3288 - acc: 0.7386 - val_loss: 0.3170 - val_acc: 0.7451
Epoch 6/100
Epoch 00005: val_loss improved from 0.31697 to 0.31174, saving model to ./checkpoint/weights.005-0.3117.hdf5
110s - loss: 0.3216 - acc: 0.7454 - val_loss: 0.3117 - val_acc: 0.7542
Epoch 7/100
Epoch 00006: val_loss improved from 0.31174 to 0.30906, saving model to ./checkpoint/weights.006-0.3091.hdf5
110s - loss: 0.3156 - acc: 0.7510 - val_loss: 0.3091 - val_acc: 0.7563
Epoch 8/100
Epoch 00007: val_loss improved from 0.30906 to 0.30893, saving model to ./checkpoint/weights.007-0.3089.hdf5
110s - loss: 0.3098 - acc: 0.7560 - val_loss: 0.3089 - val_acc: 0.7658
Epoch 9/100
Epoch 00008: val_loss improved from 0.30893 to 0.30424, saving model to ./checkpoint/weights.008-0.3042.hdf5
110s - loss: 0.3056 - acc: 0.7597 - val_loss: 0.3042 - val_acc: 0.7729
Epoch 10/100
Epoch 00009: val_loss improved from 0.30424 to 0.29670, saving model to ./checkpoint/weights.009-0.2967.hdf5
110s - loss: 0.3015 - acc: 0.7636 - val_loss: 0.2967 - val_acc: 0.7691
Epoch 11/100
Epoch 00010: val_loss improved from 0.29670 to 0.29603, saving model to ./checkpoint/weights.010-0.2960.hdf5
110s - loss: 0.2974 - acc: 0.7669 - val_loss: 0.2960 - val_acc: 0.7715
Epoch 12/100
Epoch 00011: val_loss improved from 0.29603 to 0.29311, saving model to ./checkpoint/weights.011-0.2931.hdf5
110s - loss: 0.2945 - acc: 0.7699 - val_loss: 0.2931 - val_acc: 0.7709
Epoch 13/100
Epoch 00012: val_loss did not improve
110s - loss: 0.2912 - acc: 0.7730 - val_loss: 0.2949 - val_acc: 0.7748
Epoch 14/100
Epoch 00013: val_loss did not improve
110s - loss: 0.2884 - acc: 0.7754 - val_loss: 0.2975 - val_acc: 0.7789
Epoch 15/100
Epoch 00014: val_loss improved from 0.29311 to 0.29163, saving model to ./checkpoint/weights.014-0.2916.hdf5
110s - loss: 0.2855 - acc: 0.7782 - val_loss: 0.2916 - val_acc: 0.7777
Epoch 16/100
Epoch 00015: val_loss improved from 0.29163 to 0.28918, saving model to ./checkpoint/weights.015-0.2892.hdf5
110s - loss: 0.2835 - acc: 0.7803 - val_loss: 0.2892 - val_acc: 0.7813
Epoch 17/100
Epoch 00016: val_loss improved from 0.28918 to 0.28915, saving model to ./checkpoint/weights.016-0.2892.hdf5
110s - loss: 0.2815 - acc: 0.7816 - val_loss: 0.2892 - val_acc: 0.7837
Epoch 18/100
Epoch 00017: val_loss did not improve
110s - loss: 0.2791 - acc: 0.7835 - val_loss: 0.2913 - val_acc: 0.7830
Epoch 19/100
Epoch 00018: val_loss did not improve
110s - loss: 0.2769 - acc: 0.7864 - val_loss: 0.2894 - val_acc: 0.7864
Epoch 20/100
Epoch 00019: val_loss improved from 0.28915 to 0.28266, saving model to ./checkpoint/weights.019-0.2827.hdf5
110s - loss: 0.2753 - acc: 0.7877 - val_loss: 0.2827 - val_acc: 0.7795
Epoch 21/100
Epoch 00020: val_loss did not improve
110s - loss: 0.2735 - acc: 0.7897 - val_loss: 0.2848 - val_acc: 0.7843
Epoch 22/100
Epoch 00021: val_loss did not improve
110s - loss: 0.2719 - acc: 0.7912 - val_loss: 0.2865 - val_acc: 0.7894
Epoch 23/100
Epoch 00022: val_loss improved from 0.28266 to 0.28210, saving model to ./checkpoint/weights.022-0.2821.hdf5
110s - loss: 0.2706 - acc: 0.7930 - val_loss: 0.2821 - val_acc: 0.7872
Epoch 24/100
Epoch 00023: val_loss did not improve
110s - loss: 0.2688 - acc: 0.7943 - val_loss: 0.2877 - val_acc: 0.7927
Epoch 25/100
Epoch 00024: val_loss improved from 0.28210 to 0.28169, saving model to ./checkpoint/weights.024-0.2817.hdf5
110s - loss: 0.2674 - acc: 0.7953 - val_loss: 0.2817 - val_acc: 0.7895
Epoch 26/100
Epoch 00025: val_loss improved from 0.28169 to 0.27978, saving model to ./checkpoint/weights.025-0.2798.hdf5
110s - loss: 0.2660 - acc: 0.7969 - val_loss: 0.2798 - val_acc: 0.7826
Epoch 27/100
Epoch 00026: val_loss did not improve
110s - loss: 0.2647 - acc: 0.7979 - val_loss: 0.2865 - val_acc: 0.7964
Epoch 28/100
Epoch 00027: val_loss did not improve
110s - loss: 0.2632 - acc: 0.7987 - val_loss: 0.2822 - val_acc: 0.7901
Epoch 29/100
Epoch 00028: val_loss did not improve
110s - loss: 0.2626 - acc: 0.7999 - val_loss: 0.2842 - val_acc: 0.7968
Epoch 30/100
Epoch 00029: val_loss did not improve
110s - loss: 0.2610 - acc: 0.8016 - val_loss: 0.2873 - val_acc: 0.7985
Epoch 31/100
Epoch 00030: val_loss did not improve
110s - loss: 0.2603 - acc: 0.8019 - val_loss: 0.2828 - val_acc: 0.7964
Epoch 32/100
Epoch 00031: val_loss did not improve
110s - loss: 0.2583 - acc: 0.8037 - val_loss: 0.2824 - val_acc: 0.7944
Epoch 00031: early stopping
Out[44]:
<keras.callbacks.History at 0x7fe77d401cd0>
# Resume training from the best saved checkpoint
model, model_name = get_best_model()
# model = load_model(CHECKPOINT_DIR + 'weights.025-0.4508.hdf5')
# model_name = 'weights.025-0.4508.hdf5'
# print('model_name', model_name)
# # Try adjusting the learning rate
# optimizer = Adam(lr=1e-4)
# model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
# callbacks = [ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, verbose=1),
# EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1),
# ModelCheckpoint(filepath=CHECKPOINT_DIR+'weights.{epoch:03d}-{val_loss:.4f}.hdf5', monitor='val_loss', verbose=1, save_best_only=True),
# TensorBoard(log_dir=LOG_DIR, histogram_freq=0, write_graph=False, write_images=True)]
print('BATCH_SIZE:', BATCH_SIZE)
# Parse the epoch index out of the checkpoint filename (e.g. 'weights.022-0.2778.hdf5' -> 22)
# so training resumes from the epoch after the best saved checkpoint (assumption: this is how
# the blank initial_epoch argument was meant to be filled in).
initial_epoch = int(model_name.split('.')[1].split('-')[0]) + 1
model.fit({'q1': train_q1_Double, 'q2': train_q2_Double}, y_train_Double,
          batch_size=BATCH_SIZE, epochs=100, verbose=2, callbacks=callbacks,
          validation_data=({'q1': valid_q1_Double, 'q2': valid_q2_Double}, y_valid_Double, val_sample_weights),
          shuffle=True, class_weight=class_weight, initial_epoch=initial_epoch)
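The list printed below matches the validation loss/accuracy of weights.022-0.2778.hdf5, so it is presumably the reloaded best checkpoint evaluated on the validation set; a hypothetical reconstruction of that cell:

model, model_name = get_best_model()
print('model_name', model_name)
model.evaluate({'q1': valid_q1, 'q2': valid_q2}, y_valid, batch_size=BATCH_SIZE, verbose=0)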
model_name weights.022-0.2778.hdf5
Out[38]:
[0.27775588646789434, 0.78547702802869745]
DataClean_weights.022-0.2778.hdf5_LSTM128*1_DENSE128*1_valloss0.2778.csv
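The submission file named above is presumably produced by predicting on the padded test questions; a sketch, assuming test_q1 / test_q2 are the padded test sequences and the standard Quora test_id / is_duplicate columns:

predictions = model.predict({'q1': test_q1, 'q2': test_q2}, batch_size=BATCH_SIZE, verbose=1)

submission = pd.DataFrame({'test_id': test_df['test_id'],
                           'is_duplicate': predictions.ravel()})
submission.to_csv(OUTPUT_DIR + 'DataClean_' + model_name + '_LSTM128*1_DENSE128*1_valloss0.2778.csv',
                  index=False, columns=['test_id', 'is_duplicate'])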
# Append the per-epoch training output to a log file instead of the console
sys.stdout = open(OUTPUT_DIR + 'training_output.txt', 'a')
history = model.fit({'q1': train_q1, 'q2': train_q2}, y_train, batch_size=BATCH_SIZE, epochs=3, verbose=2, callbacks=callbacks,
                    validation_data=({'q1': valid_q1, 'q2': valid_q2}, y_valid), shuffle=True, initial_epoch=0)
sys.stdout = sys.__stdout__  # restore console output
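A slightly more defensive variant (not in the original notebook) restores stdout even if fit() raises, using contextlib.redirect_stdout:

from contextlib import redirect_stdout

with open(OUTPUT_DIR + 'training_output.txt', 'a') as log_file, redirect_stdout(log_file):
    history = model.fit({'q1': train_q1, 'q2': train_q2}, y_train,
                        batch_size=BATCH_SIZE, epochs=3, verbose=2, callbacks=callbacks,
                        validation_data=({'q1': valid_q1, 'q2': valid_q2}, y_valid),
                        shuffle=True, initial_epoch=0)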
summary_stats = pd.DataFrame({'epoch': [i + 1 for i in history.epoch],
                              'train_acc': history.history['acc'],
                              'valid_acc': history.history['val_acc'],
                              'train_loss': history.history['loss'],
                              'valid_loss': history.history['val_loss']})
summary_stats
plt.plot(summary_stats.train_loss, label='train_loss')   # blue
plt.plot(summary_stats.valid_loss, label='valid_loss')   # green
plt.legend()
plt.show()
units = 128 # Number of nodes in the Dense layers
dropout = 0.25 # Fraction of nodes to drop
nb_filter = 32 # Number of filters to use in Convolution1D
filter_length = 3 # Length of filter for Convolution1D
# Initialize weights and biases for the Dense layers
weights = initializers.TruncatedNormal(mean=0.0, stddev=0.05, seed=2)
bias = 'zeros' # passed as bias_initializer to the Dense layers below
model1 = Sequential()
model1.add(Embedding(nb_words + 1, EMBEDDING_DIM, weights=[word_embedding_matrix], input_length = MAX_LEN, trainable = False))
model1.add(Convolution1D(filters=nb_filter, kernel_size=filter_length, padding='same'))
model1.add(BatchNormalization())
model1.add(Activation('relu'))
model1.add(Dropout(dropout))
model1.add(Convolution1D(filters=nb_filter, kernel_size=filter_length, padding='same'))
model1.add(BatchNormalization())
model1.add(Activation('relu'))
model1.add(Dropout(dropout))
model1.add(Flatten())
model2 = Sequential()
model2.add(Embedding(nb_words + 1, EMBEDDING_DIM, weights=[word_embedding_matrix], input_length = MAX_LEN, trainable = False))
model2.add(Convolution1D(filters=nb_filter, kernel_size=filter_length, padding='same'))
model2.add(BatchNormalization())
model2.add(Activation('relu'))
model2.add(Dropout(dropout))
model2.add(Convolution1D(filters=nb_filter, kernel_size=filter_length, padding='same'))
model2.add(BatchNormalization())
model2.add(Activation('relu'))
model2.add(Dropout(dropout))
model2.add(Flatten())
model3 = Sequential()
model3.add(Embedding(nb_words + 1, EMBEDDING_DIM, weights=[word_embedding_matrix], input_length = MAX_LEN, trainable = False))
model3.add(TimeDistributed(Dense(EMBEDDING_DIM)))
model3.add(BatchNormalization())
model3.add(Activation('relu'))
model3.add(Dropout(dropout))
model3.add(Lambda(lambda x: K.max(x, axis=1), output_shape=(EMBEDDING_DIM, )))
model4 = Sequential()
model4.add(Embedding(nb_words + 1, EMBEDDING_DIM, weights=[word_embedding_matrix], input_length = MAX_LEN, trainable = False))
model4.add(TimeDistributed(Dense(EMBEDDING_DIM)))
model4.add(BatchNormalization())
model4.add(Activation('relu'))
model4.add(Dropout(dropout))
model4.add(Lambda(lambda x: K.max(x, axis=1), output_shape=(EMBEDDING_DIM, )))
modela = Sequential()
modela.add(Merge([model1, model2], mode='concat'))  # legacy Keras Merge layer: concatenate the two CNN branches
modela.add(Dense(units*2, kernel_initializer=weights, bias_initializer=bias))
modela.add(BatchNormalization())
modela.add(Activation('relu'))
modela.add(Dropout(dropout))
modela.add(Dense(units, kernel_initializer=weights, bias_initializer=bias))
modela.add(BatchNormalization())
modela.add(Activation('relu'))
modela.add(Dropout(dropout))
modelb = Sequential()
modelb.add(Merge([model3, model4], mode='concat'))
modelb.add(Dense(units*2, kernel_initializer=weights, bias_initializer=bias))
modelb.add(BatchNormalization())
modelb.add(Activation('relu'))
modelb.add(Dropout(dropout))
modelb.add(Dense(units, kernel_initializer=weights, bias_initializer=bias))
modelb.add(BatchNormalization())
modelb.add(Activation('relu'))
modelb.add(Dropout(dropout))
model = Sequential()
model.add(Merge([modela, modelb], mode='concat'))  # concatenate the CNN-based and TimeDistributed-based sub-models
model.add(Dense(units*2, kernel_initializer=weights, bias_initializer=bias))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(dropout))
model.add(Dense(units, kernel_initializer=weights, bias_initializer=bias))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(dropout))
model.add(Dense(units, kernel_initializer=weights, bias_initializer=bias))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(dropout))
model.add(Dense(1, kernel_initializer=weights, bias_initializer=bias))
model.add(BatchNormalization())
model.add(Activation('sigmoid'))
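A sketch of how this merged model is presumably compiled and trained; with the four separate embedding branches, each question array is passed twice, matching the branch order model1/model2/model3/model4 (an assumption, as the original training cell is not shown):

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

history = model.fit([train_q1, train_q2, train_q1, train_q2], y_train,
                    batch_size=BATCH_SIZE, epochs=100, verbose=2, callbacks=callbacks,
                    validation_data=([valid_q1, valid_q2, valid_q1, valid_q2], y_valid),
                    shuffle=True)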