In addition to the output of the final network layer, the model will also expose the activations of the intermediate feature layer.
To achieve this, we'll create a multi-output network (target output + activations output) and supply dummy ground truth and a zero-weighted dummy loss function for the second output.
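The gist of the trick, as a minimal standalone sketch (the layer sizes and names here are illustrative, not the ones used below): the activations output is listed among the model outputs, its loss function is a dummy that always returns zero, and its loss weight is set to 0.0, so it never influences training.
In [ ]:
# Illustrative sketch only -- not part of the pipeline below.
from keras import backend as K
from keras.layers import Input, Dense
from keras.models import Model

sketch_input = Input(shape=(8,))
sketch_features = Dense(4, activation='relu', name='feature_output')(sketch_input)
sketch_target = Dense(1, activation='sigmoid', name='target_output')(sketch_features)

sketch = Model(inputs=sketch_input, outputs=[sketch_target, sketch_features])
sketch.compile(
    optimizer='adam',
    loss={
        'target_output': 'binary_crossentropy',
        # Dummy loss: always zero, so the activations output is never trained on.
        'feature_output': lambda y_true, y_pred: K.zeros((1,)),
    },
    loss_weights={'target_output': 1.0, 'feature_output': 0.0},
)
# sketch.predict(...) would now return [target_predictions, activations].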
The pygoose utility package imports numpy, pandas, matplotlib, and the kg helper module into the root namespace.
In [ ]:
from pygoose import *
In [ ]:
import gc
In [ ]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import *
In [ ]:
from keras import backend as K
from keras.models import Model, Sequential
from keras.layers import *
from keras.callbacks import EarlyStopping, ModelCheckpoint
Automatically discover the paths to various data folders and compose the project structure.
In [ ]:
project = kg.Project.discover()
Identifier for storing these features on disk and referring to them later.
In [ ]:
feature_list_id = 'oofp_nn_lstm_with_activations'
Make subsequent NN runs reproducible.
In [ ]:
RANDOM_SEED = 42
In [ ]:
np.random.seed(RANDOM_SEED)
Word embedding lookup matrix.
In [ ]:
embedding_matrix = kg.io.load(project.aux_dir + 'fasttext_vocab_embedding_matrix.pickle')
Padded sequences of word indices for every question.
In [ ]:
X_train_q1 = kg.io.load(project.preprocessed_data_dir + 'sequences_q1_fasttext_train.pickle')
X_train_q2 = kg.io.load(project.preprocessed_data_dir + 'sequences_q2_fasttext_train.pickle')
In [ ]:
X_test_q1 = kg.io.load(project.preprocessed_data_dir + 'sequences_q1_fasttext_test.pickle')
X_test_q2 = kg.io.load(project.preprocessed_data_dir + 'sequences_q2_fasttext_test.pickle')
In [ ]:
y_train = kg.io.load(project.features_dir + 'y_train.pickle')
Word embedding properties.
In [ ]:
EMBEDDING_DIM = embedding_matrix.shape[-1]
VOCAB_LENGTH = embedding_matrix.shape[0]
MAX_SEQUENCE_LENGTH = X_train_q1.shape[-1]
In [ ]:
print(EMBEDDING_DIM, VOCAB_LENGTH, MAX_SEQUENCE_LENGTH)
In [ ]:
def zero_loss(y_true, y_pred):
    """Dummy loss for the activations output: always returns zero."""
    return K.zeros((1,))
In [ ]:
def create_model_question_branch():
input_q = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedding_q = Embedding(
VOCAB_LENGTH,
EMBEDDING_DIM,
weights=[embedding_matrix],
input_length=MAX_SEQUENCE_LENGTH,
trainable=False,
)(input_q)
timedist_q = TimeDistributed(Dense(
EMBEDDING_DIM,
activation='relu',
))(embedding_q)
lambda_q = Lambda(
lambda x: K.max(x, axis=1),
output_shape=(EMBEDDING_DIM, )
)(timedist_q)
output_q = lambda_q
return input_q, output_q
In [ ]:
def create_model(params):
embedding_layer = Embedding(
VOCAB_LENGTH,
EMBEDDING_DIM,
weights=[embedding_matrix],
input_length=MAX_SEQUENCE_LENGTH,
trainable=False,
)
lstm_layer = LSTM(
params['num_lstm'],
dropout=params['lstm_dropout_rate'],
recurrent_dropout=params['lstm_dropout_rate'],
)
input_q1 = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(input_q1)
x1 = lstm_layer(embedded_sequences_1)
input_q2 = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(input_q2)
y1 = lstm_layer(embedded_sequences_2)
features = Concatenate(name='feature_output')([x1, y1])
dropout_feat = Dropout(params['dense_dropout_rate'])(features)
bn_feat = BatchNormalization()(dropout_feat)
dense_1 = Dense(params['num_dense'], activation='relu')(bn_feat)
dropout_1 = Dropout(params['dense_dropout_rate'])(dense_1)
bn_1 = BatchNormalization()(dropout_1)
output = Dense(1, activation='sigmoid', name='target_output')(bn_1)
model = Model(
inputs=[input_q1, input_q2],
outputs=[output, features],
)
model.compile(
loss={'target_output': 'binary_crossentropy', 'feature_output': zero_loss},
loss_weights={'target_output': 1.0, 'feature_output': 0.0},
optimizer='nadam',
metrics=None,
)
return model
In [ ]:
def predict(model, X_q1, X_q2):
    """
    Mirror the pairs, compute two separate predictions, and average them.

    Because the model has two outputs, predict() returns a list;
    element 0 is the target (probability) output.
    """
    y1 = model.predict([X_q1, X_q2], batch_size=1024, verbose=1)[0].reshape(-1)
    y2 = model.predict([X_q2, X_q1], batch_size=1024, verbose=1)[0].reshape(-1)
    return (y1 + y2) / 2
In [ ]:
NUM_FOLDS = 5
In [ ]:
kfold = StratifiedKFold(
n_splits=NUM_FOLDS,
shuffle=True,
random_state=RANDOM_SEED
)
In [ ]:
BATCH_SIZE = 2048
In [ ]:
MAX_EPOCHS = 200
Best hyperparameter values found by Bayesian optimization.
In [ ]:
model_params = {
'dense_dropout_rate': 0.075,
'lstm_dropout_rate': 0.332,
'num_dense': 130,
'num_lstm': 300,
}
In [ ]:
feature_output_size = model_params['num_lstm'] * 2
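As an optional sanity check (not part of the original pipeline), we can build the model once and confirm it exposes both the target output and the feature output before starting cross-validation.
In [ ]:
# Optional sanity check: instantiate the model and inspect its two outputs.
_model = create_model(model_params)
_model.summary()
del _model
K.clear_session()
gc.collect()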
Create placeholders for out-of-fold predictions and activations.
In [ ]:
y_train_oofp = np.zeros_like(y_train, dtype='float32')
y_train_oofp_features = np.zeros((len(y_train), feature_output_size), dtype='float32')
In [ ]:
y_test_oofp = np.zeros((len(X_test_q1), NUM_FOLDS), dtype='float32')
y_test_oofp_features = np.zeros((len(X_test_q1), feature_output_size), dtype='float32')
The path where the best weights of the current model will be saved.
In [ ]:
model_checkpoint_path = project.temp_dir + 'fold-checkpoint-' + feature_list_id + '.h5'
In [ ]:
%%time
# Iterate through folds.
for fold_num, (ix_train, ix_val) in enumerate(kfold.split(X_train_q1, y_train)):
# Augment the training set by mirroring the pairs.
X_fold_train_q1 = np.vstack([X_train_q1[ix_train], X_train_q2[ix_train]])
X_fold_train_q2 = np.vstack([X_train_q2[ix_train], X_train_q1[ix_train]])
X_fold_val_q1 = np.vstack([X_train_q1[ix_val], X_train_q2[ix_val]])
X_fold_val_q2 = np.vstack([X_train_q2[ix_val], X_train_q1[ix_val]])
# Ground truth should also be "mirrored".
y_fold_train = np.concatenate([y_train[ix_train], y_train[ix_train]])
y_fold_val = np.concatenate([y_train[ix_val], y_train[ix_val]])
print()
print(f'Fitting fold {fold_num + 1} of {kfold.n_splits}')
print()
# Compile a new model.
model = create_model(model_params)
# Train.
model.fit(
# Create dummy ground truth values for the activation outputs.
[X_fold_train_q1, X_fold_train_q2],
[y_fold_train, np.zeros((len(y_fold_train), feature_output_size))],
validation_data=(
[X_fold_val_q1, X_fold_val_q2],
[y_fold_val, np.zeros((len(y_fold_val), feature_output_size))],
),
batch_size=BATCH_SIZE,
epochs=MAX_EPOCHS,
verbose=1,
callbacks=[
# Stop training when the validation loss stops improving.
EarlyStopping(
monitor='val_loss',
min_delta=0.001,
patience=3,
verbose=1,
mode='auto',
),
# Save the weights of the best epoch.
ModelCheckpoint(
model_checkpoint_path,
monitor='val_loss',
save_best_only=True,
verbose=2,
),
],
)
# Restore the best epoch.
model.load_weights(model_checkpoint_path)
# Compute out-of-fold predictions.
y_train_oofp[ix_val] = predict(model, X_train_q1[ix_val], X_train_q2[ix_val])
y_test_oofp[:, fold_num] = predict(model, X_test_q1, X_test_q2)
# Clear GPU memory.
K.clear_session()
del X_fold_train_q1, X_fold_train_q2
del X_fold_val_q1, X_fold_val_q2
del model
gc.collect()
In [ ]:
cv_score = log_loss(y_train, y_train_oofp)
print('CV score:', cv_score)
In [ ]:
feature_names = [feature_list_id]
In [ ]:
features_train = y_train_oofp.reshape((-1, 1))
In [ ]:
features_test = np.mean(y_test_oofp, axis=1).reshape((-1, 1))
In [ ]:
project.save_features(features_train, features_test, feature_names, feature_list_id)