Feature: Out-Of-Fold Predictions and Feature Layer Activations from an LSTM

In addition to the output of the final network layer, the model will also expose the activations of the intermediate feature layer, so they can be reused as features downstream.

To achieve this, we'll create a multi-output network (target output + activations output) and supply dummy ground truth together with a zero-valued loss function for the second output.
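
In miniature, the trick looks like this (toy shapes and layer sizes, purely illustrative; the real model is built in `create_model` below):


In [ ]:
from keras import backend as K
from keras.models import Model
from keras.layers import Input, Dense

def dummy_loss(y_true, y_pred):
    # Contributes nothing to training; exists only so that Keras
    # can compile the second output.
    return K.zeros((1,))

toy_input = Input(shape=(8,))
toy_features = Dense(4, activation='relu', name='feature_output')(toy_input)
toy_target = Dense(1, activation='sigmoid', name='target_output')(toy_features)

toy_model = Model(inputs=toy_input, outputs=[toy_target, toy_features])
toy_model.compile(
    loss={'target_output': 'binary_crossentropy', 'feature_output': dummy_loss},
    loss_weights={'target_output': 1.0, 'feature_output': 0.0},
    optimizer='adam',
)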

Imports

This utility package imports numpy, pandas, and matplotlib, as well as a helper kg module, into the root namespace.


In [ ]:
from pygoose import *

In [ ]:
import gc

In [ ]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import *

In [ ]:
from keras import backend as K
from keras.models import Model, Sequential
from keras.layers import *
from keras.callbacks import EarlyStopping, ModelCheckpoint

Config

Automatically discover the paths to various data folders and compose the project structure.


In [ ]:
project = kg.Project.discover()

Identifier for storing these features on disk and referring to them later.


In [ ]:
feature_list_id = 'oofp_nn_lstm_with_activations'

Make subsequent NN runs reproducible.


In [ ]:
RANDOM_SEED = 42

In [ ]:
np.random.seed(RANDOM_SEED)
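
Seeding NumPy alone does not make Keras fully deterministic. Depending on the backend, Python's and TensorFlow's generators may need seeding too (a sketch, assuming a TensorFlow 1.x backend; GPU kernels such as cuDNN may still be nondeterministic):


In [ ]:
import random
import tensorflow as tf

# Seed the remaining RNGs used during training.
random.seed(RANDOM_SEED)
tf.set_random_seed(RANDOM_SEED)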

Read data

Word embedding lookup matrix.


In [ ]:
embedding_matrix = kg.io.load(project.aux_dir + 'fasttext_vocab_embedding_matrix.pickle')

Padded sequences of word indices for every question.


In [ ]:
X_train_q1 = kg.io.load(project.preprocessed_data_dir + 'sequences_q1_fasttext_train.pickle')
X_train_q2 = kg.io.load(project.preprocessed_data_dir + 'sequences_q2_fasttext_train.pickle')

In [ ]:
X_test_q1 = kg.io.load(project.preprocessed_data_dir + 'sequences_q1_fasttext_test.pickle')
X_test_q2 = kg.io.load(project.preprocessed_data_dir + 'sequences_q2_fasttext_test.pickle')

In [ ]:
y_train = kg.io.load(project.features_dir + 'y_train.pickle')

Word embedding properties.


In [ ]:
EMBEDDING_DIM = embedding_matrix.shape[-1]
VOCAB_LENGTH = embedding_matrix.shape[0]
MAX_SEQUENCE_LENGTH = X_train_q1.shape[-1]

In [ ]:
print(EMBEDDING_DIM, VOCAB_LENGTH, MAX_SEQUENCE_LENGTH)

Define models


In [ ]:
def zero_loss(y_true, y_pred):
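    # A constant zero loss: the activation output receives dummy ground
    # truth, so it must not contribute to the training gradient.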
    return K.zeros((1,))

In [ ]:
def create_model_question_branch():
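    """
    Encode a single question: frozen embeddings, a time-distributed dense
    layer, and max-pooling over time. (Not used by `create_model` below.)
    """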
    input_q = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    
    embedding_q = Embedding(
        VOCAB_LENGTH,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False,
    )(input_q)

    timedist_q = TimeDistributed(Dense(
        EMBEDDING_DIM,
        activation='relu',
    ))(embedding_q)

    lambda_q = Lambda(
        lambda x: K.max(x, axis=1),
        output_shape=(EMBEDDING_DIM, )
    )(timedist_q)
    
    output_q = lambda_q
    return input_q, output_q

In [ ]:
def create_model(params):    
    embedding_layer = Embedding(
        VOCAB_LENGTH,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False,
    )
    lstm_layer = LSTM(
        params['num_lstm'],
        dropout=params['lstm_dropout_rate'],
        recurrent_dropout=params['lstm_dropout_rate'],
    )

    input_q1 = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences_1 = embedding_layer(input_q1)
    x1 = lstm_layer(embedded_sequences_1)

    input_q2 = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences_2 = embedding_layer(input_q2)
    y1 = lstm_layer(embedded_sequences_2)

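    # Expose the concatenated LSTM states as a named layer so they can
    # serve as the second model output.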
    features = Concatenate(name='feature_output')([x1, y1])
    dropout_feat = Dropout(params['dense_dropout_rate'])(features)
    bn_feat = BatchNormalization()(dropout_feat)

    dense_1 = Dense(params['num_dense'], activation='relu')(bn_feat)
    dropout_1 = Dropout(params['dense_dropout_rate'])(dense_1)
    bn_1 = BatchNormalization()(dropout_1)

    output = Dense(1, activation='sigmoid', name='target_output')(bn_1)

    model = Model(
        inputs=[input_q1, input_q2],
        outputs=[output, features],
    )
    
    model.compile(
        loss={'target_output': 'binary_crossentropy', 'feature_output': zero_loss},
        loss_weights={'target_output': 1.0, 'feature_output': 0.0},
        optimizer='nadam',
        metrics=None,
    )

    return model

In [ ]:
def predict(model, X_q1, X_q2):
    """
    Mirror the pairs, compute two separate predictions, and average them.
    """

    # The model has two outputs; keep only the first (target) output.
    y1 = model.predict([X_q1, X_q2], batch_size=1024, verbose=1)[0].reshape(-1)
    y2 = model.predict([X_q2, X_q1], batch_size=1024, verbose=1)[0].reshape(-1)
    return (y1 + y2) / 2

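Note that `predict` keeps only the target output. A companion helper (a sketch, not part of the original pipeline) could extract the feature layer activations; averaging the mirrored activations would collapse the two concatenated halves into identical copies, so this sketch uses a single pass. Called inside the fold loop before `K.clear_session()`, its results could fill `y_train_oofp_features` and `y_test_oofp_features`.


In [ ]:
def predict_features(model, X_q1, X_q2):
    """
    Hypothetical: return the feature layer activations
    (the second model output) for the given question order.
    """
    return model.predict([X_q1, X_q2], batch_size=1024, verbose=1)[1]
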
Partition the data


In [ ]:
NUM_FOLDS = 5

In [ ]:
kfold = StratifiedKFold(
    n_splits=NUM_FOLDS,
    shuffle=True,
    random_state=RANDOM_SEED
)

Define hyperparameters


In [ ]:
BATCH_SIZE = 2048

In [ ]:
MAX_EPOCHS = 200

Best values picked by Bayesian optimization.


In [ ]:
model_params = {
    'dense_dropout_rate': 0.075,
    'lstm_dropout_rate': 0.332,
    'num_dense': 130,
    'num_lstm': 300,
}

In [ ]:
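# The feature layer concatenates the two LSTM states, one per question.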
feature_output_size = model_params['num_lstm'] * 2

Create placeholders for the out-of-fold predictions and for the feature layer activations.


In [ ]:
y_train_oofp = np.zeros_like(y_train, dtype='float32')
y_train_oofp_features = np.zeros((len(y_train), feature_output_size), dtype='float32')

In [ ]:
y_test_oofp = np.zeros((len(X_test_q1), NUM_FOLDS), dtype='float32')
y_test_oofp_features = np.zeros((len(X_test_q1), feature_output_size), dtype='float32')

The path where the best weights of the current model will be saved.


In [ ]:
model_checkpoint_path = project.temp_dir + 'fold-checkpoint-' + feature_list_id + '.h5'

Fit the folds and compute out-of-fold predictions


In [ ]:
%%time

# Iterate through folds.
for fold_num, (ix_train, ix_val) in enumerate(kfold.split(X_train_q1, y_train)):
    
    # Augment the training set by mirroring the pairs.
    X_fold_train_q1 = np.vstack([X_train_q1[ix_train], X_train_q2[ix_train]])
    X_fold_train_q2 = np.vstack([X_train_q2[ix_train], X_train_q1[ix_train]])

    X_fold_val_q1 = np.vstack([X_train_q1[ix_val], X_train_q2[ix_val]])
    X_fold_val_q2 = np.vstack([X_train_q2[ix_val], X_train_q1[ix_val]])

    # Ground truth should also be "mirrored".
    y_fold_train = np.concatenate([y_train[ix_train], y_train[ix_train]])
    y_fold_val = np.concatenate([y_train[ix_val], y_train[ix_val]])
    
    print()
    print(f'Fitting fold {fold_num + 1} of {kfold.n_splits}')
    print()
    
    # Compile a new model.
    model = create_model(model_params)

    # Train.
    model.fit(
        [X_fold_train_q1, X_fold_train_q2],
        # Create dummy ground truth values for the activation output.
        [y_fold_train, np.zeros((len(y_fold_train), feature_output_size))],
        
        validation_data=(
            [X_fold_val_q1, X_fold_val_q2],
            [y_fold_val, np.zeros((len(y_fold_val), feature_output_size))],
        ),

        batch_size=BATCH_SIZE,
        epochs=MAX_EPOCHS,
        verbose=1,
        
        callbacks=[
            # Stop training when the validation loss stops improving.
            EarlyStopping(
                monitor='val_loss',
                min_delta=0.001,
                patience=3,
                verbose=1,
                mode='auto',
            ),
            # Save the weights of the best epoch.
            ModelCheckpoint(
                model_checkpoint_path,
                monitor='val_loss',
                save_best_only=True,
                verbose=1,
            ),
        ],
    )
        
    # Restore the best epoch.
    model.load_weights(model_checkpoint_path)
    
    # Compute out-of-fold predictions.
    y_train_oofp[ix_val] = predict(model, X_train_q1[ix_val], X_train_q2[ix_val])
    y_test_oofp[:, fold_num] = predict(model, X_test_q1, X_test_q2)
    
    # Clear GPU memory.
    K.clear_session()
    del X_fold_train_q1, X_fold_train_q2
    del X_fold_val_q1, X_fold_val_q2
    del model
    gc.collect()

In [ ]:
cv_score = log_loss(y_train, y_train_oofp)
print('CV score:', cv_score)

Save features


In [ ]:
feature_names = [feature_list_id]

In [ ]:
features_train = y_train_oofp.reshape((-1, 1))

In [ ]:
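# Average the per-fold test predictions into a single feature column.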
features_test = np.mean(y_test_oofp, axis=1).reshape((-1, 1))

In [ ]:
project.save_features(features_train, features_test, feature_names, feature_list_id)