In addition to the MLP over the question texts, we'll feed the leaky ("magic") features into the network by concatenating them with the intermediate (pooled) question representations before the dense stack.
This utility package imports numpy, pandas, matplotlib and a helper kg module into the root namespace.
In [1]:
from pygoose import *
In [2]:
import gc
In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import *
In [5]:
from keras import backend as K
from keras.models import Model, Sequential
from keras.layers import *
from keras.optimizers import *
from keras.callbacks import EarlyStopping, ModelCheckpoint
Automatically discover the paths to various data folders and compose the project structure.
In [6]:
project = kg.Project.discover()
Identifier for storing these features on disk and referring to them later.
In [7]:
feature_list_id = 'oofp_nn_mlp_with_magic'
Make subsequent NN runs reproducible.
In [8]:
RANDOM_SEED = 42
In [9]:
np.random.seed(RANDOM_SEED)
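Note that seeding NumPy alone does not make Keras runs fully deterministic: the backend keeps its own random state, and some GPU kernels are non-deterministic regardless. A minimal sketch, assuming a TensorFlow 1.x backend (the seeding API differs across TF versions):

import random
import tensorflow as tf

# Sketch (assumes TF 1.x): seed Python's RNG and TF's graph-level RNG too.
random.seed(RANDOM_SEED)
tf.set_random_seed(RANDOM_SEED)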
Word embedding lookup matrix.
In [10]:
embedding_matrix = kg.io.load(project.aux_dir + 'fasttext_vocab_embedding_matrix.pickle')
Padded sequences of word indices for every question.
In [11]:
X_train_q1 = kg.io.load(project.preprocessed_data_dir + 'sequences_q1_fasttext_train.pickle')
X_train_q2 = kg.io.load(project.preprocessed_data_dir + 'sequences_q2_fasttext_train.pickle')
In [12]:
X_test_q1 = kg.io.load(project.preprocessed_data_dir + 'sequences_q1_fasttext_test.pickle')
X_test_q2 = kg.io.load(project.preprocessed_data_dir + 'sequences_q2_fasttext_test.pickle')
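As a quick, illustrative sanity check, all four matrices should be padded to the same sequence length:

# Illustrative check: every sequence matrix should share the same width.
print(X_train_q1.shape, X_train_q2.shape, X_test_q1.shape, X_test_q2.shape)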
In [13]:
y_train = kg.io.load(project.features_dir + 'y_train.pickle')
Load the precomputed "magic" (leaky) features: question frequencies and the question co-occurrence matrix.
In [14]:
magic_feature_lists = [
    'magic_frequencies',
    'magic_cooccurrence_matrix',
]
In [15]:
X_train_magic, X_test_magic, _ = project.load_feature_lists(magic_feature_lists)
In [16]:
X_train_magic = X_train_magic.values
X_test_magic = X_test_magic.values
In [17]:
scaler = StandardScaler()
scaler.fit(np.vstack([X_train_magic, X_test_magic]))
X_train_magic = scaler.transform(X_train_magic)
X_test_magic = scaler.transform(X_test_magic)
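As an illustrative sanity check, since the scaler was fitted on the stacked train and test matrices, the stacked transformed matrix should now have roughly zero mean and unit variance per column:

# Illustrative check: columns of the stacked, scaled matrix should be
# approximately zero-mean with unit variance.
combined = np.vstack([X_train_magic, X_test_magic])
print(combined.mean(axis=0).round(4))
print(combined.std(axis=0).round(4))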
Word embedding properties.
In [18]:
EMBEDDING_DIM = embedding_matrix.shape[-1]
VOCAB_LENGTH = embedding_matrix.shape[0]
MAX_SEQUENCE_LENGTH = X_train_q1.shape[-1]
In [19]:
print(EMBEDDING_DIM, VOCAB_LENGTH, MAX_SEQUENCE_LENGTH)
In [20]:
def create_model_question_branch():
    input_q = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

    # Look up the (frozen) fastText vector for every word index.
    embedding_q = Embedding(
        VOCAB_LENGTH,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False,
    )(input_q)

    # Apply the same dense transformation to every timestep independently.
    timedist_q = TimeDistributed(Dense(
        EMBEDDING_DIM,
        activation='relu',
    ))(embedding_q)

    # Max-over-time pooling: collapse the sequence into a single vector.
    lambda_q = Lambda(
        lambda x: K.max(x, axis=1),
        output_shape=(EMBEDDING_DIM, )
    )(timedist_q)

    output_q = lambda_q
    return input_q, output_q
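The Lambda layer here implements max-over-time pooling: each of the EMBEDDING_DIM features keeps its maximum activation across all timesteps, collapsing the sequence into a fixed-size vector. A sketch of an equivalent, more declarative formulation using Keras' built-in layer (a drop-in replacement for the Lambda, assuming the same timedist_q tensor):

# Sketch: GlobalMaxPooling1D takes the max over the time axis (axis 1),
# matching the Lambda layer above.
output_q = GlobalMaxPooling1D()(timedist_q)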
In [21]:
def create_model(params):
    # Two identical (but independently weighted) branches, one per question.
    q1_input, q1_output = create_model_question_branch()
    q2_input, q2_output = create_model_question_branch()

    # Inject the leaky "magic" features alongside the question vectors.
    magic_input = Input(shape=(X_train_magic.shape[-1], ))
    merged_inputs = concatenate([q1_output, q2_output, magic_input])

    # Four fully-connected blocks: Dense -> BatchNorm -> ReLU.
    dense_1 = Dense(params['num_dense_1'])(merged_inputs)
    bn_1 = BatchNormalization()(dense_1)
    relu_1 = Activation('relu')(bn_1)

    dense_2 = Dense(params['num_dense_2'])(relu_1)
    bn_2 = BatchNormalization()(dense_2)
    relu_2 = Activation('relu')(bn_2)

    dense_3 = Dense(params['num_dense_3'])(relu_2)
    bn_3 = BatchNormalization()(dense_3)
    relu_3 = Activation('relu')(bn_3)

    dense_4 = Dense(params['num_dense_4'])(relu_3)
    bn_4 = BatchNormalization()(dense_4)
    relu_4 = Activation('relu')(bn_4)

    bn_final = BatchNormalization()(relu_4)
    output = Dense(1, activation='sigmoid')(bn_final)

    model = Model(
        inputs=[q1_input, q2_input, magic_input],
        outputs=output,
    )
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(lr=0.01),
        metrics=['accuracy']
    )
    return model
In [22]:
def predict(model, X_q1, X_q2, X_magic):
    """
    Mirror the pairs, compute two separate predictions, and average them.
    """
    y1 = model.predict([X_q1, X_q2, X_magic], batch_size=1024, verbose=1).reshape(-1)
    y2 = model.predict([X_q2, X_q1, X_magic], batch_size=1024, verbose=1).reshape(-1)
    return (y1 + y2) / 2
In [23]:
NUM_FOLDS = 5
In [24]:
kfold = StratifiedKFold(
    n_splits=NUM_FOLDS,
    shuffle=True,
    random_state=RANDOM_SEED
)
Create placeholders for the out-of-fold predictions: a single vector for the training set (filled in fold by fold), and one column per fold for the test set (averaged later).
In [25]:
y_train_oofp = np.zeros_like(y_train, dtype='float64')
In [26]:
y_test_oofp = np.zeros((len(X_test_q1), NUM_FOLDS))
In [27]:
BATCH_SIZE = 2048
In [28]:
MAX_EPOCHS = 200
In [29]:
model_params = {
    'num_dense_1': 400,
    'num_dense_2': 200,
    'num_dense_3': 400,
    'num_dense_4': 100,
}
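Before running the full cross-validation loop, it can be useful to inspect the resulting topology. An illustrative throwaway build:

# Illustrative: build a disposable model just to print layer shapes and
# parameter counts, then release the backend session.
model = create_model(model_params)
model.summary()
del model
K.clear_session()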
The path where the best weights of the current model will be saved.
In [30]:
model_checkpoint_path = project.temp_dir + 'fold-checkpoint-' + feature_list_id + '.h5'
In [31]:
%%time

# Iterate through folds.
for fold_num, (ix_train, ix_val) in enumerate(kfold.split(X_train_q1, y_train)):

    # Augment the training set by mirroring the pairs.
    X_fold_train_q1 = np.vstack([X_train_q1[ix_train], X_train_q2[ix_train]])
    X_fold_train_q2 = np.vstack([X_train_q2[ix_train], X_train_q1[ix_train]])
    X_fold_train_magic = np.vstack([X_train_magic[ix_train], X_train_magic[ix_train]])

    X_fold_val_q1 = np.vstack([X_train_q1[ix_val], X_train_q2[ix_val]])
    X_fold_val_q2 = np.vstack([X_train_q2[ix_val], X_train_q1[ix_val]])
    X_fold_val_magic = np.vstack([X_train_magic[ix_val], X_train_magic[ix_val]])

    # Ground truth should also be "mirrored".
    y_fold_train = np.concatenate([y_train[ix_train], y_train[ix_train]])
    y_fold_val = np.concatenate([y_train[ix_val], y_train[ix_val]])

    print()
    print(f'Fitting fold {fold_num + 1} of {kfold.n_splits}')
    print()

    # Compile a new model.
    model = create_model(model_params)

    # Train.
    model.fit(
        [X_fold_train_q1, X_fold_train_q2, X_fold_train_magic], y_fold_train,
        validation_data=([X_fold_val_q1, X_fold_val_q2, X_fold_val_magic], y_fold_val),
        batch_size=BATCH_SIZE,
        epochs=MAX_EPOCHS,
        verbose=1,
        callbacks=[
            # Stop training when the validation loss stops improving.
            EarlyStopping(
                monitor='val_loss',
                min_delta=0.001,
                patience=3,
                verbose=1,
                mode='auto',
            ),
            # Save the weights of the best epoch.
            ModelCheckpoint(
                model_checkpoint_path,
                monitor='val_loss',
                save_best_only=True,
                verbose=2,
            ),
        ],
    )

    # Restore the best epoch.
    model.load_weights(model_checkpoint_path)

    # Compute out-of-fold predictions.
    y_train_oofp[ix_val] = predict(model, X_train_q1[ix_val], X_train_q2[ix_val], X_train_magic[ix_val])
    y_test_oofp[:, fold_num] = predict(model, X_test_q1, X_test_q2, X_test_magic)

    # Clear GPU memory.
    K.clear_session()
    del X_fold_train_q1, X_fold_train_q2, X_fold_train_magic
    del X_fold_val_q1, X_fold_val_q2, X_fold_val_magic
    del model
    gc.collect()
In [32]:
cv_score = log_loss(y_train, y_train_oofp)
print('CV score:', cv_score)
In [33]:
feature_names = [feature_list_id]
In [34]:
features_train = y_train_oofp.reshape((-1, 1))
In [35]:
features_test = np.mean(y_test_oofp, axis=1).reshape((-1, 1))
In [36]:
project.save_features(features_train, features_test, feature_names, feature_list_id)
In [37]:
pd.DataFrame(features_test).plot.hist()
Out[37]:
[histogram of the averaged test-set predictions]