CV: Feature Dropout

Try eliminating random subsets of features and re-running cross-validation to check whether the model overfits to specific features.

Imports


In [ ]:
from pygoose import *

In [ ]:
import gc

In [ ]:
import lightgbm as lgb

In [ ]:
from sklearn.model_selection import StratifiedKFold

Config


In [ ]:
project = kg.Project.discover()

Model-specific parameters.


In [ ]:
NUM_FOLDS = 5

Search-specific parameters: the fraction of features to drop in each experiment, the number of bins used to bucket feature importances, and the number of random experiments to run.


In [ ]:
DROPOUT_FEATURE_FRACTION = 0.2
NUM_IMPORTANCE_BINS = 5
NUM_SEARCH_ITERATIONS = 50

Make subsequent runs consistent and reproducible.


In [ ]:
RANDOM_SEED = 100500
np.random.seed(RANDOM_SEED)

Read Data


In [ ]:
feature_lists = [
    'simple_summaries',
    'jaccard_ngrams',
    'fuzzy',
    'tfidf',
    'lda',
    'nlp_tags',
    'wordnet_similarity',
    'phrase_embedding',
    'wmd',
    'wm_intersect',
    
    '3rdparty_abhishek',
    '3rdparty_dasolmar_whq',
    '3rdparty_mephistopheies',
    '3rdparty_image_similarity',
    
    'magic_pagerank',
    'magic_frequencies',
    'magic_cooccurrence_matrix',
    
    'oofp_nn_mlp_with_magic',
    'oofp_nn_cnn_with_magic',
    'oofp_nn_bi_lstm_with_magic',
    'oofp_nn_siamese_lstm_attention',
]

In [ ]:
df_train, df_test, _ = project.load_feature_lists(feature_lists)

In [ ]:
y_train = kg.io.load(project.features_dir + 'y_train.pickle')
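
A quick sanity check that the merged feature matrix and the labels line up row-wise:


In [ ]:
# Sanity check: features and labels must align row-wise.
assert len(df_train) == len(y_train)
print(f'Train: {df_train.shape}, Test: {df_test.shape}')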

Compute Dropout Probabilities

Bucket the prior feature importances into equal-width bins and assign each feature a dropout probability inversely proportional to its bin: the more important a feature, the less likely it is to be dropped.


In [ ]:
gbm_importances = {
    # Place prior feature importances here
}
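
If the priors come from a previously trained LightGBM model, they can be read straight from the saved booster. A minimal sketch, assuming a saved model artifact (the 'lgb_full_features.txt' file name is hypothetical):


In [ ]:
# Sketch (assumption): populate the priors from a previously trained booster.
# 'lgb_full_features.txt' is a hypothetical artifact name; substitute your own.
prior_model = lgb.Booster(model_file=project.temp_dir + 'lgb_full_features.txt')
gbm_importances = dict(zip(
    prior_model.feature_name(),
    prior_model.feature_importance(importance_type='gain'),
))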

In [ ]:
imps = pd.DataFrame(
    list(gbm_importances.items()),
    columns=['feature', 'importance'],
)

In [ ]:
imps['importance_bin'] = pd.cut(
    imps['importance'],
    NUM_IMPORTANCE_BINS,
    labels=list(range(1, NUM_IMPORTANCE_BINS + 1)),
)
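
As a toy illustration of the binning (the importance values below are made up): pd.cut splits the value range into equal-width bins and tags each value with the matching label.


In [ ]:
# Toy example with made-up importances: values near the minimum land in
# bin 1, values near the maximum land in bin 5.
pd.cut(pd.Series([0.0, 0.1, 0.5, 0.9, 1.0]), 5, labels=[1, 2, 3, 4, 5]).tolist()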

In [ ]:
importance_bin = dict(zip(imps['feature'], imps['importance_bin']))

In [ ]:
# Features without a prior importance default to a middle bin.
dropout_probs = np.array([
    1 / importance_bin.get(feature_name, NUM_IMPORTANCE_BINS // 2 + 1)
    for feature_name in df_train.columns.tolist()
])

Normalize so that the vector sums to 1 (np.random.choice requires the probabilities to sum to 1).


In [ ]:
dropout_probs /= np.sum(dropout_probs)
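
As an illustrative check of the weighting scheme: with NUM_IMPORTANCE_BINS = 5, a bottom-bin feature gets pre-normalization weight 1/1 = 1.0 while a top-bin feature gets 1/5 = 0.2, so the most important features are sampled for removal five times less often.


In [ ]:
# Illustrative check: a valid probability vector for np.random.choice.
assert np.all(dropout_probs > 0)
assert np.isclose(np.sum(dropout_probs), 1.0)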

Run Search

Each experiment drops a sampled feature subset, trains LightGBM with stratified K-fold cross-validation, and reports the mean train/validation log loss across folds.


In [ ]:
def run_experiment(dropout_feature_list):
    # Remove the sampled features and keep the rest as a raw matrix.
    X_train = df_train.drop(dropout_feature_list, axis=1).values

    kfold = StratifiedKFold(
        n_splits=NUM_FOLDS,
        shuffle=True,
        random_state=RANDOM_SEED,
    )

    # Note: uses the pre-4.0 LightGBM API
    # ('evals_result', 'early_stopping_rounds', 'verbose_eval' arguments).
    lgb_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting': 'gbdt',
        'device': 'cpu',
        'feature_fraction': 0.5,
        'num_leaves': 64,
        'learning_rate': 0.03,
        'num_boost_round': 3000,
        'early_stopping_rounds': 5,
        'verbose': 1,
        'bagging_fraction_seed': RANDOM_SEED,
        'feature_fraction_seed': RANDOM_SEED,
    }

    experiment_scores = []

    for ix_train, ix_val in kfold.split(X_train, y_train):
        X_fold_train = X_train[ix_train]
        X_fold_val = X_train[ix_val]

        y_fold_train = y_train[ix_train]
        y_fold_val = y_train[ix_val]

        lgb_data_train = lgb.Dataset(X_fold_train, y_fold_train)
        lgb_data_val = lgb.Dataset(X_fold_val, y_fold_val)
        evals_result = {}

        model = lgb.train(
            lgb_params,
            lgb_data_train,
            valid_sets=[lgb_data_train, lgb_data_val],
            evals_result=evals_result,
            num_boost_round=lgb_params['num_boost_round'],
            early_stopping_rounds=lgb_params['early_stopping_rounds'],
            verbose_eval=False,
        )

        # LightGBM reports the training set as 'training' and the
        # second validation set as 'valid_1'.
        fold_train_scores = evals_result['training'][lgb_params['metric']]
        fold_val_scores = evals_result['valid_1'][lgb_params['metric']]

        # Keep the scores at the last (early-stopped) iteration of this fold.
        experiment_scores.append([
            fold_train_scores[-1],
            fold_val_scores[-1],
        ])

    # Average train/validation scores across the folds.
    final_experiment_score = np.mean(np.array(experiment_scores), axis=0)

    # Clean up.
    del X_train
    del model
    gc.collect()

    return [
        dropout_feature_list,
        final_experiment_score[0],
        final_experiment_score[1],
    ]

In [ ]:
all_experiments_log = []

In [ ]:
for i in range(NUM_SEARCH_ITERATIONS):
    print(f'Iteration {i + 1} of {NUM_SEARCH_ITERATIONS}')

    # Sample a feature subset to drop, biased towards less important features.
    dropout_list = np.random.choice(
        df_train.columns,
        size=int(len(df_train.columns) * DROPOUT_FEATURE_FRACTION),
        replace=False,
        p=dropout_probs,
    )

    print(f'Removing {list(dropout_list)}')
    experiment_result = run_experiment(dropout_list)
    _, result_train, result_val = experiment_result

    print(f'Train: {result_train:.6f}   Val: {result_val:.6f}   Diff: {result_val - result_train:.6f}')

    # Checkpoint the full experiment log to disk after every iteration.
    all_experiments_log.append(experiment_result)
    pd.DataFrame(
        all_experiments_log,
        columns=['dropped_features', 'train_logloss', 'val_logloss'],
    ).to_csv(project.temp_dir + 'dropout_experiments.log', index=False)

    print()
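
Once the search finishes, the log can be ranked to see which dropout subsets generalized best. A minimal sketch, reusing the column names written to the checkpoint file above:


In [ ]:
# Rank experiments by validation loss and by the train/val gap to spot
# feature subsets whose removal did not hurt generalization, or even helped.
df_log = pd.DataFrame(
    all_experiments_log,
    columns=['dropped_features', 'train_logloss', 'val_logloss'],
)
df_log['overfit_gap'] = df_log['val_logloss'] - df_log['train_logloss']
df_log.sort_values('val_logloss').head(10)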