Try eliminating random subsets of features and re-training the model to check whether any of them contribute to overfitting.
In [ ]:
from pygoose import *
In [ ]:
import gc
In [ ]:
import lightgbm as lgb
In [ ]:
from sklearn.model_selection import StratifiedKFold
In [ ]:
project = kg.Project.discover()
Model-specific parameters.
In [ ]:
NUM_FOLDS = 5
Search-specific parameters.
In [ ]:
DROPOUT_FEATURE_FRACTION = 0.2
NUM_IMPORTANCE_BINS = 5
NUM_SEARCH_ITERATIONS = 50
Make subsequent runs consistent and reproducible.
In [ ]:
RANDOM_SEED = 100500
np.random.seed(RANDOM_SEED)
In [ ]:
feature_lists = [
    'simple_summaries',
    'jaccard_ngrams',
    'fuzzy',
    'tfidf',
    'lda',
    'nlp_tags',
    'wordnet_similarity',
    'phrase_embedding',
    'wmd',
    'wm_intersect',
    '3rdparty_abhishek',
    '3rdparty_dasolmar_whq',
    '3rdparty_mephistopheies',
    '3rdparty_image_similarity',
    'magic_pagerank',
    'magic_frequencies',
    'magic_cooccurrence_matrix',
    'oofp_nn_mlp_with_magic',
    'oofp_nn_cnn_with_magic',
    'oofp_nn_bi_lstm_with_magic',
    'oofp_nn_siamese_lstm_attention',
]
In [ ]:
df_train, df_test, _ = project.load_feature_lists(feature_lists)
In [ ]:
y_train = kg.io.load(project.features_dir + 'y_train.pickle')
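Optional sanity check: the shapes of the merged feature matrices and the label balance (the StratifiedKFold splits below rely on the class labels). This cell is purely illustrative:
In [ ]:
# Optional: inspect the merged feature matrices and the class balance.
print(df_train.shape, df_test.shape)
print(pd.Series(y_train).value_counts(normalize=True))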
In [ ]:
gbm_importances = {
    # Place prior feature importances (feature name -> importance score) here.
}
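These importances can come from any earlier LightGBM run on the same feature matrix. A hedged sketch of how the dictionary could be filled, assuming a previously trained Booster named prior_model (which is not defined in this notebook):
In [ ]:
# Hypothetical: 'prior_model' is a LightGBM Booster trained on df_train in an earlier run.
# gbm_importances = dict(zip(
#     df_train.columns,
#     prior_model.feature_importance(importance_type='gain'),
# ))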
In [ ]:
imps = pd.DataFrame(
    [[feature, importance] for feature, importance in gbm_importances.items()],
    columns=['feature', 'importance'],
)
In [ ]:
imps['importance_bin'] = pd.cut(
    imps['importance'],
    NUM_IMPORTANCE_BINS,
    labels=list(range(1, NUM_IMPORTANCE_BINS + 1)),
)
In [ ]:
importance_bin = dict(zip(imps['feature'], imps['importance_bin']))
In [ ]:
# Features from lower importance bins get a higher chance of being dropped.
# Features without a prior importance score fall into the middle bin.
dropout_probs = np.array([
    1 / importance_bin.get(feature_name, NUM_IMPORTANCE_BINS // 2 + 1)
    for feature_name in df_train.columns.tolist()
])
Normalize the dropout probabilities so that they sum to 1.
In [ ]:
dropout_probs *= (1 / np.sum(dropout_probs))
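As a sanity check, the features with the highest dropout probability should be the ones from the lowest importance bins. A quick, purely illustrative way to inspect this:
In [ ]:
# Features most likely to be dropped (i.e. the ones binned as least important).
pd.Series(dropout_probs, index=df_train.columns).sort_values(ascending=False).head(10)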
In [ ]:
def run_experiment(dropout_feature_list):
    """Train a cross-validated LightGBM model without the given features and report its scores."""
    X_train = df_train.drop(dropout_feature_list, axis=1).values

    kfold = StratifiedKFold(
        n_splits=NUM_FOLDS,
        shuffle=True,
        random_state=RANDOM_SEED,
    )

    experiment_scores = []

    for fold_num, (ix_train, ix_val) in enumerate(kfold.split(X_train, y_train)):
        X_fold_train = X_train[ix_train]
        X_fold_val = X_train[ix_val]
        y_fold_train = y_train[ix_train]
        y_fold_val = y_train[ix_val]

        lgb_params = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'boosting': 'gbdt',
            'device': 'cpu',
            'feature_fraction': 0.5,
            'num_leaves': 64,
            'learning_rate': 0.03,
            'num_boost_round': 3000,
            'early_stopping_rounds': 5,
            'verbose': 1,
            'bagging_fraction_seed': RANDOM_SEED,
            'feature_fraction_seed': RANDOM_SEED,
        }

        lgb_data_train = lgb.Dataset(X_fold_train, y_fold_train)
        lgb_data_val = lgb.Dataset(X_fold_val, y_fold_val)

        evals_result = {}
        model = lgb.train(
            lgb_params,
            lgb_data_train,
            valid_sets=[lgb_data_train, lgb_data_val],
            evals_result=evals_result,
            num_boost_round=lgb_params['num_boost_round'],
            early_stopping_rounds=lgb_params['early_stopping_rounds'],
            verbose_eval=False,
        )

        # Keep the train/validation log loss at the last (early-stopped) iteration.
        fold_train_scores = evals_result['training'][lgb_params['metric']]
        fold_val_scores = evals_result['valid_1'][lgb_params['metric']]
        experiment_scores.append([
            fold_train_scores[-1],
            fold_val_scores[-1],
        ])

    # Compute final scores: mean train and validation loss across the folds.
    final_experiment_score = np.mean(np.array(experiment_scores), axis=0)

    # Clean up.
    del X_train
    del model
    gc.collect()

    return [
        dropout_feature_list,
        final_experiment_score[0],
        final_experiment_score[1],
    ]
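Before starting the search, it can help to record a baseline with no features removed, so the dropout runs have a reference point. A minimal sketch (this trains a full cross-validated model, so it takes about as long as one search iteration):
In [ ]:
# Baseline: keep the full feature set (drop nothing).
baseline_result = run_experiment([])
print(f'Baseline train: {baseline_result[1]:.6f}  val: {baseline_result[2]:.6f}')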
In [ ]:
all_experiments_log = []
In [ ]:
for i in range(NUM_SEARCH_ITERATIONS):
    print(f'Iteration {i + 1} of {NUM_SEARCH_ITERATIONS}')

    # Sample a random subset of features to drop, biased towards the less important ones.
    dropout_list = np.random.choice(
        df_train.columns,
        size=int(len(df_train.columns) * DROPOUT_FEATURE_FRACTION),
        replace=False,
        p=dropout_probs,
    )
    print(f'Removing {dropout_list}')

    experiment_result = run_experiment(dropout_list)
    _, result_train, result_val = experiment_result
    print(f'Train: {result_train:.6f} Val: {result_val:.6f} Diff: {result_val - result_train:.6f}')

    # Checkpoint the full experiment log after every iteration.
    all_experiments_log.append(experiment_result)
    pd.DataFrame(all_experiments_log).to_csv(project.temp_dir + 'dropout_experiments.log', index=False)
    print()
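Once the search finishes, the runs can be ranked by validation loss and by the train/validation gap to spot feature subsets whose removal does not hurt (or even helps) generalization. A minimal sketch; the column names below are only illustrative and are not part of the saved log file:
In [ ]:
df_experiments = pd.DataFrame(
    all_experiments_log,
    columns=['dropped_features', 'train_logloss', 'val_logloss'],
)
df_experiments['gap'] = df_experiments['val_logloss'] - df_experiments['train_logloss']

# The most promising experiments: low validation loss and a small train/val gap.
df_experiments.sort_values(['val_logloss', 'gap']).head()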