In [ ]:
from pygoose import *
In [ ]:
import datetime
import pprint
In [ ]:
import lightgbm as lgb
In [ ]:
from sklearn.model_selection import StratifiedKFold
In [ ]:
from sigopt import Connection
In [ ]:
# Discover the pygoose project workspace (paths to data/feature directories).
project = kg.Project.discover()
Model-specific parameters.
In [ ]:
# Number of cross-validation folds used when scoring each hyperparameter set.
NUM_FOLDS = 5
SigOpt-specific parameters.
Specify an experiment ID only if you want to continue an existing experiment. Otherwise, a new one will be created.
In [ ]:
# If set, resume this existing SigOpt experiment; if None, a new one is created below.
SIGOPT_EXPERIMENT_ID = None
SIGOPT_EXPERIMENT_NAME = 'Master LightGBM Model'
# NOTE(review): placeholder — must be replaced with a real SigOpt API token before running.
SIGOPT_TOKEN = 'YOUR_TOKEN_HERE'
# Total number of suggestion -> evaluate -> observe cycles to run.
NUM_OPTIMIZATION_ITERATIONS = 50
Make subsequent runs consistent and reproducible.
In [ ]:
# Fix the NumPy RNG so subsequent runs are consistent and reproducible.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
In [ ]:
# Names of the precomputed feature lists to load and stack into the model input.
feature_lists = [
    # Text statistics and pairwise similarity features.
    'simple_summaries',
    'jaccard_ngrams',
    'fuzzy',
    'tfidf',
    'lda',
    'nlp_tags',
    'wordnet_similarity',
    'phrase_embedding',
    'wmd',
    'wm_intersect',
    # Features adapted from third-party solutions.
    '3rdparty_abhishek',
    '3rdparty_dasolmar_whq',
    '3rdparty_mephistopheies',
    '3rdparty_image_similarity',
    # "Magic" (leak-based) structural features.
    'magic_pagerank',
    'magic_frequencies',
    'magic_cooccurrence_matrix',
    # Neural-network predictions (presumably out-of-fold — "oofp"; verify upstream).
    'oofp_nn_mlp_with_magic',
    'oofp_nn_cnn_with_magic',
    'oofp_nn_bi_lstm_with_magic',
    'oofp_nn_siamese_lstm_attention',
]
In [ ]:
# Load and concatenate the selected feature lists into train/test DataFrames.
df_train, df_test, _ = project.load_feature_lists(feature_lists)
In [ ]:
# Extract raw NumPy matrices from the DataFrames for LightGBM consumption.
X_train = df_train.values
X_test = df_test.values
In [ ]:
# Load the precomputed training labels from the project's feature directory.
y_train = kg.io.load(project.features_dir + 'y_train.pickle')
In [ ]:
# Stratified splitter: keeps the class balance consistent across folds,
# seeded so every optimization iteration sees the same splits.
kfold = StratifiedKFold(
    n_splits=NUM_FOLDS,
    shuffle=True,
    random_state=RANDOM_SEED
)
In [ ]:
# Open an authenticated connection to the SigOpt API.
conn = Connection(client_token=SIGOPT_TOKEN)
In [ ]:
if SIGOPT_EXPERIMENT_ID:
    # Resume a previously created experiment so its observation history is kept.
    experiment = conn.experiments(id=SIGOPT_EXPERIMENT_ID).fetch()
else:
    # Create a fresh experiment defining the search space for the
    # tunable LightGBM hyperparameters.
    experiment = conn.experiments().create(
        name=SIGOPT_EXPERIMENT_NAME,
        parameters=[
            dict(name='feature_fraction', type='double', bounds=dict(min=0.1, max=1.0)),
            dict(name='lambda_l2', type='double', bounds=dict(min=0.0, max=50.0)),
            dict(name='num_leaves', type='int', bounds=dict(min=8, max=512)),
        ],
    )
    # f-string (consistent with the rest of this notebook) also coerces the
    # ID to str, so this works regardless of the type the API returns.
    print(f'Created experiment: https://sigopt.com/experiment/{experiment.id}')
In [ ]:
def evaluate_model(candidate_params):
    """Score one SigOpt hyperparameter suggestion with stratified K-fold CV.

    Trains a LightGBM binary classifier on each fold with early stopping and
    collects the final validation log loss per fold.

    Parameters:
        candidate_params: mapping with 'num_leaves', 'feature_fraction'
            and 'lambda_l2' values suggested by SigOpt.

    Returns:
        The negated mean validation log loss across folds. Negated because
        SigOpt maximizes the reported metric, while lower log loss is better.
    """
    # Training-control values are passed to lgb.train directly rather than
    # duplicated inside the params dict (the original did both, which is
    # redundant and triggers LightGBM parameter-alias warnings).
    num_boost_round = 3000
    early_stopping_rounds = 5

    # The hyperparameters are identical for every fold, so build the dict
    # once instead of rebuilding it inside the loop.
    lgb_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting': 'gbdt',
        'device': 'cpu',
        'num_leaves': candidate_params['num_leaves'],
        'feature_fraction': candidate_params['feature_fraction'],
        'lambda_l2': candidate_params['lambda_l2'],
        'learning_rate': 0.03,
        'verbose': 1,
        # Fixed seeds keep per-fold feature/bagging subsampling reproducible.
        'bagging_fraction_seed': RANDOM_SEED,
        'feature_fraction_seed': RANDOM_SEED,
    }

    cv_scores = []
    for fold_num, (ix_train, ix_val) in enumerate(kfold.split(X_train, y_train)):
        X_fold_train = X_train[ix_train]
        X_fold_val = X_train[ix_val]
        y_fold_train = y_train[ix_train]
        y_fold_val = y_train[ix_val]

        lgb_data_train = lgb.Dataset(X_fold_train, y_fold_train)
        lgb_data_val = lgb.Dataset(X_fold_val, y_fold_val)

        # lgb.train fills evals_result with per-iteration metric histories
        # for each dataset in valid_sets ('training' and 'valid_1').
        evals_result = {}
        lgb.train(
            lgb_params,
            lgb_data_train,
            valid_sets=[lgb_data_train, lgb_data_val],
            evals_result=evals_result,
            num_boost_round=num_boost_round,
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=False,
        )

        fold_train_scores = evals_result['training'][lgb_params['metric']]
        fold_val_scores = evals_result['valid_1'][lgb_params['metric']]
        print('Fold {}: {} rounds, training loss {:.6f}, validation loss {:.6f}'.format(
            fold_num + 1,
            len(fold_train_scores),
            fold_train_scores[-1],
            fold_val_scores[-1],
        ))
        # The last recorded value corresponds to the (early-stopped) final model.
        cv_scores.append(fold_val_scores[-1])

    return -np.mean(cv_scores)
In [ ]:
# SigOpt optimization loop: each pass requests a new hyperparameter
# suggestion, scores it with cross-validation, and reports the observed
# score back so the optimizer can refine subsequent suggestions.
iteration_index = 0
while iteration_index < NUM_OPTIMIZATION_ITERATIONS:
    print(f'Iteration {iteration_index + 1} of {NUM_OPTIMIZATION_ITERATIONS}')

    suggestion = conn.experiments(experiment.id).suggestions().create()
    print('Suggestion: ')
    pprint.pprint(suggestion.assignments)

    score = evaluate_model(suggestion.assignments)
    print(f'Score: {score:.6f}')
    print()

    # Report the result back, linked to the suggestion that produced it.
    conn.experiments(experiment.id).observations().create(
        suggestion=suggestion.id,
        value=score,
    )
    iteration_index += 1