Optimize Model Hyperparameters with SigOpt

Imports


In [ ]:
from pygoose import *

In [ ]:
import datetime
import os
import pprint

In [ ]:
import lightgbm as lgb

In [ ]:
from sklearn.model_selection import StratifiedKFold

In [ ]:
from sigopt import Connection

Config


In [ ]:
project = kg.Project.discover()

Model-specific parameters.


In [ ]:
NUM_FOLDS = 5

SigOpt-specific parameters.

Specify an experiment ID only if you want to continue an existing experiment. Otherwise, a new one will be created.


In [ ]:
# Set an experiment ID to resume an existing SigOpt experiment; None creates a new one.
SIGOPT_EXPERIMENT_ID = None
SIGOPT_EXPERIMENT_NAME = 'Master LightGBM Model'

# Never hardcode credentials in a notebook: read the API token from the
# environment, falling back to the old placeholder so the cell still runs
# (the SigOpt calls below will fail with a clear auth error instead).
SIGOPT_TOKEN = os.environ.get('SIGOPT_TOKEN', 'YOUR_TOKEN_HERE')

# Number of suggest -> evaluate -> observe rounds to run against SigOpt.
NUM_OPTIMIZATION_ITERATIONS = 50

Make subsequent runs consistent and reproducible.


In [ ]:
# Seed the global NumPy RNG so CV splits and model runs are reproducible.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

Read Data


In [ ]:
# Names of the precomputed feature sets to load and concatenate into the
# train/test design matrices (loaded by project.load_feature_lists below).
feature_lists = [
    # Hand-engineered text-pair features.
    'simple_summaries',
    'jaccard_ngrams',
    'fuzzy',
    'tfidf',
    'lda',
    'nlp_tags',
    'wordnet_similarity',
    'phrase_embedding',
    'wmd',
    'wm_intersect',
    
    # Features adapted from third-party solutions.
    '3rdparty_abhishek',
    '3rdparty_dasolmar_whq',
    '3rdparty_mephistopheies',
    '3rdparty_image_similarity',
    
    # Dataset-structure ("magic") features.
    'magic_pagerank',
    'magic_frequencies',
    'magic_cooccurrence_matrix',
    
    # Out-of-fold predictions from neural network models.
    'oofp_nn_mlp_with_magic',
    'oofp_nn_cnn_with_magic',
    'oofp_nn_bi_lstm_with_magic',
    'oofp_nn_siamese_lstm_attention',
]

In [ ]:
df_train, df_test, _ = project.load_feature_lists(feature_lists)

In [ ]:
# Convert the feature DataFrames to raw NumPy arrays for LightGBM.
X_train = df_train.values
X_test = df_test.values

In [ ]:
y_train = kg.io.load(project.features_dir + 'y_train.pickle')

Partition the data


In [ ]:
# Stratified folds preserve the class balance of y_train in every split;
# the fixed random_state keeps the partition identical across runs.
kfold = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=RANDOM_SEED)

Set up the experiment


In [ ]:
conn = Connection(client_token=SIGOPT_TOKEN)

In [ ]:
if SIGOPT_EXPERIMENT_ID:
    # Resume a previously created experiment by ID.
    experiment = conn.experiments(id=SIGOPT_EXPERIMENT_ID).fetch()

else:
    # Define the hyperparameter search space and start a fresh experiment.
    search_space = [
        dict(name='feature_fraction', type='double', bounds=dict(min=0.1, max=1.0)),
        dict(name='lambda_l2', type='double', bounds=dict(min=0.0, max=50.0)),
        dict(name='num_leaves', type='int', bounds=dict(min=8, max=512)),
    ]
    experiment = conn.experiments().create(
        name=SIGOPT_EXPERIMENT_NAME,
        parameters=search_space,
    )
    print("Created experiment: https://sigopt.com/experiment/" + experiment.id)

In [ ]:
def evaluate_model(candidate_params):
    """
    Score one hyperparameter candidate with stratified K-fold cross-validation.

    Parameters
    ----------
    candidate_params : mapping
        SigOpt suggestion assignments; must provide 'num_leaves',
        'feature_fraction' and 'lambda_l2'.

    Returns
    -------
    float
        The negated mean validation log-loss across folds. Negated because
        SigOpt maximizes the reported value while log-loss is minimized.
    """
    # Training controls are passed to lgb.train() directly, not duplicated
    # inside the params dict as both a parameter and a keyword argument.
    metric = 'binary_logloss'
    num_boost_round = 3000
    early_stopping_rounds = 5

    # Loop-invariant: the candidate's parameters do not change across folds.
    lgb_params = {
        'objective': 'binary',
        'metric': metric,
        'boosting': 'gbdt',
        'device': 'cpu',
        'num_leaves': candidate_params['num_leaves'],
        'feature_fraction': candidate_params['feature_fraction'],
        'lambda_l2': candidate_params['lambda_l2'],
        'learning_rate': 0.03,
        'verbose': 1,
        'bagging_fraction_seed': RANDOM_SEED,
        'feature_fraction_seed': RANDOM_SEED,
    }

    cv_scores = []

    for fold_num, (ix_train, ix_val) in enumerate(kfold.split(X_train, y_train)):
        X_fold_train = X_train[ix_train]
        X_fold_val = X_train[ix_val]

        y_fold_train = y_train[ix_train]
        y_fold_val = y_train[ix_val]

        lgb_data_train = lgb.Dataset(X_fold_train, y_fold_train)
        lgb_data_val = lgb.Dataset(X_fold_val, y_fold_val)
        evals_result = {}

        model = lgb.train(
            lgb_params,
            lgb_data_train,
            valid_sets=[lgb_data_train, lgb_data_val],
            evals_result=evals_result,
            num_boost_round=num_boost_round,
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=False,
        )

        # Per-round metric histories; the last entry is the best/stopped round.
        fold_train_scores = evals_result['training'][metric]
        fold_val_scores = evals_result['valid_1'][metric]

        print('Fold {}: {} rounds, training loss {:.6f}, validation loss {:.6f}'.format(
            fold_num + 1,
            len(fold_train_scores),
            fold_train_scores[-1],
            fold_val_scores[-1],
        ))

        cv_scores.append(fold_val_scores[-1])

    return -np.mean(cv_scores)

In [ ]:
for iteration_index in range(NUM_OPTIMIZATION_ITERATIONS):
    print(f'Iteration {iteration_index + 1} of {NUM_OPTIMIZATION_ITERATIONS}')

    # Ask the optimizer for the next hyperparameter candidate to try.
    suggestion = conn.experiments(experiment.id).suggestions().create()

    print('Suggestion: ')
    pprint.pprint(suggestion.assignments)

    # Cross-validate the candidate; higher is better (negated log-loss).
    score = evaluate_model(suggestion.assignments)
    print(f'Score: {score:.6f}')
    print()

    # Report the observed value so the optimizer can refine its surrogate model.
    conn.experiments(experiment.id).observations().create(
        suggestion=suggestion.id,
        value=score,
    )