In [1]:
import numpy as np
import pandas as pd
import gc
import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))

# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, nan_as_category = True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns
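
# A quick sanity check of one_hot_encoder on a toy frame (hypothetical data,
# not from the competition files):
#   toy = pd.DataFrame({'gender': ['F', 'M', None], 'age': [25, 40, 31]})
#   toy, new_cols = one_hot_encoder(toy)
#   # new_cols == ['gender_F', 'gender_M', 'gender_nan']; numeric 'age' is untouched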

# Preprocess application_train.csv and application_test.csv
def application_train_test(num_rows = None, nan_as_category = False):
    # Read data and merge
    df = pd.read_csv('./input/application_train.csv', nrows= num_rows)
    test_df = pd.read_csv('./input/application_test.csv', nrows= num_rows)
    print("Train samples: {}, test samples: {}".format(len(df), len(test_df)))
    df = pd.concat([df, test_df]).reset_index()
    # Optional: Remove 4 applications with XNA CODE_GENDER (train set)
    df = df[df['CODE_GENDER'] != 'XNA']
    
    docs = [_f for _f in df.columns if 'FLAG_DOC' in _f]
    live = [_f for _f in df.columns if ('FLAG_' in _f) and ('FLAG_DOC' not in _f) and ('_FLAG_' not in _f)]
    
    # NaN values for DAYS_EMPLOYED: 365243 (placeholder) -> nan
    df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)

    inc_by_org = df[['AMT_INCOME_TOTAL', 'ORGANIZATION_TYPE']].groupby('ORGANIZATION_TYPE').median()['AMT_INCOME_TOTAL']

    df['NEW_CREDIT_TO_ANNUITY_RATIO'] = df['AMT_CREDIT'] / df['AMT_ANNUITY']
    df['NEW_CREDIT_TO_GOODS_RATIO'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']
    df['NEW_DOC_IND_KURT'] = df[docs].kurtosis(axis=1)
    df['NEW_LIVE_IND_SUM'] = df[live].sum(axis=1)
    df['NEW_INC_PER_CHLD'] = df['AMT_INCOME_TOTAL'] / (1 + df['CNT_CHILDREN'])
    df['NEW_INC_BY_ORG'] = df['ORGANIZATION_TYPE'].map(inc_by_org)
    df['NEW_EMPLOY_TO_BIRTH_RATIO'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['NEW_ANNUITY_TO_INCOME_RATIO'] = df['AMT_ANNUITY'] / (1 + df['AMT_INCOME_TOTAL'])
    df['NEW_SOURCES_PROD'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_2'] * df['EXT_SOURCE_3']
    df['NEW_EXT_SOURCES_MEAN'] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
    df['NEW_SCORES_STD'] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
    df['NEW_SCORES_STD'] = df['NEW_SCORES_STD'].fillna(df['NEW_SCORES_STD'].mean())
    df['NEW_CAR_TO_BIRTH_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_BIRTH']
    df['NEW_CAR_TO_EMPLOY_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_EMPLOYED']
    df['NEW_PHONE_TO_BIRTH_RATIO'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_BIRTH']
    df['NEW_PHONE_TO_BIRTH_RATIO_EMPLOYER'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_EMPLOYED']
    df['NEW_CREDIT_TO_INCOME_RATIO'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']
    
    # Categorical features with Binary encode (0 or 1; two categories)
    for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
        df[bin_feature], uniques = pd.factorize(df[bin_feature])
    # Categorical features with One-Hot encode
    df, cat_cols = one_hot_encoder(df, nan_as_category)
    # Drop all FLAG_DOCUMENT_* columns except FLAG_DOCUMENT_3
    drop_columns = ['FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5',
                    'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8',
                    'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11',
                    'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14',
                    'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17',
                    'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20',
                    'FLAG_DOCUMENT_21']
    df = df.drop(drop_columns, axis=1)
    del test_df
    gc.collect()
    return df

# Preprocess bureau.csv and bureau_balance.csv
def bureau_and_balance(num_rows = None, nan_as_category = True):
    bureau = pd.read_csv('./input/bureau.csv', nrows = num_rows)
    bb = pd.read_csv('./input/bureau_balance.csv', nrows = num_rows)
    bb, bb_cat = one_hot_encoder(bb, nan_as_category)
    bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category)
    
    # Bureau balance: Perform aggregations and merge with bureau.csv
    bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']}
    for col in bb_cat:
        bb_aggregations[col] = ['mean']
    bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)
    bb_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in bb_agg.columns.tolist()])
    # Join on the SK_ID_BUREAU column (bb_agg is indexed by SK_ID_BUREAU;
    # an index-to-index merge would pair it with bureau's row numbers)
    bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')
    bureau.drop(['SK_ID_BUREAU'], axis=1, inplace= True)
    del bb, bb_agg
    gc.collect()
    
    # Bureau and bureau_balance numeric features
    num_aggregations = {
        'DAYS_CREDIT': [ 'mean', 'var'],
        'DAYS_CREDIT_ENDDATE': [ 'mean'],
        'DAYS_CREDIT_UPDATE': ['mean'],
        'CREDIT_DAY_OVERDUE': ['mean'],
        'AMT_CREDIT_MAX_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM': [ 'mean', 'sum'],
        'AMT_CREDIT_SUM_DEBT': [ 'mean', 'sum'],
        'AMT_CREDIT_SUM_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
        'AMT_ANNUITY': ['max', 'mean'],
        'CNT_CREDIT_PROLONG': ['sum'],
        'MONTHS_BALANCE_MIN': ['min'],
        'MONTHS_BALANCE_MAX': ['max'],
        'MONTHS_BALANCE_SIZE': ['mean', 'sum']
    }
    # Bureau and bureau_balance categorical features
    cat_aggregations = {}
    for cat in bureau_cat:
        cat_aggregations[cat] = ['mean']
    for cat in bb_cat:
        cat_aggregations[cat + "_MEAN"] = ['mean']
    
    bureau_agg = bureau.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    bureau_agg.columns = pd.Index(['BURO_' + e[0] + "_" + e[1].upper() for e in bureau_agg.columns.tolist()])
    # Bureau: Active credits - using only numerical aggregations
    active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]
    active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations)
    active_agg.columns = pd.Index(['ACTIVE_' + e[0] + "_" + e[1].upper() for e in active_agg.columns.tolist()])
    bureau_agg = pd.merge(bureau_agg,active_agg, how='left',  left_index=True, right_index=True)
    del active, active_agg
    gc.collect()
    # Bureau: Closed credits - using only numerical aggregations
    closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]
    closed_agg = closed.groupby('SK_ID_CURR').agg(num_aggregations)
    closed_agg.columns = pd.Index(['CLOSED_' + e[0] + "_" + e[1].upper() for e in closed_agg.columns.tolist()])
    bureau_agg = pd.merge(bureau_agg,closed_agg, how='left',  left_index=True, right_index=True)
    del closed, closed_agg, bureau
    gc.collect()
    return bureau_agg

# Preprocess previous_applications.csv
def previous_applications(num_rows = None, nan_as_category = True):
    prev = pd.read_csv('./input/previous_application.csv', nrows = num_rows)
    prev, cat_cols = one_hot_encoder(prev, nan_as_category)
    # Replace the 365243 placeholder in the DAYS_* columns with NaN
    for col in ['DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
                'DAYS_LAST_DUE', 'DAYS_TERMINATION']:
        prev[col].replace(365243, np.nan, inplace= True)
    # Add feature: ratio of amount applied for to amount actually credited
    prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']
    # Previous applications numeric features
    num_aggregations = {
        'AMT_ANNUITY': [ 'max', 'mean'],
        'AMT_APPLICATION': [ 'max','mean'],
        'AMT_CREDIT': [ 'max', 'mean'],
        'APP_CREDIT_PERC': [ 'max', 'mean'],
        'AMT_DOWN_PAYMENT': [ 'max', 'mean'],
        'AMT_GOODS_PRICE': [ 'max', 'mean'],
        'HOUR_APPR_PROCESS_START': [ 'max', 'mean'],
        'RATE_DOWN_PAYMENT': [ 'max', 'mean'],
        'DAYS_DECISION': [ 'max', 'mean'],
        'CNT_PAYMENT': ['mean', 'sum'],
    }
    # Previous applications categorical features
    cat_aggregations = {}
    for cat in cat_cols:
        cat_aggregations[cat] = ['mean']
    
    prev_agg = prev.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])
    # Previous Applications: Approved Applications - only numerical features
    approved = prev[prev['NAME_CONTRACT_STATUS_Approved'] == 1]
    approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)
    approved_agg.columns = pd.Index(['APPROVED_' + e[0] + "_" + e[1].upper() for e in approved_agg.columns.tolist()])
    prev_agg = pd.merge(prev_agg,approved_agg, how='left',  left_index=True, right_index=True)
    # Previous Applications: Refused Applications - only numerical features
    refused = prev[prev['NAME_CONTRACT_STATUS_Refused'] == 1]
    refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)
    refused_agg.columns = pd.Index(['REFUSED_' + e[0] + "_" + e[1].upper() for e in refused_agg.columns.tolist()])
    prev_agg = pd.merge(prev_agg,refused_agg, how='left',  left_index=True, right_index=True)
    del refused, refused_agg, approved, approved_agg, prev
    gc.collect()
    return prev_agg

# Preprocess POS_CASH_balance.csv
def pos_cash(num_rows = None, nan_as_category = True):
    pos = pd.read_csv('./input/POS_CASH_balance.csv', nrows = num_rows)
    pos, cat_cols = one_hot_encoder(pos, nan_as_category)
    # Features
    aggregations = {
        'MONTHS_BALANCE': ['max', 'mean', 'size'],
        'SK_DPD': ['max', 'mean'],
        'SK_DPD_DEF': ['max', 'mean']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']
    
    pos_agg = pos.groupby('SK_ID_CURR').agg(aggregations)
    pos_agg.columns = pd.Index(['POS_' + e[0] + "_" + e[1].upper() for e in pos_agg.columns.tolist()])
    # Count pos cash accounts
    pos_agg['POS_COUNT'] = pos.groupby('SK_ID_CURR').size()
    del pos
    gc.collect()
    return pos_agg
    
# Preprocess installments_payments.csv
def installments_payments(num_rows = None, nan_as_category = True):
    ins = pd.read_csv('./input/installments_payments.csv', nrows = num_rows)
    ins, cat_cols = one_hot_encoder(ins, nan_as_category)
    # Percentage and difference paid in each installment (amount paid and installment value)
    ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
    ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']
    # Days past due and days before due (no negative values)
    ins['DPD'] = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
    ins['DBD'] = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
    ins['DPD'] = ins['DPD'].apply(lambda x: x if x > 0 else 0)
    ins['DBD'] = ins['DBD'].apply(lambda x: x if x > 0 else 0)
    # Features: Perform aggregations
    aggregations = {
        'NUM_INSTALMENT_VERSION': ['nunique'],
        'DPD': ['max', 'mean', 'sum','min','std' ],
        'DBD': ['max', 'mean', 'sum','min','std'],
        'PAYMENT_PERC': [ 'max','mean',  'var','min','std'],
        'PAYMENT_DIFF': [ 'max','mean', 'var','min','std'],
        'AMT_INSTALMENT': ['max', 'mean', 'sum','min','std'],
        'AMT_PAYMENT': ['min', 'max', 'mean', 'sum','std'],
        'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum','std']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']
    ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
    ins_agg.columns = pd.Index(['INSTAL_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])
    # Count installments accounts
    ins_agg['INSTAL_COUNT'] = ins.groupby('SK_ID_CURR').size()
    del ins
    gc.collect()
    return ins_agg

# Preprocess credit_card_balance.csv
def credit_card_balance(num_rows = None, nan_as_category = True):
    cc = pd.read_csv('./input/credit_card_balance.csv', nrows = num_rows)
    cc, cat_cols = one_hot_encoder(cc, nan_as_category)
    # General aggregations
    cc.drop(['SK_ID_PREV'], axis= 1, inplace = True)
    cc_agg = cc.groupby('SK_ID_CURR').agg([ 'max', 'mean', 'sum', 'var'])
    cc_agg.columns = pd.Index(['CC_' + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()])
    # Count credit card lines
    cc_agg['CC_COUNT'] = cc.groupby('SK_ID_CURR').size()
    del cc
    gc.collect()
    return cc_agg

# LightGBM GBDT with KFold or Stratified KFold
# Parameters from Tilii kernel: https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code
def kfold_lightgbm(df, num_folds, stratified = False, debug= False):
    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()].copy()
    test_df = df[df['TARGET'].isnull()].copy()
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=47)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=47)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=4,
            #is_unbalance=True,
            n_estimators=10000,
            learning_rate=0.01,
            num_leaves=32,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.04,
            reg_lambda=0.073,
            min_split_gain=0.0222415,
            min_child_weight=40,
            silent=-1,
            verbose=-1,
            #scale_pos_weight=11
            )

        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            eval_metric= 'auc', verbose= 1000, early_stopping_rounds= 200)

        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    # Write submission file and plot feature importance
    if not debug:
        test_df['TARGET'] = sub_preds
        test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index= False)
    display_importances(feature_importance_df)
    return feature_importance_df

# Display/plot feature importance
def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances01.png')

In [2]:
import csv
import json
import lightgbm as lgb
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll.stochastic import sample

# Record results
trials = Trials()

# Create the algorithm
tpe_algorithm = tpe.suggest

Two pieces are needed for the Bayesian optimization loop:

1) A Trials object that stores the dictionary returned from the objective function for every evaluation.

2) The optimization algorithm: the method for constructing the surrogate function (probability model) and selecting the next set of hyperparameters to evaluate in the objective function. Hyperopt offers two choices: random search and the Tree-structured Parzen Estimator (TPE).
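To make the moving parts concrete, here is a minimal, self-contained sketch of the fmin/Trials/TPE loop on a toy quadratic, independent of the credit data:

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

def toy_objective(x):
    # TPE proposes x; we return the value to minimize plus a status flag
    return {'loss': (x - 2) ** 2, 'status': STATUS_OK}

toy_trials = Trials()  # records every (x, loss) pair evaluated
toy_best = fmin(fn=toy_objective, space=hp.uniform('x', -5, 5),
                algo=tpe.suggest, max_evals=50, trials=toy_trials)
print(toy_best)  # best 'x' found, close to 2.0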

In [3]:
def objective(hyperparameters):
    """Run LightGBM CV with the given hyperparameters and return 1 - AUC as the loss."""
    global ITERATION
    global df

    ITERATION += 1

    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    gc.collect()
    N_FOLDS = 5
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    train_set = lgb.Dataset(train_df[feats], train_df['TARGET'])

    # Using early stopping to find number of trees trained
    if 'n_estimators' in hyperparameters:
        del hyperparameters['n_estimators']

    # Retrieve the subsample
    subsample = hyperparameters['boosting_type'].get('subsample', 1.0)
        
    # Extract the boosting type and subsample to top level keys
    hyperparameters['boosting_type'] = hyperparameters['boosting_type']['boosting_type']
    hyperparameters['subsample'] = subsample
    
    # Make sure parameters that need to be integers are integers
    for parameter_name in ['num_leaves', 'subsample_for_bin', 'min_child_samples']:
        hyperparameters[parameter_name] = int(hyperparameters[parameter_name])
    print("Starting LightGBM tuning. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))

    cv_results = lgb.cv(hyperparameters, train_set, num_boost_round = 10000, nfold = N_FOLDS, 
                        early_stopping_rounds = 100, metrics = 'auc', seed = 50)
 
    
    # Extract the best score
    best_score = cv_results['auc-mean'][-1]
    
    # Loss must be minimized
    loss = 1 - best_score
    
    # Boosting rounds that returned the highest cv score
    n_estimators = len(cv_results['auc-mean'])
    
    # Add the number of estimators to the hyperparameters
    hyperparameters['n_estimators'] = n_estimators
    # Append this trial to the same CSV that main() initialises with a header
    OUT_FILE = 'bayesian_trials_1000.csv'
    with open(OUT_FILE, 'a') as of_connection:
        writer = csv.writer(of_connection)
        writer.writerow([loss, hyperparameters, ITERATION, best_score])

    # Dictionary with information for evaluation
    return {'loss': loss, 'hyperparameters': hyperparameters, 'iteration': ITERATION, 'status': STATUS_OK}
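
A quick smoke test of the objective on a single sampled configuration (a sketch; it assumes df, ITERATION, and the space dictionary defined in main() below already exist):

ITERATION = 0
result = objective(sample(space))
print(result['loss'], result['hyperparameters']['n_estimators'])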

In [4]:
import ast

def evaluate(results, name):
    """Evaluate model on test data using hyperparameters in results
       Return dataframe of hyperparameters"""
    
    new_results = results.copy()
    # String to dictionary
    new_results['hyperparameters'] = new_results['hyperparameters'].map(ast.literal_eval)
    
    # Sort with best values on top
    new_results = new_results.sort_values('score', ascending = False).reset_index(drop = True)
    
    # Print out cross validation high score
    print('The highest cross validation score from {} was {:.5f} found on iteration {}.'.format(name, new_results.loc[0, 'score'], new_results.loc[0, 'iteration']))
    
    # Use best hyperparameters to create a model
    hyperparameters = new_results.loc[0, 'hyperparameters']
    model = lgb.LGBMClassifier(**hyperparameters)
    
    # Train and make predictions (train_features / train_labels / test_features /
    # test_labels are assumed to exist already, e.g. from a holdout split)
    model.fit(train_features, train_labels)
    preds = model.predict_proba(test_features)[:, 1]
    
    print('ROC AUC from {} on test data = {:.5f}.'.format(name, roc_auc_score(test_labels, preds)))
    
    # Create dataframe of hyperparameters (one row per evaluated set)
    hyp_df = pd.DataFrame(list(new_results['hyperparameters']))
        
    # Put the iteration and score in the hyperparameter dataframe
    hyp_df['iteration'] = new_results['iteration']
    hyp_df['score'] = new_results['score']
    
    return hyp_df
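
evaluate() relies on a held-out split that is not created anywhere in this notebook; a minimal sketch of producing one (the variable names match those used inside evaluate, everything else is an assumption):

from sklearn.model_selection import train_test_split

labeled = df[df['TARGET'].notnull()]
feats = [f for f in labeled.columns if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']]
train_features, test_features, train_labels, test_labels = train_test_split(
    labeled[feats], labeled['TARGET'], test_size=0.2, random_state=42, stratify=labeled['TARGET'])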

In [5]:
def main(debug = False):
    num_rows = 10000 if debug else None
    global df, ITERATION
    
    df = application_train_test(num_rows)
    with timer("Process bureau and bureau_balance"):
        bureau = bureau_and_balance(num_rows)
        print("Bureau df shape:", bureau.shape)
        df = df.join(bureau, how='left', on='SK_ID_CURR')
        del bureau
        gc.collect()
    with timer("Process previous_applications"):
        prev = previous_applications(num_rows)
        print("Previous applications df shape:", prev.shape)
        df = df.join(prev, how='left', on='SK_ID_CURR')
        del prev
        gc.collect()
    with timer("Process POS-CASH balance"):
        pos = pos_cash(num_rows)
        print("Pos-cash balance df shape:", pos.shape)
        
        df = df.join(pos, how='left', on='SK_ID_CURR')
        del pos
        gc.collect()
    with timer("Process installments payments"):
        ins = installments_payments(num_rows)
        print("Installments payments df shape:", ins.shape)
        
        df = df.join(ins, how='left', on='SK_ID_CURR')
        del ins
        gc.collect()
    with timer("Process credit card balance"):
        cc = credit_card_balance(num_rows)
        print("Credit card balance df shape:", cc.shape)
        
        df = df.join(cc, how='left', on='SK_ID_CURR')
        del cc
        gc.collect()
    with timer("Run LightGBM with kfold"):
        #feat_importance = kfold_lightgbm(df, num_folds= 5, stratified= False, debug= debug)
        space = {
        'boosting_type': hp.choice('boosting_type', 
                                                [{'boosting_type': 'gbdt', 'subsample': hp.uniform('gdbt_subsample', 0.5, 1)}, 
                                                 {'boosting_type': 'dart', 'subsample': hp.uniform('dart_subsample', 0.5, 1)} ]),
        'num_leaves': hp.quniform('num_leaves', 20, 150, 1),
        'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.3)),
        'subsample_for_bin': hp.quniform('subsample_for_bin', 20000, 300000, 20000),
        'min_child_samples': hp.quniform('min_child_samples', 20, 500, 5),
        'reg_alpha': hp.uniform('reg_alpha', 0.0, 0.6),
        'reg_lambda': hp.uniform('reg_lambda', 0.0, 0.6),
        'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
        'is_unbalance': hp.choice('is_unbalance', [True, False]),
        }
        # Example draw from the space, showing how the nested boosting_type
        # dictionary is flattened into top-level keys (objective() does the same)
        x = sample(space)
        subsample = x['boosting_type'].get('subsample', 1.0)
        x['boosting_type'] = x['boosting_type']['boosting_type']
        x['subsample'] = subsample
        
        MAX_EVALS = 501
        
        # Create a new file and open a connection
        OUT_FILE = 'bayesian_trials_1000.csv'
        of_connection = open(OUT_FILE, 'w')
        writer = csv.writer(of_connection)
        
        
        ITERATION = 0

        # Write column names
        headers = ['loss', 'hyperparameters', 'iteration',   'score']
        writer.writerow(headers)
        of_connection.close()
        
        trials = Trials()
        
        best = fmin(fn = objective, space = space, algo = tpe.suggest,  trials = trials, max_evals = MAX_EVALS)
        trials_dict = sorted(trials.results, key = lambda x: x['loss'])
        # Sanity-check the objective on one more sampled configuration
        results = objective(sample(space))
        
        print('Finished, best results')
        print(trials_dict[:1])
        
         # Save the trial results
        with open('trials.json', 'w') as f:
            f.write(json.dumps(trials_dict))
        bayes_results = pd.read_csv('bayesian_trials_1000.csv').sort_values('score', ascending = False).reset_index()
        bayes_params = evaluate(bayes_results, name = 'Bayesian')
        
        # Plot of scores over the course of searching
        best_bayes_params = bayes_params.iloc[bayes_params['score'].idxmax(), :].copy()
        plt.figure(figsize=(8, 8))
        plt.plot(bayes_params['iteration'], bayes_params['score'], 'bo', alpha=0.5)
        plt.scatter(best_bayes_params['iteration'], best_bayes_params['score'], marker='*', s=400, c='orange', edgecolor='k')
        plt.xlabel('Iteration'); plt.ylabel('ROC AUC'); plt.title("Validation ROC AUC versus Iteration")

if __name__ == "__main__":
    submission_file_name = "submission_kernel03.csv"
    with timer("Full model run"):
        main()


Train samples: 307511, test samples: 48744
Bureau df shape: (305811, 95)
Process bureau and bureau_balance - done in 22s
Previous applications df shape: (338857, 219)
Process previous_applications - done in 23s
Pos-cash balance df shape: (337252, 18)
Process POS-CASH balance - done in 15s
Installments payments df shape: (339587, 36)
Process installments payments - done in 33s
Credit card balance df shape: (103558, 113)
Process credit card balance - done in 18s
Starting LightGBM tuning. Train shape: (307507, 721), test shape: (48744, 721)
... (the "Starting LightGBM tuning" line repeats once per tuning iteration)
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-5-2b11848bccaf> in <module>()
    101     with timer("Full model run"):
    102         global df,ITERATION
--> 103         main()

<ipython-input-5-2b11848bccaf> in main(debug)
     76         trials = Trials()
     77 
---> 78         best = fmin(fn = objective, space = space, algo = tpe.suggest,  trials = trials, max_evals = MAX_EVALS)
     79         trials_dict = sorted(trials.results, key = lambda x: x['loss'])
     80         # Test the objective function

~\AppData\Local\Continuum\miniconda3\lib\site-packages\hyperopt\fmin.py in fmin(fn, space, algo, max_evals, trials, rstate, allow_trials_fmin, pass_expr_memo_ctrl, catch_eval_exceptions, verbose, return_argmin)
    305             verbose=verbose,
    306             catch_eval_exceptions=catch_eval_exceptions,
--> 307             return_argmin=return_argmin,
    308         )
    309 

~\AppData\Local\Continuum\miniconda3\lib\site-packages\hyperopt\base.py in fmin(self, fn, space, algo, max_evals, rstate, verbose, pass_expr_memo_ctrl, catch_eval_exceptions, return_argmin)
    633             pass_expr_memo_ctrl=pass_expr_memo_ctrl,
    634             catch_eval_exceptions=catch_eval_exceptions,
--> 635             return_argmin=return_argmin)
    636 
    637 

~\AppData\Local\Continuum\miniconda3\lib\site-packages\hyperopt\fmin.py in fmin(fn, space, algo, max_evals, trials, rstate, allow_trials_fmin, pass_expr_memo_ctrl, catch_eval_exceptions, verbose, return_argmin)
    318                     verbose=verbose)
    319     rval.catch_eval_exceptions = catch_eval_exceptions
--> 320     rval.exhaust()
    321     if return_argmin:
    322         return trials.argmin

~\AppData\Local\Continuum\miniconda3\lib\site-packages\hyperopt\fmin.py in exhaust(self)
    197     def exhaust(self):
    198         n_done = len(self.trials)
--> 199         self.run(self.max_evals - n_done, block_until_done=self.async)
    200         self.trials.refresh()
    201         return self

~\AppData\Local\Continuum\miniconda3\lib\site-packages\hyperopt\fmin.py in run(self, N, block_until_done)
    171             else:
    172                 # -- loop over trials and do the jobs directly
--> 173                 self.serial_evaluate()
    174 
    175             if stopped:

~\AppData\Local\Continuum\miniconda3\lib\site-packages\hyperopt\fmin.py in serial_evaluate(self, N)
     90                 ctrl = base.Ctrl(self.trials, current_trial=trial)
     91                 try:
---> 92                     result = self.domain.evaluate(spec, ctrl)
     93                 except Exception as e:
     94                     logger.info('job exception: %s' % str(e))

~\AppData\Local\Continuum\miniconda3\lib\site-packages\hyperopt\base.py in evaluate(self, config, ctrl, attach_attachments)
    838                 memo=memo,
    839                 print_node_on_error=self.rec_eval_print_node_on_error)
--> 840             rval = self.fn(pyll_rval)
    841 
    842         if isinstance(rval, (float, int, np.number)):

<ipython-input-3-d545f37ad4a9> in objective(hyperparameters)
     33 
     34     cv_results = lgb.cv(hyperparameters, train_set, num_boost_round = 10000, nfold = N_FOLDS, 
---> 35                         early_stopping_rounds = 100, metrics = 'auc', seed = 50)
     36 
     37 

~\AppData\Local\Continuum\miniconda3\lib\site-packages\lightgbm\engine.py in cv(params, train_set, num_boost_round, folds, nfold, stratified, shuffle, metrics, fobj, feval, init_model, feature_name, categorical_feature, early_stopping_rounds, fpreproc, verbose_eval, show_stdv, seed, callbacks)
    445                                     end_iteration=num_boost_round,
    446                                     evaluation_result_list=None))
--> 447         cvfolds.update(fobj=fobj)
    448         res = _agg_cv_result(cvfolds.eval_valid(feval))
    449         for _, key, mean, _, std in res:

~\AppData\Local\Continuum\miniconda3\lib\site-packages\lightgbm\engine.py in handlerFunction(*args, **kwargs)
    244             ret = []
    245             for booster in self.boosters:
--> 246                 ret.append(getattr(booster, name)(*args, **kwargs))
    247             return ret
    248         return handlerFunction

~\AppData\Local\Continuum\miniconda3\lib\site-packages\lightgbm\basic.py in update(self, train_set, fobj)
   1519             _safe_call(_LIB.LGBM_BoosterUpdateOneIter(
   1520                 self.handle,
-> 1521                 ctypes.byref(is_finished)))
   1522             self.__is_predicted_cur_iter = [False for _ in range_(self.__num_dataset)]
   1523             return is_finished.value == 1

KeyboardInterrupt: 
Reference: https://www.kaggle.com/alexandrnikitin/xgboost-hyperparameter-optimization

In [6]:
import pandas as pd
bayes_results = pd.read_csv('bayes_test2_temp.csv').sort_values('score', ascending = False).reset_index()   
new_results = bayes_results.copy()

# String to dictionary
new_results['hyperparameters'] = new_results['hyperparameters'].map(ast.literal_eval)
    
# Sort with best values on top
new_results = new_results.sort_values('score', ascending = False).reset_index(drop = True)
    
# Print out cross validation high score
print('The highest cross validation score from {} was {:.5f} found on iteration {}.'.format('Bayesian optimization', new_results.loc[0, 'score'], new_results.loc[0, 'iteration']))
list(bayes_results[bayes_results.iteration == 47].hyperparameters)


The highest cross validation score from Bayesian optimization was 0.78831 found on iteration 47.
Out[6]:
["{'boosting_type': 'gbdt', 'colsample_bytree': 0.6014642048462456, 'is_unbalance': False, 'learning_rate': 0.01222878983248616, 'min_child_samples': 500, 'num_leaves': 126, 'reg_alpha': 0.07936876591402212, 'reg_lambda': 0.28430717120489335, 'subsample_for_bin': 160000, 'subsample': 0.619083596793821, 'metric': 'auc', 'verbose': 1, 'n_estimators': 1115}"]