Hyperparameter Optimization for XGBoost

What options are there for tuning?

All right! XGBoost has about 20 parameters (see the snippet right after this list for how to print the defaults of your installed version):

  1. base_score
  2. colsample_bylevel
  3. colsample_bytree
  4. gamma
  5. learning_rate
  6. max_delta_step
  7. max_depth
  8. min_child_weight
  9. missing
  10. n_estimators
  11. nthread
  12. objective
  13. reg_alpha
  14. reg_lambda
  15. scale_pos_weight
  16. seed
  17. silent
  18. subsample
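
For reference, the sklearn-style wrapper exposes these as keyword arguments; a quick way to print the defaults of whatever xgboost version you have installed (a minimal sketch):

In [ ]:
# Show the default values of the sklearn-style wrapper's parameters (depends on the installed xgboost version).
import xgboost as xgb
print(xgb.XGBRegressor().get_params())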

Let's say we pick 12 of them for tuning, each with 5-10 possible values. An exhaustive grid then has 5^12 to 10^12 combinations. If checking one case takes 10 s, the 5^12 grid already needs roughly 77 years, and the 10^12 grid over 300,000 years :).

This is far too long... but there is a third option: Bayesian optimization.
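
The arithmetic behind those estimates, as a quick back-of-the-envelope sketch (assuming 12 tuned parameters and about 10 seconds per evaluation):

In [ ]:
# Rough cost of an exhaustive grid search: values_per_param ** n_params evaluations.
seconds_per_case = 10
n_params = 12
for values_per_param in (5, 10):
    cases = values_per_param ** n_params
    years = cases * seconds_per_case / (60 * 60 * 24 * 365)
    print(values_per_param, cases, round(years, 1))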


In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np
import seaborn as sns

from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

%matplotlib inline

In [2]:
train = pd.read_csv('bike.csv')
train['datetime'] = pd.to_datetime(train['datetime'])
train['day'] = train['datetime'].map(lambda x: x.day)

Modeling


In [6]:
def assign_test_samples(data, test_fraction=0.3, seed=1):
    # Mark a random subset of days as the held-out test set.
    days = data.day.unique()
    np.random.seed(seed)
    np.random.shuffle(days)
    test_days = days[: int(len(days) * test_fraction)]

    data['is_test'] = data.day.isin(test_days)


def select_features(data):
    # Keep numeric/boolean columns, excluding the raw targets and their log transforms.
    columns = data.columns[(data.dtypes == np.int64) | (data.dtypes == np.float64) | (data.dtypes == np.bool_)].values
    return [feat for feat in columns if feat not in ['count', 'casual', 'registered'] and 'log' not in feat]

def get_X_y(data, target_variable):
    features = select_features(data)
        
    X = data[features].values
    y = data[target_variable].values
    
    return X,y

def train_test_split(train, target_variable):
    df_train = train[train.is_test == False]
    df_test  = train[train.is_test == True]
    
    X_train, y_train = get_X_y(df_train, target_variable)
    X_test, y_test = get_X_y(df_test, target_variable)
    
    return X_train, X_test, y_train, y_test



def fit_and_predict(train, model, target_variable):
    X_train, X_test, y_train, y_test = train_test_split(train, target_variable)
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    return (y_test, y_pred)

def post_pred(y_pred):
    y_pred[y_pred < 0] = 0
    return y_pred

def rmsle(y_true, y_pred, y_pred_only_positive=True):
    if y_pred_only_positive: y_pred = post_pred(y_pred)
        
    diff = np.log(y_pred+1) - np.log(y_true+1)
    mean_error = np.square(diff).mean()
    return np.sqrt(mean_error)

assign_test_samples(train)
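
As a quick sanity check of the metric on toy data (an illustrative sketch, not part of the pipeline above): identical predictions score 0, and the score grows as predictions drift away from the true values.

In [ ]:
# Toy example for rmsle: a perfect prediction scores 0.0, a poor one scores noticeably higher.
y_true_toy = np.array([10.0, 20.0, 30.0])
print(rmsle(y_true_toy, np.array([10.0, 20.0, 30.0])))
print(rmsle(y_true_toy, np.array([5.0, 40.0, 60.0])))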

In [4]:
def etl_datetime(df):
    df['year'] = df['datetime'].map(lambda x: x.year)
    df['month'] = df['datetime'].map(lambda x: x.month)

    df['hour'] = df['datetime'].map(lambda x: x.hour)
    df['minute'] = df['datetime'].map(lambda x: x.minute)
    df['dayofweek'] = df['datetime'].map(lambda x: x.dayofweek)
    df['weekend'] = df['datetime'].map(lambda x: x.dayofweek in [5,6])

    
etl_datetime(train)

train['{0}_log'.format('count')] = train['count'].map(lambda x: np.log2(x) )

for name in ['registered', 'casual']:
    train['{0}_log'.format(name)] = train[name].map(lambda x: np.log2(x+1) )
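
The models below are trained on these log targets, so their predictions have to be mapped back with np.exp2(pred) - 1 before scoring (as the objective function does); a minimal round-trip check of that inverse, using the casual column as an example:

In [ ]:
# log2(x + 1) is undone by exp2(y) - 1, so log-scale predictions can be mapped back to counts.
x = train['casual'].values
assert np.allclose(np.exp2(np.log2(x + 1)) - 1, x)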

Tuning hyperparameters using Bayesian optimization


In [42]:
def objective(space):
    # One evaluation for hyperopt: fit on the log targets, score RMSLE on the raw count.
    model = xgb.XGBRegressor(
        max_depth = int(space['max_depth']),
        n_estimators = int(space['n_estimators']),
        subsample = space['subsample'],
        colsample_bytree = space['colsample_bytree'],
        learning_rate = space['learning_rate'],
        reg_alpha = space['reg_alpha']
    )

    (_, registered_pred) = fit_and_predict(train, model, 'registered_log')
    (_, casual_pred) = fit_and_predict(train, model, 'casual_log')

    y_test = train[train.is_test == True]['count']
    y_pred = (np.exp2(registered_pred) - 1) + (np.exp2(casual_pred) - 1)

    score = rmsle(y_test, y_pred)
    print("SCORE:", score)

    return {'loss': score, 'status': STATUS_OK}

space = {
    'max_depth': hp.quniform('x_max_depth', 2, 20, 1),
    'n_estimators': hp.quniform('n_estimators', 100, 1000, 1),
    'subsample': hp.uniform('x_subsample', 0.8, 1),
    'colsample_bytree': hp.uniform('x_colsample_bytree', 0.1, 1),
    'learning_rate': hp.uniform('x_learning_rate', 0.01, 0.1),
    'reg_alpha': hp.uniform('x_reg_alpha', 0.1, 1)
}


trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=15,
            trials=trials)

print(best)


SCORE: 0.327769943579
SCORE: 0.402119793524
SCORE: 0.441702998659
SCORE: 0.344952075056
SCORE: 0.332483052772
SCORE: 0.415230694098
SCORE: 0.326159133525
SCORE: 0.366755440868
SCORE: 0.336209948966
SCORE: 0.320813982928
SCORE: 0.33925039026
SCORE: 0.363387131966
SCORE: 0.324682064912
SCORE: 0.382678760754
SCORE: 0.488176057958
{'x_learning_rate': 0.0803514512536536, 'x_reg_alpha': 0.44303008763740737, 'n_estimators': 421.0, 'x_max_depth': 17.0, 'x_subsample': 0.9561807797584932, 'x_colsample_bytree': 0.8214374064161822}
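
hyperopt reports the best point using the labels from the space (mostly 'x_'-prefixed), and the quniform values come back as floats, so the integer parameters need casting before they go into the model. A sketch of how one might refit with this best point, reusing the helpers and targets defined above:

In [ ]:
# Refit a model with the best hyperparameters found and score it the same way as objective() does.
best_model = xgb.XGBRegressor(
    max_depth=int(best['x_max_depth']),
    n_estimators=int(best['n_estimators']),
    subsample=best['x_subsample'],
    colsample_bytree=best['x_colsample_bytree'],
    learning_rate=best['x_learning_rate'],
    reg_alpha=best['x_reg_alpha']
)

(_, registered_pred) = fit_and_predict(train, best_model, 'registered_log')
(_, casual_pred) = fit_and_predict(train, best_model, 'casual_log')
y_pred = (np.exp2(registered_pred) - 1) + (np.exp2(casual_pred) - 1)
print(rmsle(train[train.is_test == True]['count'], y_pred))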

In [ ]: