In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import pandas as pd
import xgboost 
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval
from functools import partial

from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression, RFE
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score

import sys
sys.path.insert(0,'..')

from vf_portalytics.model import PredictionModel
from vf_portalytics.tool import squared_error_objective_with_weighting, get_categorical_features
from vf_portalytics.transformers import get_transformer
from vf_portalytics.multi_model import MultiModel

Generate data


In [2]:
def make_dataset(n_samples, n_features, n_informative, **kwargs):
    """Build a synthetic regression dataset as a (DataFrame, Series) pair.

    Wraps sklearn's make_regression with a fixed random_state and noise level.
    Any extra keyword arguments become constant-valued columns on the frame
    (used below to tag every row with its category label).
    """
    features, targets = make_regression(
        n_samples=n_samples,
        n_features=n_features,
        noise=0.5,
        n_informative=n_informative,
        random_state=0,
    )
    frame = pd.DataFrame(
        features,
        columns=['feature_' + str(i) for i in range(n_features)],
    )
    frame = frame.assign(**kwargs)
    return frame, pd.Series(targets, name='target')


# Generate data for 4 different categories:
# a different number of samples per category, but the same number of features,
# since all groups belong to the same dataset.
n_features = 20
x1, y1 = make_dataset(n_samples=400, n_features=n_features, n_informative=10, category='A')
x2, y2 = make_dataset(n_samples=150, n_features=n_features, n_informative=8, category='B')
x3, y3 = make_dataset(n_samples=280, n_features=n_features, n_informative=7, category='C')
x4, y4 = make_dataset(n_samples=320, n_features=n_features, n_informative=12, category='D')

# combine the four groups into one dataset
total_x = pd.concat([x1, x2, x3, x4], axis=0, ignore_index=True).reset_index(drop=True)
total_y = pd.concat([y1, y2, y3, y4], axis=0, ignore_index=True).reset_index(drop=True)

# Discretize the first two features into three ordered groups (g1 < g2 < g3),
# cutting at mean +/- one standard deviation, to create categorical columns.
labels = ['g1', 'g2', 'g3']
bins = []
for i in range(2):
    col = total_x['feature_' + str(i)]
    bins.append([-np.inf, col.mean() - col.std(), col.mean() + col.std(), col.max()])
for i in range(2):
    name = 'feature_' + str(i)
    total_x[name] = pd.cut(total_x[name], bins=bins[i], labels=labels).astype('object')

In [3]:
# Overview of dataset: first rows transposed so all 20 features fit on screen.
# Bare last expression uses the notebook's rich display instead of print().
total_x.head().T


                   0         1          2          3          4
feature_0         g2        g2         g1         g2         g3
feature_1         g2        g2         g2         g3         g2
feature_2  -0.587016  0.910646    1.92655    0.14966   0.192616
feature_3   0.363698   1.71483  -0.800615  -0.351122   0.439522
feature_4     1.0723  -1.64805   0.439618  0.0737539   0.763541
feature_5   -1.32053  0.447212   0.447113  -0.168771 -0.0679451
feature_6   0.499084 -0.616121    2.53691  -0.389912    2.36023
feature_7  -0.739169 -0.397912    1.28298   0.151256  -0.100302
feature_8    0.57095   1.68579  -0.629217   0.817049  -0.489482
feature_9   0.312634 -0.832186    1.07982   -1.34878   0.306272
feature_10  0.199397  0.150818    -0.2299  -0.140706  -0.716444
feature_11 -0.177247 -0.356872   -1.30817   -1.01877   -1.00755
feature_12 -0.379554  0.545657    1.84741   0.134662   0.566869
feature_13 -0.155898   1.39856  -0.378736  -0.392786    2.16001
feature_14 -0.950518  -1.72323  -0.755243    1.62882   0.302561
feature_15 -0.503709  0.298714 -0.0836469   0.598213   -1.10907
feature_16    1.2413  -1.64922    0.28831   0.369657  -0.966063
feature_17 -0.463996   1.79754   0.139162  -0.346419    1.09234
feature_18   1.94576 -0.255471   0.688946  -0.260977  -0.506274
feature_19  0.306203   0.46629   0.180524   0.289598   0.555546
category           A         A          A          A          A

Declare group parameters


In [4]:
# Declare basic parameters
target = 'target'          # name of the prediction-target column
cat_feature = 'category'   # column that assigns each row to a model group

# distinct group labels; one sub-model is trained per cluster
clusters = total_x[cat_feature].unique()
clusters


Out[4]:
array(['A', 'B', 'C', 'D'], dtype=object)

Filtering

...


In [5]:
# Hold out 20% of the data as the test set, then carve 20% of the
# remainder into a validation set (~64% train / 16% val / 20% test).
train_x, test_x, train_y, test_y = train_test_split(total_x, total_y, test_size=0.2, random_state=1)
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.2, random_state=1)

# free the per-category intermediates; kernel memory persists across cells
del x1, x2, x3, x4
del y1, y2, y3, y4
del total_x, total_y

Feature selection


In [6]:
# Feature selection is done separately; categorical features are split
# into nominals and ordinals for the encoding step.
remaining_feat = train_x.columns.drop(cat_feature)

ordinal_features = ['feature_0', 'feature_1']  # ordered categories (g1 < g2 < g3)
nominal_features = []                          # no unordered categoricals here

# every group currently keeps the full feature set
selected_features = {cluster: remaining_feat for cluster in clusters}

Hyperparameter tuning


In [7]:
# space can be different for each group but let this for the future if it is needed
# Hyperopt search space, shared by all groups (could become per-group later).
# Includes both XGBoost hyperparameters and the choice of categorical encoder.
space = {
    'max_depth': hp.choice('max_depth', np.arange(2, 6, dtype=int)),
    'subsample': hp.quniform('subsample', 0.4, 0.6, 0.05),
    'min_child_weight': hp.quniform('min_child_weight', 1, 20, 1),
    'gamma': hp.quniform('gamma', 0.7, 1, 0.05),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.4, 0.6, 0.05),
    'learning_rate': hp.quniform('learning_rate', 0.001, 0.1, 0.01),
    'transformer_nominal': hp.choice('transformer_nominal', ['TargetEncoder', 'JamesSteinEncoder']),
    'transformer_ordinal': hp.choice('transformer_ordinal', ['OrdinalEncoder']),
    'under_predict_weight': hp.choice('under_predict_weight', [2.0, 2.5, 3.0]),
    'reg_alpha': hp.quniform('reg_alpha', 0.5, 1.0, 0.05),
    'reg_lambda': hp.quniform('reg_lambda', 1.0, 1.5, 0.05),
}

def score(params, train_x_group, train_y_group, val_x_group, val_y_group):
    """Train one encoder+XGBoost pipeline for a single group and score it.

    Parameters
    ----------
    params : dict
        One sample from the hyperopt `space` (XGBoost settings plus the
        encoder choices). NOTE: mutated in place — 'n_estimators' is added.
    train_x_group, train_y_group : training split for this group.
    val_x_group, val_y_group : validation split for this group.

    Returns
    -------
    dict with 'loss' (validation MAE at the best iteration), 'status',
    and 'n_estimators' (the early-stopped best iteration).
    """
    categorical_features = get_categorical_features(data=train_x_group)

    # preprocess ordinals — use the encoder sampled by hyperopt.
    # BUG FIX: these were hard-coded to 'OrdinalEncoder'/'TargetEncoder',
    # which made the 'transformer_*' entries of the search space no-ops
    # during tuning (while the final MultiModel *does* honor them).
    transformer_ordinal = get_transformer(params['transformer_ordinal'])
    gp_ordinal = [feature for feature in categorical_features if feature in ordinal_features]
    transformer_ordinal.cols = gp_ordinal

    # preprocess nominals — every categorical that is not ordinal goes here
    transformer_nominal = get_transformer(params['transformer_nominal'])
    gp_nominals = [feature for feature in categorical_features
                   if feature in nominal_features or feature not in gp_ordinal]
    transformer_nominal.cols = gp_nominals
    # each categorical feature must be handled by exactly one of the encoders
    assert set(gp_nominals + gp_ordinal) == set(categorical_features)

    gbm_model = xgboost.XGBRegressor(n_estimators = 1000, 
                                     objective = partial(squared_error_objective_with_weighting, 
                                                         under_predict_weight=params['under_predict_weight']), 
                                     max_depth = params['max_depth'],
                                     subsample = params['subsample'],
                                     min_child_weight = params['min_child_weight'],
                                     gamma = params['gamma'],
                                     colsample_bytree = params['colsample_bytree'],
                                     learning_rate = params['learning_rate'],
                                     reg_alpha = params['reg_alpha'],
                                     reg_lambda = params['reg_lambda'],                                    
                                     n_jobs = 8,
                                     seed = 1234,
                                     silent=True)

    pipeline = Pipeline([('transformer_ordinal', transformer_ordinal), 
                     ('transformer_nominal', transformer_nominal), 
                     ('estimator', gbm_model)])

    # Pre-transform both splits with the encoder steps only, so the raw
    # estimator can use them as an eval_set for early stopping.
    pipe_trf = Pipeline(pipeline.steps[:-1])
    pipe_trf = pipe_trf.fit(train_x_group, train_y_group)
    eval_set = [(pipe_trf.transform(train_x_group), train_y_group), (pipe_trf.transform(val_x_group), val_y_group)]
    eval_metric = ["mae"]

    pipeline.fit(train_x_group, train_y_group, 
                 estimator__early_stopping_rounds=30, 
                 estimator__eval_set=eval_set, 
                 estimator__eval_metric=eval_metric,
                 estimator__verbose=False)

    # best_iteration indexes into evals_result; 'validation_1' is the
    # validation split (second entry of eval_set above)
    n_estimators = pipeline.named_steps['estimator'].best_iteration
    params['n_estimators'] = n_estimators
    evals_result = pipeline.named_steps['estimator'].evals_result()
    loss = evals_result['validation_1'][eval_metric[0]][n_estimators]

    return {'loss' : loss, 'status' : STATUS_OK, 'n_estimators': n_estimators}


def optimize(space, train_x_group, train_y_group, val_x_group, val_y_group, gp_key):
    """Run TPE search (20 evaluations) over `space` for one group.

    Returns the best parameter dict (resolved via space_eval) and the
    Trials object. `gp_key` is currently unused here — kept for the
    caller's interface; presumably intended for logging.
    """
    trials = Trials()
    objective = partial(
        score,
        train_x_group=train_x_group,
        train_y_group=train_y_group,
        val_x_group=val_x_group,
        val_y_group=val_y_group,
    )
    best = fmin(
        fn=objective,
        space=space,
        algo=tpe.suggest,
        max_evals=20,
        trials=trials,
    )
    return space_eval(space, best), trials

In [8]:
# Tune hyperparameters independently for every category group.
groups = train_x.groupby(cat_feature)
params = {}
for gp_key, group in groups:
    print('Checking ' + gp_key + ' ...')
    # restrict to the most important features for this group
    train_x_group = group[list(selected_features[gp_key])]
    train_y_group = train_y[train_x_group.index]
    # matching validation rows for this group
    val_x_group = val_x.loc[val_x[cat_feature] == gp_key, list(selected_features[gp_key])]
    val_y_group = val_y[val_x_group.index]

    # search the best hyperparameters for this group's model
    best_params, trials = optimize(space,
                                   train_x_group, train_y_group,
                                   val_x_group, val_y_group,
                                   gp_key)
    params[gp_key] = best_params
    # number of boosting rounds actually used (from early stopping)
    params[gp_key]['n_estimators'] = trials.best_trial['result']['n_estimators']

# `params` now maps each group name to its selected hyperparameter dict


Checking A ...
100%|██████████| 20/20 [00:22<00:00,  1.13s/trial, best loss: 52.399776]
Checking B ...
100%|██████████| 20/20 [00:06<00:00,  2.90trial/s, best loss: 61.088593]
Checking C ...
100%|██████████| 20/20 [00:17<00:00,  1.14trial/s, best loss: 50.475033]
Checking D ...
100%|██████████| 20/20 [00:11<00:00,  1.68trial/s, best loss: 60.802723]

Train and validate the model


In [9]:
# concatenate train and validation set before training the model
# (hyperparameters are fixed now, so the validation rows can be reused for fitting)
train_x = pd.concat([train_x, val_x], ignore_index=True)
train_y = pd.concat([train_y, val_y], ignore_index=True)
del val_x, val_y

In [10]:
# Initialize the grouped model with the tuned per-group hyperparameters
model = MultiModel(
    group_col=cat_feature,
    clusters=clusters,
    params=params,
    selected_features=selected_features,
    nominals=nominal_features,
    ordinals=ordinal_features,
)

model.fit(train_x, train_y)
pred_train_y = model.predict(train_x)
pred_test_y = model.predict(test_x)


Model for A trained
Model for B trained
Model for C trained
Model for D trained

In [11]:
print('Train performance {}'.format(round(r2_score(train_y, pred_train_y), 2)))
# BUG FIX: this score is computed on the TEST split (the validation set was
# merged into train above), so label it as such.
print('Test performance {}'.format(round(r2_score(test_y, pred_test_y), 2)))


Train performance 0.99
Validation performance 0.8

Train final model and save


In [12]:
# Retrain on the whole dataset before exporting:
# recombine the train and test splits into one dataset.
total_x = pd.concat([train_x, test_x], axis=0, ignore_index=True).reset_index(drop=True)
total_y = pd.concat([train_y, test_y], axis=0, ignore_index=True).reset_index(drop=True)
del train_x, train_y, test_x, test_y

# Initialize a fresh model with the tuned hyperparameters
model = MultiModel(group_col=cat_feature, clusters=clusters, params=params,
                   selected_features=selected_features, nominals=nominal_features,
                   ordinals=ordinal_features)

In [13]:
# Note: must use one_hot_encode=False to prevent one-hot encoding of categorical features in input data
prediction_model = PredictionModel("multi_model", path='./exported_models', one_hot_encode=False)
prediction_model.model = model
# save feature names (not strictly needed, since all preprocessing happens inside the pipeline)
prediction_model.features = {key: [] for key in total_x.columns}
prediction_model.target = {target: []}

# fixed column order so serving-time input can be aligned with training input
prediction_model.ordered_column_list = sorted(total_x.columns)

In [14]:
prediction_model.model.fit(total_x, total_y)


Model for A trained
Model for B trained
Model for C trained
Model for D trained
Out[14]:
MultiModel(clusters=array(['A', 'B', 'C', 'D'], dtype=object),
      group_col='category', nominals=[],
      ordinals=['feature_0', 'feature_1'],
      params={'A': {'n_estimators': 266, 'reg_alpha': 0.9, 'under_predict_weight': 3.0, 'colsample_bytree': 0.55, 'learning_rate': 0.07, 'min_child_weight': 10.0, 'transformer_ordinal': 'OrdinalEncoder', 'subsample': 0.55, 'reg_lambda': 1.05, 'transformer_nominal': 'TargetEncoder', 'max_depth': 2, 'gamma'...bda': 1.4500000000000002, 'transformer_nominal': 'JamesSteinEncoder', 'max_depth': 2, 'gamma': 0.9}},
      selected_features={'A': Index([u'feature_0', u'feature_1', u'feature_2', u'feature_3', u'feature_4',
       u'feature_5', u'feature_6', u'feature_7', u'feature_8', u'feature_9',
       u'feature_10', u'feature_11', u'feature_12', u'feature_13',
       u'feature_14', u'feature_15', u'feature_16', u'f...ture_15', u'feature_16', u'feature_17',
       u'feature_18', u'feature_19'],
      dtype='object')})