In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import numpy as np
import pandas as pd
import xgboost
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval
from functools import partial
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression, RFE
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
import sys
sys.path.insert(0,'..')
from vf_portalytics.model import PredictionModel
from vf_portalytics.tool import squared_error_objective_with_weighting, get_categorical_features
from vf_portalytics.transformers import get_transformer
from vf_portalytics.multi_model import MultiModel
In [2]:
def make_dataset(n_samples, n_features, n_informative, **kwargs):
    """Build one synthetic regression fragment as a (DataFrame, Series) pair.

    Any extra keyword arguments are attached as constant-valued columns via
    DataFrame.assign (e.g. category='A'), which lets callers tag each fragment
    with its group label.

    :param n_samples: number of rows to generate.
    :param n_features: number of feature columns (named feature_0..feature_k).
    :param n_informative: how many features actually drive the target.
    :return: (features DataFrame, target Series named 'target').
    """
    features, targets = make_regression(
        n_samples=n_samples,
        n_features=n_features,
        noise=0.5,
        n_informative=n_informative,
        random_state=0,
    )
    column_names = ['feature_' + str(idx) for idx in range(n_features)]
    frame = pd.DataFrame(features, columns=column_names).assign(**kwargs)
    return frame, pd.Series(targets, name='target')
# Generate data for 4 different categories: sample counts differ per
# category, but the feature space is shared so the fragments stack into a
# single dataset.
n_features = 20
x1, y1 = make_dataset(n_samples=400, n_features=n_features, n_informative=10, category='A')
x2, y2 = make_dataset(n_samples=150, n_features=n_features, n_informative=8, category='B')
x3, y3 = make_dataset(n_samples=280, n_features=n_features, n_informative=7, category='C')
x4, y4 = make_dataset(n_samples=320, n_features=n_features, n_informative=12, category='D')
# Stack the fragments with a fresh 0..n-1 index (ignore_index already
# resets the index, so no extra reset is needed).
total_x = pd.concat([x1, x2, x3, x4], axis=0, ignore_index=True)
total_y = pd.concat([y1, y2, y3, y4], axis=0, ignore_index=True)
# Discretise the first two features into three ordered groups so the data
# contains categorical columns; cut points sit at mean +/- one std of each
# original continuous column.
labels = ['g1', 'g2', 'g3']
for feature_name in ('feature_0', 'feature_1'):
    column = total_x[feature_name]
    edges = [-np.inf,
             column.mean() - column.std(),
             column.mean() + column.std(),
             column.max()]
    total_x[feature_name] = pd.cut(column, bins=edges, labels=labels).astype('object')
In [3]:
# Overview of the dataset: transpose the head so the 21 columns read as rows.
print(total_x.head().T)
In [4]:
# Basic configuration: the regression target, the column that defines the
# per-model groups, and the distinct group keys present in the data.
target = 'target'
cat_feature = 'category'
clusters = total_x[cat_feature].unique()
clusters
Out[4]:
In [5]:
# Split data into train / validation / test sets (64% / 16% / 20%).
# Stratify on the group column so every category is represented in each
# split: the per-group tuning loop below selects validation rows by
# category and would fail on a group with no validation samples.
train_x, test_x, train_y, test_y = train_test_split(
    total_x, total_y, test_size=0.2, random_state=1, stratify=total_x[cat_feature])
train_x, val_x, train_y, val_y = train_test_split(
    train_x, train_y, test_size=0.2, random_state=1, stratify=train_x[cat_feature])
# Free the per-category fragments and the combined frames; only the splits
# are used from here on.
del x1, x2, x3, x4
del y1, y2, y3, y4
del total_x, total_y
In [6]:
# Feature selection is done separately; categorical features are split into
# nominal and ordinal groups so each gets the appropriate encoder below.
remaining_feat = train_x.columns.drop(cat_feature)
ordinal_features = ['feature_0', 'feature_1']
nominal_features = []
# Use the same feature subset for every group (placeholder for a real
# per-group feature-selection step).
selected_features = {cluster: remaining_feat for cluster in clusters}
In [7]:
# Hyperopt search space -- shared by every group for now; it could be made
# group-specific later if needed.
space = {
    # tree complexity
    'max_depth': hp.choice('max_depth', np.arange(2, 6, dtype=int)),
    'min_child_weight': hp.quniform('min_child_weight', 1, 20, 1),
    'gamma': hp.quniform('gamma', 0.7, 1, 0.05),
    # row / column sampling
    'subsample': hp.quniform('subsample', 0.4, 0.6, 0.05),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.4, 0.6, 0.05),
    # learning dynamics
    'learning_rate': hp.quniform('learning_rate', 0.001, 0.1, 0.01),
    'under_predict_weight': hp.choice('under_predict_weight', [2.0, 2.5, 3.0]),
    # regularisation
    'reg_alpha': hp.quniform('reg_alpha', 0.5, 1.0, 0.05),
    'reg_lambda': hp.quniform('reg_lambda', 1.0, 1.5, 0.05),
    # categorical-encoding choices
    'transformer_nominal': hp.choice('transformer_nominal', ['TargetEncoder', 'JamesSteinEncoder']),
    'transformer_ordinal': hp.choice('transformer_ordinal', ['OrdinalEncoder']),
}
def score(params, train_x_group, train_y_group, val_x_group, val_y_group):
    """Hyperopt objective for a single category group.

    Builds an encoder->XGBoost pipeline, trains it with early stopping on the
    group's validation split, and returns the validation MAE at the best
    boosting round.

    Fix: the search space samples 'transformer_nominal' and
    'transformer_ordinal', but the encoders were previously hardcoded to
    'TargetEncoder' / 'OrdinalEncoder', so those two space dimensions had no
    effect on the search.  The sampled choices are now honoured.

    :param params: hyperparameter sample drawn from `space`.
    :param train_x_group: training features for one category.
    :param train_y_group: training target aligned with train_x_group.
    :param val_x_group: validation features for the same category.
    :param val_y_group: validation target aligned with val_x_group.
    :return: hyperopt result dict with 'loss', 'status' and 'n_estimators'.
    """
    categorical_features = get_categorical_features(data=train_x_group)
    # preprocess ordinals -- encoder type comes from the sampled params
    transformer_ordinal = get_transformer(params['transformer_ordinal'])
    gp_ordinal = [feature for feature in categorical_features if feature in ordinal_features]
    transformer_ordinal.cols = gp_ordinal
    # preprocess nominals -- every categorical feature that is not ordinal
    # falls through to the nominal encoder
    transformer_nominal = get_transformer(params['transformer_nominal'])
    gp_nominals = [feature for feature in categorical_features
                   if feature in nominal_features or feature not in gp_ordinal]
    transformer_nominal.cols = gp_nominals
    # every categorical feature must be claimed by exactly one encoder
    assert set(gp_nominals + gp_ordinal) == set(categorical_features)
    # n_estimators is deliberately high; early stopping below picks the
    # effective number of boosting rounds
    gbm_model = xgboost.XGBRegressor(n_estimators=1000,
                                     objective=partial(squared_error_objective_with_weighting,
                                                       under_predict_weight=params['under_predict_weight']),
                                     max_depth=params['max_depth'],
                                     subsample=params['subsample'],
                                     min_child_weight=params['min_child_weight'],
                                     gamma=params['gamma'],
                                     colsample_bytree=params['colsample_bytree'],
                                     learning_rate=params['learning_rate'],
                                     reg_alpha=params['reg_alpha'],
                                     reg_lambda=params['reg_lambda'],
                                     n_jobs=8,
                                     seed=1234,
                                     silent=True)
    pipeline = Pipeline([('transformer_ordinal', transformer_ordinal),
                         ('transformer_nominal', transformer_nominal),
                         ('estimator', gbm_model)])
    # NOTE(review): the encoders are fitted twice -- once here to build the
    # transformed eval_set, and again inside pipeline.fit below.
    pipe_trf = Pipeline(pipeline.steps[:-1])
    pipe_trf = pipe_trf.fit(train_x_group, train_y_group)
    eval_set = [(pipe_trf.transform(train_x_group), train_y_group),
                (pipe_trf.transform(val_x_group), val_y_group)]
    eval_metric = ["mae"]
    pipeline.fit(train_x_group, train_y_group,
                 estimator__early_stopping_rounds=30,
                 estimator__eval_set=eval_set,
                 estimator__eval_metric=eval_metric,
                 estimator__verbose=False)
    # best_iteration is the (0-based) round with the lowest validation error
    n_estimators = pipeline.named_steps['estimator'].best_iteration
    # side effect: annotate the sampled params dict with the chosen round
    # count (also returned via the result dict below)
    params['n_estimators'] = n_estimators
    evals_result = pipeline.named_steps['estimator'].evals_result()
    # 'validation_1' is the second eval_set entry, i.e. the validation split
    loss = evals_result['validation_1'][eval_metric[0]][n_estimators]
    return {'loss': loss, 'status': STATUS_OK, 'n_estimators': n_estimators}
def optimize(space, train_x_group, train_y_group, val_x_group, val_y_group, gp_key):
    """Run a TPE search over `space` for one group's data.

    :param space: hyperopt search space definition.
    :param train_x_group: training features for the group.
    :param train_y_group: training target for the group.
    :param val_x_group: validation features for the group.
    :param val_y_group: validation target for the group.
    :param gp_key: group identifier; currently unused, kept so the caller's
        interface stays stable.
    :return: (best hyperparameters resolved via space_eval, Trials object).
    """
    trials = Trials()
    objective = partial(
        score,
        train_x_group=train_x_group,
        train_y_group=train_y_group,
        val_x_group=val_x_group,
        val_y_group=val_y_group,
    )
    best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=20, trials=trials)
    return space_eval(space, best), trials
In [8]:
# Run the hyperparameter search once per category.  `params` ends up mapping
# each group key to its selected hyperparameter dict.
groups = train_x.groupby(cat_feature)
params = {}
for gp_key, group in groups:
    print('Checking ' + gp_key + ' ...')
    # keep only the most important features for this group
    train_x_group = group[list(selected_features[gp_key])]
    train_y_group = train_y[train_x_group.index]
    # matching validation rows for the same group
    group_mask = val_x[cat_feature] == gp_key
    val_x_group = val_x.loc[group_mask, list(selected_features[gp_key])]
    val_y_group = val_y[val_x_group.index]
    # find the best parameters for this model-group
    best_params, trials = optimize(space,
                                   train_x_group, train_y_group,
                                   val_x_group, val_y_group,
                                   gp_key)
    best_params['n_estimators'] = trials.best_trial['result']['n_estimators']
    params[gp_key] = best_params
# in the end we keep params; a dictionary with keys the group names and values dictionaries of the selected hyperparameters
In [9]:
# concatenate train and validation set before training the model
# (hyperparameters are already fixed, so the validation split can be folded
# back into the training data for the final fit)
train_x = pd.concat([train_x, val_x], ignore_index=True)
train_y = pd.concat([train_y, val_y], ignore_index=True)
# the standalone validation frames are no longer needed
del val_x, val_y
In [10]:
# Initialize the multi-model: one XGBoost pipeline per category behind the
# interface of a single estimator.
model = MultiModel(
    group_col=cat_feature,
    clusters=clusters,
    params=params,
    selected_features=selected_features,
    nominals=nominal_features,
    ordinals=ordinal_features,
)
model.fit(train_x, train_y)
pred_train_y = model.predict(train_x)
pred_test_y = model.predict(test_x)
In [11]:
# Report R^2 on the fitted (train+validation) data and on the held-out test
# set.  The second score was mislabelled "Validation performance"; it is
# computed against test_y, so label it as the test score.
print('Train performance {}'.format(round(r2_score(train_y, pred_train_y), 2)))
print('Test performance {}'.format(round(r2_score(test_y, pred_test_y), 2)))
In [12]:
# Train on the whole dataset before exporting: combine the train and test
# splits back into one frame (ignore_index already yields a fresh 0..n-1
# index) and rebuild the multi-model with the tuned hyperparameters.
total_x = pd.concat([train_x, test_x], axis=0, ignore_index=True)
total_y = pd.concat([train_y, test_y], axis=0, ignore_index=True)
del train_x, train_y, test_x, test_y
model = MultiModel(group_col=cat_feature,
                   clusters=clusters,
                   params=params,
                   selected_features=selected_features,
                   nominals=nominal_features,
                   ordinals=ordinal_features)
In [13]:
# Wrap the model for export.  one_hot_encode=False keeps the raw categorical
# columns intact in the input data; all encoding happens inside the
# per-group pipelines.
prediction_model = PredictionModel("multi_model", path='./exported_models', one_hot_encode=False)
prediction_model.model = model
# Feature names are recorded for bookkeeping only -- preprocessing happens
# inside the pipeline, so no per-feature metadata is strictly required.
prediction_model.features = {key: [] for key in total_x.columns}
prediction_model.target = {target: []}
prediction_model.ordered_column_list = sorted(total_x.columns)
In [14]:
prediction_model.model.fit(total_x, total_y)
Out[14]: