Here we compare CatBoost, LightGBM, and XGBoost on the speed of SHAP value calculation. All boosting models were trained on GPU, but SHAP values were evaluated on CPU.
We use the epsilon_normalized dataset from the LIBSVM binary classification collection.
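If the dataset is not available locally, it can be fetched and decompressed roughly as follows. This is a sketch that assumes the files are still hosted on the LIBSVM binary classification datasets page; adjust the URLs if they have moved.

# Sketch: download and decompress the epsilon_normalized train/test files.
# Assumes the LIBSVM mirror below still hosts them (adjust URLs if needed).
import bz2
import shutil
import urllib.request

base_url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/"
for name in ["epsilon_normalized", "epsilon_normalized.t"]:
    archive = name + ".bz2"
    urllib.request.urlretrieve(base_url + archive, archive)
    with bz2.open(archive, "rb") as src, open(name, "wb") as dst:
        shutil.copyfileobj(src, dst)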
In [1]:
import os
import numpy as np
import scipy
import pandas as pd
import copy
import tqdm
import datetime
from sklearn import datasets
import catboost
import xgboost as xgb
import lightgbm as lgb
import time
In [2]:
catboost.__version__, lgb.__version__, xgb.__version__
Out[2]:
In [3]:
train_data, train_target = datasets.load_svmlight_file("epsilon_normalized")
test_data, test_target = datasets.load_svmlight_file("epsilon_normalized.t")
In [4]:
num_iters = 1000
lr = 0.1
max_bin = 128
gpu_device = '0' # specify your GPU (used only for training)
random_state = 0
In [5]:
train_target[train_target == -1] = 0
test_target[test_target == -1] = 0
In [6]:
def preprocess_data(data, label=None, mode='train', boosting=None):
    assert boosting is not None
    if boosting == 'xgboost':
        return xgb.DMatrix(data, label)
    elif boosting == 'lightgbm':
        if mode == 'train':
            return lgb.Dataset(data, label)
        else:
            # LightGBM predicts directly on raw matrices, no wrapper is needed at test time
            return data
    elif boosting == 'catboost':
        # CatBoost expects numerical features as float32 wrapped in FeaturesData
        data = catboost.FeaturesData(num_feature_data=data)
        return catboost.Pool(data, label)
    else:
        raise RuntimeError("Unknown boosting library")
In [7]:
def create_parameters(base_params, boosting=None, **kwargs):
    assert boosting is not None
    assert isinstance(base_params, dict)
    params = copy.copy(base_params)
    if boosting == 'xgboost':
        params['objective'] = 'binary:logistic'
        params['max_depth'] = kwargs['depth']
        params['tree_method'] = 'gpu_hist'
        params['gpu_id'] = gpu_device
    elif boosting == 'lightgbm':
        params['objective'] = 'binary'
        params['device'] = "gpu"
        params['gpu_device_id'] = gpu_device
        # LightGBM grows trees leaf-wise, so the tree size is limited via num_leaves
        params['num_leaves'] = 2 ** kwargs['depth']
    elif boosting == 'catboost':
        params['objective'] = 'Logloss'
        params['task_type'] = 'GPU'
        params['devices'] = gpu_device
        params['bootstrap_type'] = 'Bernoulli'
        params['logging_level'] = 'Silent'
    else:
        raise RuntimeError("Unknown boosting library")
    return params
In [8]:
def train(data, params, num_iters, boosting=None):
    assert boosting is not None
    if boosting == 'xgboost':
        return xgb.train(params=params, dtrain=data, num_boost_round=num_iters)
    elif boosting == 'lightgbm':
        return lgb.train(params=params, train_set=data, num_boost_round=num_iters)
    elif boosting == 'catboost':
        return catboost.train(pool=data, params=params, num_boost_round=num_iters)
    else:
        raise RuntimeError("Unknown boosting library")
In [9]:
def predict_shap(model, data, boosting=None):
    assert boosting is not None
    if boosting == 'xgboost':
        return model.predict(data, pred_contribs=True)
    elif boosting == 'lightgbm':
        return model.predict(data, pred_contrib=True)
    elif boosting == 'catboost':
        return model.get_feature_importance(data, fstr_type='ShapValues')
    else:
        raise RuntimeError("Unknown boosting library")
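For all three libraries the result has one row per sample and n_features + 1 columns, where the last column is the expected value (bias) of the model; the values in each row sum to the raw (margin) prediction for that sample. As a quick illustration, here is a minimal sketch of this additivity check using XGBoost on synthetic data; the dataset and parameters are made up for the example.

# Minimal sketch: SHAP additivity check with XGBoost on synthetic data (illustrative only).
import numpy as np
import xgboost as xgb

rng = np.random.RandomState(0)
X = rng.rand(100, 5).astype(np.float32)
y = (X[:, 0] + X[:, 1] > 1).astype(int)

dtrain = xgb.DMatrix(X, y)
bst = xgb.train({'objective': 'binary:logistic', 'max_depth': 3}, dtrain, num_boost_round=20)

contribs = bst.predict(dtrain, pred_contribs=True)  # shape: (100, 5 + 1), last column is the bias
margin = bst.predict(dtrain, output_margin=True)    # raw log-odds predictions
assert np.allclose(contribs.sum(axis=1), margin, atol=1e-4)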
In [10]:
def create_path(boosting, params):
    # build a unique model filename from the library name and its parameter values
    fname = [boosting]
    for key, value in sorted(params.items()):
        fname.append(str(key))
        fname.append(str(value))
    fname = "_".join(fname)
    fname = fname.replace(".", '')
    fname += ".model"
    return fname
In [11]:
def load_model(fname, boosting):
    if boosting == "xgboost":
        bst = xgb.Booster(model_file=fname)
    elif boosting == "lightgbm":
        bst = lgb.Booster(model_file=fname)
    elif boosting == "catboost":
        bst = catboost.CatBoost()
        bst.load_model(fname)
    else:
        raise RuntimeError("Unknown boosting library")
    return bst
In [12]:
base_params = {
    'learning_rate': lr,
    'max_bin': max_bin,
    'random_state': random_state
}
In [13]:
result = []
boosting_list = ['xgboost', 'catboost', 'lightgbm']
depth_list = [2, 4, 6, 8, 10]
lens_list = [1000, 5000, 10000]
for gb_type in boosting_list:
    print("{} is running".format(gb_type))
    for size_test in lens_list:
        print("test size {}".format(size_test))
        sep_test_data = test_data[:size_test]
        sep_test_target = test_target[:size_test]
        # comment out the following line if all models are already trained
        train_preprocessed = preprocess_data(train_data, train_target, boosting=gb_type)
        dense_test = sep_test_data.todense().A.astype(np.float32)
        for depth in tqdm.tqdm(depth_list):
            start_test_preproc = datetime.datetime.now()
            test_preprocessed = preprocess_data(dense_test,
                                                sep_test_target,
                                                mode='test',
                                                boosting=gb_type)
            finish_test_preproc = datetime.datetime.now()
            preprocessing_delta = finish_test_preproc - start_test_preproc
            preprocessing_delta = preprocessing_delta.total_seconds()

            params = create_parameters(base_params, boosting=gb_type, depth=depth)
            # 'depth' is a real CatBoost parameter; for the other libraries it only
            # makes the model filename unique per depth
            params['depth'] = depth
            fname = create_path(gb_type, params)
            if os.path.exists(fname):
                print("model exists")
                bst = load_model(fname, boosting=gb_type)
            else:
                print("model is training")
                start_train = datetime.datetime.now()
                bst = train(train_preprocessed, params, num_iters=num_iters, boosting=gb_type)
                finish_train = datetime.datetime.now()
                delta_train = finish_train - start_train
                delta_train = int(delta_train.total_seconds() * 1000)
                bst.save_model(fname)

            start_time = datetime.datetime.now()
            preds = predict_shap(bst, test_preprocessed, boosting=gb_type)
            # one row per sample: per-feature SHAP contributions plus the expected value
            assert preds.shape == (sep_test_data.shape[0], sep_test_data.shape[1] + 1)
            finish_time = datetime.datetime.now()
            delta = finish_time - start_time
            delta = delta.total_seconds()

            current_res = {
                'preprocessing_time': preprocessing_delta,
                'boosting': gb_type,
                'test_size': size_test,
                'depth': depth,
                'time': delta,
            }
            result.append(current_res)
    print("*" * 40)
In [14]:
result_df = pd.DataFrame(result)
In [16]:
result_df.to_csv("shap_benchmark_{}_max_bin_with_test_sizes.csv".format(max_bin), index=False)
In [17]:
result_df = pd.read_csv("shap_benchmark_128_max_bin_with_test_sizes.csv")
result_df.pivot_table(index=["test_size", "depth"], columns="boosting", values="time")
Out[17]:
In [18]:
result_df.pivot_table(index="test_size", columns="boosting", values="preprocessing_time")
Out[18]:
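To visualize the comparison, one might plot SHAP computation time against tree depth for each library. This is a sketch assuming matplotlib is installed; it uses the result_df produced above and picks the largest test size as an example.

# Sketch: plot SHAP computation time vs. tree depth, one line per boosting library.
import matplotlib.pyplot as plt

subset = result_df[result_df["test_size"] == 10000]
for gb_type, group in subset.groupby("boosting"):
    plt.plot(group["depth"], group["time"], marker="o", label=gb_type)
plt.xlabel("tree depth")
plt.ylabel("SHAP computation time, s")
plt.legend()
plt.show()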