Speed comparison of gradient boosting libraries for SHAP value calculation

Here we compare CatBoost, LightGBM and XGBoost on the speed of SHAP value calculation. All models were trained on GPU, but the SHAP evaluation itself was done on CPU.

We use the epsilon_normalized dataset from the LIBSVM datasets collection.
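
A minimal sketch for fetching and unpacking the data, assuming the archives are still hosted under the usual LIBSVM binary-classification path (the uncompressed files are large, so stream them if disk or memory is tight):

import bz2
import shutil
import urllib.request

base_url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/"
for archive in ("epsilon_normalized.bz2", "epsilon_normalized.t.bz2"):
    # download the compressed archive next to the notebook
    urllib.request.urlretrieve(base_url + archive, archive)
    # decompress it, keeping the original file name minus the .bz2 suffix
    with bz2.open(archive, "rb") as fin, open(archive[:-4], "wb") as fout:
        shutil.copyfileobj(fin, fout)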


In [1]:
import os
import numpy as np
import scipy
import pandas as pd
import copy
import tqdm
import datetime
from sklearn import datasets
import catboost
import xgboost as xgb
import lightgbm as lgb
import time

In [2]:
catboost.__version__, lgb.__version__, xgb.__version__


Out[2]:
('0.11.2', '2.2.2', '0.81')

In [3]:
train_data, train_target = datasets.load_svmlight_file("epsilon_normalized")
test_data, test_target = datasets.load_svmlight_file("epsilon_normalized.t")

Parameters


In [4]:
num_iters = 1000
lr = 0.1
max_bin = 128
gpu_device = '0' # specify your GPU (used only for training)
random_state = 0

In [5]:
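# map the {-1, 1} labels to {0, 1}, as expected by the binary logloss objectives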
train_target[train_target == -1] = 0
test_target[test_target == -1] = 0

In [6]:
def preprocess_data(data, label=None, mode='train', boosting=None):
    assert boosting is not None
    
    if boosting == 'xgboost':
        return xgb.DMatrix(data, label)
    elif boosting == 'lightgbm':
        if mode == 'train':
            return lgb.Dataset(data, label)
        else:
            return data
    elif boosting == 'catboost':
        # catboost.FeaturesData needs a dense float32 array, so densify sparse input first
        if not isinstance(data, np.ndarray):
            data = data.toarray().astype(np.float32)
        data = catboost.FeaturesData(num_feature_data=data)
        return catboost.Pool(data, label)
    else:
        raise RuntimeError("Unknown boosting library")

In [7]:
def create_parameters(base_params, boosting=None, **kwargs):
    assert boosting is not None
    assert isinstance(base_params, dict)
    
    params = copy.copy(base_params)
    if boosting == 'xgboost':
        params['objective'] = 'binary:logistic'
        params['max_depth'] = kwargs['depth']
        params['tree_method'] = 'gpu_hist'
        params['gpu_id'] = gpu_device
    elif boosting == 'lightgbm':
        params['objective'] = 'binary'
        params['device'] = "gpu"
        params['gpu_device_id'] = gpu_device
        params['num_leaves'] = 2**kwargs['depth']
    elif boosting == 'catboost':
        params['objective'] = 'Logloss'
        params['task_type'] = 'GPU'
        params['devices'] = gpu_device
        params['bootstrap_type'] = 'Bernoulli'
        params['logging_level'] = 'Silent'
    else:
        raise RuntimeError("Unknown boosting library")
        
    return params

In [8]:
def train(data, params, num_iters, boosting=None):
    assert boosting is not None
    if boosting == 'xgboost':
        return xgb.train(params=params, dtrain=data, num_boost_round=num_iters)
    elif boosting == 'lightgbm':
        return lgb.train(params=params, train_set=data, num_boost_round=num_iters)
    elif boosting == 'catboost':
        return catboost.train(pool=data, params=params, num_boost_round=num_iters)
    else:
        raise RuntimeError("Unknown boosting library")

In [9]:
def predict_shap(model, data, boosting=None):
    assert boosting is not None
    if boosting == 'xgboost':
        return model.predict(data, pred_contribs=True)
    elif boosting == 'lightgbm':
        return model.predict(data, pred_contrib=True)
    elif boosting == 'catboost':
        return model.get_feature_importance(data, fstr_type='ShapValues')
    else:
        raise RuntimeError("Unknown boosting library")

In [10]:
def create_path(boosting, params):
    fname = [boosting]
    for key, value in sorted(params.items()):
        fname.append(str(key))
        fname.append(str(value))
    fname = "_".join(fname)
    fname = fname.replace(".", '')
    fname += ".model"
    return fname

In [11]:
def load_model(fname, boosting):
    if boosting == "xgboost":
        bst = xgb.Booster(model_file=fname)
        bst.load_model(fname)
    elif boosting == "lightgbm":
        bst = lgb.Booster(model_file=fname)
    elif boosting == "catboost":
        bst = catboost.CatBoost()
        bst.load_model(fname)
    else:
        raise RuntimeError("Unknown boosting")
    return bst

In [12]:
base_params = {
    'learning_rate': lr,
    'max_bin': max_bin,
    'random_state': random_state
}

In [13]:
result = []

boosting_list = ['xgboost', 'catboost', 'lightgbm']
depth_list = [2, 4, 6, 8, 10]
lens_list = [1000, 5000, 10000]


for gb_type in boosting_list:
    
    print("{} is going".format(gb_type))
    
    for size_test in lens_list:
        print("size test {}".format(size_test))
        sep_test_data = test_data[:size_test]
        sep_test_target = test_target[:size_test]
        
        # comment out this line if you have already trained all the models
        train_preprocessed = preprocess_data(train_data, train_target, boosting=gb_type)
        
        dense_test = sep_test_data.toarray().astype(np.float32)
        
        for depth in tqdm.tqdm(depth_list):
        
            start_test_preproc = datetime.datetime.now()
            test_preprocessed = preprocess_data(dense_test,
                                                sep_test_target, 
                                                mode='test',
                                                boosting=gb_type)
        
            finish_test_preproc = datetime.datetime.now()
            preprocessing_delta = finish_test_preproc - start_test_preproc
            preprocessing_delta = preprocessing_delta.total_seconds()

            params = create_parameters(base_params, boosting=gb_type, depth=depth)
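            # 'depth' is the real tree-depth parameter only for CatBoost (XGBoost/LightGBM
            # already got max_depth / num_leaves above); it also ends up in the model file name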
            params['depth'] = depth
            fname = create_path(gb_type, params)
            if os.path.exists(fname):
                print("model exist")
                bst = load_model(fname, boosting=gb_type)
            else:
                print("model is training")
                start_train = datetime.datetime.now()
                bst = train(train_preprocessed, params, num_iters=num_iters, boosting=gb_type)
                finish_train = datetime.datetime.now()
                delta_train = finish_train - start_train
                delta_train = int(delta_train.total_seconds() * 1000)
                bst.save_model(fname)

            start_time = datetime.datetime.now()
            preds = predict_shap(bst, test_preprocessed, boosting=gb_type)
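            # each library returns (n_samples, n_features + 1); the extra column is the bias / expected value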
            assert preds.shape == (sep_test_data.shape[0], sep_test_data.shape[1] + 1)
            finish_time = datetime.datetime.now()

            delta = finish_time - start_time
            delta = delta.total_seconds()

            current_res = {
                'preprocessing_time': preprocessing_delta,
                'boosting': gb_type,
                'test_size': size_test,
                'depth': depth,
                'time': delta,
            }

            result.append(current_res)

        print("*" * 40)

In [14]:
result_df = pd.DataFrame(result)

In [16]:
result_df.to_csv("shap_benchmark_{}_max_bin_with_test_sizes.csv".format(max_bin), index=False)

In [17]:
result_df = pd.read_csv("shap_benchmark_128_max_bin_with_test_sizes.csv")
result_df.pivot_table(index=["test_size", "depth"], columns="boosting", values="time")


Out[17]:
SHAP calculation time in seconds:

test_size  depth    catboost     lightgbm     xgboost
     1000      2    0.311027     0.090156    0.112515
     1000      4    0.281931     0.578531    0.300671
     1000      6    0.464603     4.159926    1.468442
     1000      8    4.918599    23.844245    7.847191
     1000     10   93.152000   119.527824   30.872254
     5000      2    1.171963     0.284673    0.241316
     5000      4    1.081119     2.094985    0.931881
     5000      6    1.319114    20.624486    6.498283
     5000      8    5.807985   118.552238   38.992395
     5000     10   95.049909   601.251603  153.408904
    10000      2    2.048301     0.621454    0.509722
    10000      4    2.263058     4.291201    1.935541
    10000      6    2.396371    42.788038   12.981580
    10000      8    7.078056   240.614644   77.883250
    10000     10   95.680684  1189.685032  306.529277

In [18]:
result_df.pivot_table(index="test_size", columns="boosting", values="preprocessing_time")


Out[18]:
Test-set preprocessing time in seconds:

test_size   catboost  lightgbm   xgboost
     1000   0.069569  0.002816  0.011025
     5000   0.349831  0.000006  0.047836
    10000   0.770179  0.000006  0.089032
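
As a sanity check on the outputs themselves (outside of the timing), all three libraries return per-row contributions that, together with the bias column, sum to the model's raw log-odds prediction. A minimal sketch of such a check, assuming `bst`, `dense_test`, `sep_test_target` and `gb_type` still hold the values from the last iteration of the benchmark loop:

shap_vals = predict_shap(bst,
                         preprocess_data(dense_test, sep_test_target,
                                         mode='test', boosting=gb_type),
                         boosting=gb_type)
row_sums = shap_vals.sum(axis=1)  # per-row sum over all features plus the bias column

# raw (margin / log-odds) predictions from the same model
if gb_type == 'xgboost':
    raw_pred = bst.predict(xgb.DMatrix(dense_test), output_margin=True)
elif gb_type == 'lightgbm':
    raw_pred = bst.predict(dense_test, raw_score=True)
else:  # catboost
    raw_pred = bst.predict(dense_test, prediction_type='RawFormulaVal')

print(np.abs(row_sums - raw_pred).max())  # should be close to zero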
