In [ ]:
#@title Agreement

# Copyright (c) 2021 Kevin P. Murphy (murphyk@gmail.com) and Mahmoud Soliman (mjs@aucegypt.edu)
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

In [ ]:
#@title Attribution 
#This notebook is based on the following: 
#https://www.kaggle.com/kevalm/xgboost-implementation-on-iris-dataset-python
#https://xgboost.readthedocs.io/en/latest/tutorials/index.html
#https://marcotcr.github.io/lime/tutorials/Tutorial%20-%20continuous%20and%20categorical%20features.html
#https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html
#https://github.com/slundberg/shap/blob/master/notebooks/kernel_explainer/Iris%20classification%20with%20scikit-learn.ipynb

Setup and environment sanity checks

Check the hardware specifications of the GCP VM this notebook is running on and the software stack installed.


In [ ]:
#@title Imports
# %tensorflow_version must run before TensorFlow is imported
%tensorflow_version 2.x
import tensorflow as tf
from tensorflow.python.client import device_lib
from psutil import virtual_memory
import cv2
from google.colab.patches import cv2_imshow
import os
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier, plot_importance
from sklearn import datasets
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from matplotlib import pyplot

In [ ]:
#@title Hardware check

def find_accelerator():
  # Report the physical RAM and any TPU/GPU accelerator attached to this VM.
  mem = virtual_memory()
  devices = device_lib.list_local_devices()
  RAM = "Physical RAM: {:.2f} GB".format(mem.total / (1024 * 1024 * 1024))
  try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    device = ["TPU at " + str(tpu.cluster_spec().as_dict()['worker'])]
  except ValueError:
    device = [d.physical_device_desc for d in devices if d.device_type == "GPU"]
  if not device:
    return None, RAM
  return device, RAM

a, r = find_accelerator()
print("Accelerator found:", a, r)

In [ ]:
#@title Install the extra required packages if any
!pip install lime -qq
import lime
import lime.lime_tabular as ll
!pip install shap -qq
import shap

In [ ]:
#@title Clone PyProbML repo and set environment variables
!git clone https://github.com/probml/pyprobml/ -q
os.environ["PYPROBML"]='/content/pyprobml/'

Introduction

In this notebook we explore how to use XGBoost and scikit-learn to demonstrate the concepts of boosting, bagging, and random forests.

XGBoost

XGBoost supports the following features:

  1. The vanilla gradient boosting algorithm (also known as GBDT, gradient boosted decision trees, or GBM, gradient boosting machine), with support for parameter tuning, parallelization, and GPU acceleration.
  2. Stochastic gradient boosting, with uniform and gradient-based sampling, as well as sub-sampling at the row, column, and column-per-split levels.
  3. Regularized gradient boosting, with support for both L1 and L2 regularization (via the alpha and lambda parameters respectively).
  4. Dropout-like behaviour via the DART booster.

Note that we use the scikit-learn-like API of XGBoost for simplicity; the sketch below shows how the features above map onto XGBClassifier keyword arguments.
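
A minimal, illustrative sketch (parameter values are placeholders, not tuned for this dataset):

In [ ]:
#@title (Sketch) XGBClassifier tuning parameters
# Illustrative only: these keyword arguments correspond to the features listed above.
sketch_xgbc = XGBClassifier(
    n_estimators=100,       # number of boosting rounds
    learning_rate=0.1,      # shrinkage applied to each new tree
    max_depth=3,            # depth of the individual trees
    subsample=0.8,          # stochastic boosting: row sub-sampling
    colsample_bytree=0.8,   # column sub-sampling per tree
    colsample_bynode=1.0,   # column sub-sampling per split
    reg_alpha=0.0,          # L1 regularization (alpha)
    reg_lambda=1.0,         # L2 regularization (lambda)
    booster='gbtree',       # use 'dart' for dropout-like behaviour
)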

SKLearn

scikit-learn supports several ensemble-learning methods, one of which is random forests: a random forest fits decision tree classifiers (the weak learners) on various sub-samples of the dataset (bagging) and averages their predictions to improve accuracy and control over-fitting. A minimal sketch of the bagging idea follows.
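
This sketch assumes scikit-learn's BaggingClassifier (not otherwise used in this notebook); in newer scikit-learn versions the first argument is named estimator rather than base_estimator.

In [ ]:
#@title (Sketch) Bagging decision trees by hand
# Illustrative only: BaggingClassifier fits each tree on a bootstrap sample of the
# rows and averages the predictions; RandomForestClassifier additionally randomizes
# the features considered at each split.
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bagged_trees = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),  # the weak learner
    n_estimators=100,                         # number of bootstrapped trees
    bootstrap=True,                           # sample rows with replacement
    random_state=42,
)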

Iris dataset


In [ ]:
#loading the dataset
iris = datasets.load_iris() 
X = iris.data               
y = iris.target

In [ ]:
#Splitting data into 80/20 training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Exploring boosting with xgboost


In [ ]:
#@title XGBClassifier

In [ ]:
xgbc = XGBClassifier()
xgbc

In [ ]:
#Training the classifier
xgbc.fit(X_train, y_train)
#Inferencing on testing data
xgbc_y_pred = xgbc.predict(X_test)
#Measuring accuracy
xgbc_acc=metrics.accuracy_score(y_test, xgbc_y_pred)
print('XGBClassifier accuracy is '+str(xgbc_acc))

In [ ]:
#@title Visualization of boosted tree
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

xgb.plot_tree(xgbc, num_trees=2)  # plot the boosted tree with index 2
fig = plt.gcf()
fig.set_size_inches(150, 100)
fig.savefig('treeIris.png')

In [ ]:
#@title Feature importance of XGBClassifier

plot_importance(xgbc)
pyplot.show()

#f0 - sepal length in cm
#f1 - sepal width in cm
#f2 - petal length in cm
#f3 - petal width in cm
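
As an optional, minimal sketch, the importances can also be plotted with the original iris feature names via the classifier's feature_importances_ attribute (note that this attribute may use a different importance metric than plot_importance's default):

In [ ]:
#@title (Sketch) Importances with the original feature names
plt.barh(iris.feature_names, xgbc.feature_importances_)
plt.xlabel('importance')
plt.show()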

In [ ]:
#@title Explanation of a sample of testing data of XGBClassifier via LIME
xgbc_lime_explainer = ll.LimeTabularExplainer(X_train, feature_names=iris.feature_names, class_names=iris.target_names, discretize_continuous=True)
xgbc_i = np.random.randint(0, X_test.shape[0])
xgbc_exp = xgbc_lime_explainer.explain_instance(X_test[xgbc_i], xgbc.predict_proba, num_features=2, top_labels=1)
xgbc_exp.show_in_notebook(show_table=True, show_all=False)

In [ ]:
#@title Explanation of testing data of XGBClassifier via SHAP
shap.initjs()
# explain all the predictions in the test set
# KernelExplainer explains the output of predict_proba directly;
# model_output/feature_perturbation are TreeExplainer options and are not needed here.
xgbc_shap_explainer = shap.KernelExplainer(xgbc.predict_proba, X_train)
xgbc_shap_values = xgbc_shap_explainer.shap_values(X_test)
shap.force_plot(xgbc_shap_explainer.expected_value[0], xgbc_shap_values[0], X_test)

In [ ]:
xgbc_shap_explainer_2=shap.TreeExplainer(xgbc)
xgbc_shap_values_2 = xgbc_shap_explainer_2.shap_values(X_test)
shap.summary_plot(xgbc_shap_values_2, X_test)

Exploring bagging (Random Forests) with sklearn


In [ ]:
#@title RandomForestClassifier

In [ ]:
skrfc = RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_split=2, random_state=42)
skrfc.fit(X_train, y_train)

In [ ]:
skrfc_y_pred = skrfc.predict(X_test)
skrfc_acc=metrics.accuracy_score(y_test, skrfc_y_pred)
print('RandomForestClassifier accuracy is '+str(skrfc_acc))

In [ ]:
#@title Feature importance of RandomForestClassifier
importances = skrfc.feature_importances_
std = np.std([tree.feature_importances_ for tree in skrfc.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Plot the impurity-based feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()

In [ ]:
#@title Explanation of a sample of testing data of RandomForestClassifier via LIME
skrfc_lime_explainer = ll.LimeTabularExplainer(X_train, feature_names=iris.feature_names, class_names=iris.target_names, discretize_continuous=True)
skrfc_i = np.random.randint(0, X_test.shape[0])
skrfc_exp = skrfc_lime_explainer.explain_instance(X_test[skrfc_i], skrfc.predict_proba, num_features=2, top_labels=1)
skrfc_exp.show_in_notebook(show_table=True, show_all=False)

In [ ]:
#@title Explanation of testing data of RandomForestClassifier via SHAP
shap.initjs()
# explain all the predictions in the test set
# As above, KernelExplainer explains predict_proba directly.
skrfc_shap_explainer = shap.KernelExplainer(skrfc.predict_proba, X_train)
skrfc_shap_values = skrfc_shap_explainer.shap_values(X_test)
shap.force_plot(skrfc_shap_explainer.expected_value[0], skrfc_shap_values[0], X_test)

In [ ]:
skrfc_shap_explainer_2=shap.TreeExplainer(skrfc)
skrfc_shap_values_2 = skrfc_shap_explainer_2.shap_values(X_test)
shap.summary_plot(skrfc_shap_values_2, X_test)