This is a minimal but workable implementation of Gradient Boosting Decision Trees (GBDTs) built on top of scikit-learn.
Read more in this blog post.
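In a nutshell (the notation below is added here and not taken from the blog post), the `fit` loop implemented next builds an additive model: each iteration fits a small regression tree $h_m$ to the negative gradient of the loss at the current predictions and adds it to the ensemble,

$$F_m(x) = F_{m-1}(x) + \eta \, h_m(x), \qquad h_m(x) \approx -\left.\frac{\partial L(y, F)}{\partial F}\right|_{F = F_{m-1}(x)},$$

where $\eta$ is the learning rate.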
In [1]:
import numpy as np
from scipy.special import expit as sigmoid
from sklearn.base import BaseEstimator, RegressorMixin, ClassifierMixin
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import make_regression
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
In [2]:
class BaseGradientBoosting(BaseEstimator):
    def __init__(self, n_iter, learning_rate, loss):
        self.loss = loss
        self.learning_rate = learning_rate
        self.n_iter = n_iter

    def fit(self, X, y):
        rng = np.random.RandomState(seed=0)
        n_samples = X.shape[0]
        y_pred_train = rng.normal(size=n_samples)  # initial (random) predictions
        self.predictors = list()
        for m in range(self.n_iter):  # gradient descent in the space of predictions
            negative_gradient = -self.loss.compute_gradients(y, y_pred_train)
            new_predictor = DecisionTreeRegressor(max_depth=3)
            new_predictor.fit(X, y=self.learning_rate * negative_gradient)
            y_pred_train += new_predictor.predict(X)
            self.predictors.append(new_predictor)  # save for predict()
        return self  # scikit-learn convention: fit returns self

    def predict(self, X):
        return sum(predictor.predict(X) for predictor in self.predictors)
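To illustrate how the base class is meant to be used (this example is an addition, not part of the original notebook): any object exposing a compute_gradients(y_true, y_pred) method can act as a loss. For instance, a hypothetical least-absolute-deviation loss would plug in like this, reusing numpy and BaseGradientBoosting from the cells above:

# Hypothetical plug-in loss (not in the original notebook): least absolute
# deviation. A (sub)gradient of |y_true - y_pred| w.r.t. y_pred is
# -sign(y_true - y_pred).
class LeastAbsoluteDeviationLoss:
    def compute_gradients(self, y_true, y_pred):
        return -np.sign(y_true - y_pred)

# Reuses the BaseGradientBoosting class defined above.
lad_gb = BaseGradientBoosting(n_iter=100, learning_rate=.1,
                              loss=LeastAbsoluteDeviationLoss())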
In [3]:
# GBDT for regression
class LeastSquaresLoss:
    def compute_gradients(self, y_true, y_pred):
        # gradient of the squared error (y_true - y_pred)**2 w.r.t. y_pred
        return -2 * (y_true - y_pred)


class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin):
    def __init__(self, n_iter=100, learning_rate=.1):
        super().__init__(n_iter, learning_rate, loss=LeastSquaresLoss())


X, y = make_regression(n_samples=1000, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
gb = GradientBoostingRegressor()
gb.fit(X_train, y_train)
print('R squared on training data: {:.3f}'.format(gb.score(X_train, y_train)))
print('R squared on test data: {:.3f}'.format(gb.score(X_test, y_test)))
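As an optional sanity check (not part of the original notebook), the scores can be compared against scikit-learn's built-in estimator on the same split; with 100 depth-3 trees and a 0.1 learning rate, the two should land in the same ballpark:

# Compare with scikit-learn's built-in GBDT regressor on the same split.
from sklearn.ensemble import GradientBoostingRegressor as SklearnGBRegressor

sk_gb = SklearnGBRegressor(n_estimators=100, learning_rate=.1, max_depth=3,
                           random_state=0)
sk_gb.fit(X_train, y_train)
print('sklearn R squared on test data: {:.3f}'.format(sk_gb.score(X_test, y_test)))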
In [4]:
# GBDT for classification
class BinaryCrossEntropy:
    def compute_gradients(self, y_true, y_pred):
        # gradient of the binary cross-entropy w.r.t. the raw predictions
        return sigmoid(y_pred) - y_true

    def raw_predictions_to_proba(self, raw_predictions):
        return sigmoid(raw_predictions)


class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin):
    def __init__(self, n_iter=100, learning_rate=.1):
        super().__init__(n_iter, learning_rate, loss=BinaryCrossEntropy())

    def predict(self, X):
        raw_predictions = super().predict(X)
        proba_positive_class = self.loss.raw_predictions_to_proba(raw_predictions)
        return proba_positive_class > .5


X, y = make_classification(n_samples=1000, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)
print('accuracy on training data: {:.3f}'.format(gb.score(X_train, y_train)))
print('accuracy on test data: {:.3f}'.format(gb.score(X_test, y_test)))
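The loss already knows how to turn raw predictions into probabilities, so a predict_proba-style output is only a couple of lines away (this snippet is an addition, not part of the original notebook; it reuses the fitted gb and numpy from the cells above):

# Sum the trees' raw predictions and map them to P(y=1) with the sigmoid,
# exactly as GradientBoostingClassifier.predict does before thresholding at 0.5.
raw_predictions = sum(tree.predict(X_test) for tree in gb.predictors)
proba_positive_class = gb.loss.raw_predictions_to_proba(raw_predictions)
print('first 5 predicted P(y=1):', np.round(proba_positive_class[:5], 3))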