In [1]:
%matplotlib inline
In [2]:
# Gradient Boosting Regression learns from its mistakes: it fits an ensemble
# of weak learners.
# -> Individually, each learner has poor accuracy, but combined they achieve
#    good accuracy.
# -> The learners are fit sequentially, so each one specializes in correcting
#    the mistakes (residuals) of the learners before it.
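In [ ]:
# A minimal sketch of that sequential idea (not scikit-learn's implementation):
# each weak learner -- here a shallow DecisionTreeRegressor with a fixed
# learning rate, both arbitrary choices for illustration -- is fit to the
# residuals left by the ensemble so far. X_demo/y_demo are throwaway names.
from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeRegressor
import numpy as np

X_demo, y_demo = make_regression(200, 2, noise=10)
learning_rate = 0.1
ensemble_pred = np.zeros_like(y_demo)
for _ in range(100):
    # fit the next weak learner to the current mistakes (residuals)
    residual = y_demo - ensemble_pred
    stump = DecisionTreeRegressor(max_depth=2).fit(X_demo, residual)
    ensemble_pred += learning_rate * stump.predict(X_demo)
np.mean((y_demo - ensemble_pred) ** 2)  # training MSE shrinks as learners are added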
In [31]:
from sklearn.datasets import make_regression
import numpy as np
In [32]:
# 1,000 samples, 2 features, Gaussian noise with standard deviation 10
X, y = make_regression(1000, 2, noise=10)
In [33]:
from sklearn.ensemble import GradientBoostingRegressor as GBR
In [34]:
gbr = GBR()
gbr.fit(X, y)
Out[34]:
In [35]:
gbr_preds = gbr.predict(X)
In [36]:
gbr_preds[:5]
Out[36]:
In [37]:
np.mean(np.power(y - gbr_preds, 2))
Out[37]:
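In [ ]:
# The cell above is the mean squared error computed by hand; the same value
# comes from scikit-learn's metric helper.
from sklearn.metrics import mean_squared_error
mean_squared_error(y, gbr_preds)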
In [38]:
from sklearn.linear_model import LinearRegression
In [39]:
lr = LinearRegression()
lr.fit(X, y)
Out[39]:
In [40]:
lr_preds = lr.predict(X)
In [41]:
# see how GBR performs vs Linear Regression
gbr_residuals = y - gbr_preds
lr_residuals = y - lr_preds
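In [ ]:
# For a direct comparison with the GBR training error above, the same
# mean squared error for the linear model:
np.mean(np.power(y - lr_preds, 2))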
In [42]:
import matplotlib.pyplot as plt
In [43]:
f, ax = plt.subplots(figsize=(7,5))
f.tight_layout()
ax.hist(gbr_residuals, label='GBR', alpha=.5, color='r', bins=30)
ax.hist(lr_residuals, label='LR', alpha=.5, color='b', bins=30)
ax.set_title("GBR vs LR")
ax.legend(loc='best')
Out[43]:
In [44]:
np.percentile(gbr_residuals, [2.5, 97.5])
Out[44]:
In [45]:
np.percentile(lr_residuals, [2.5, 97.5])
Out[45]:
In [46]:
# The two cells above take the 2.5th and 97.5th percentiles of the residuals,
# i.e. a 95% interval, to see the error range for each model.
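In [ ]:
# One quick way to compare the two 95% residual intervals above is their width;
# the narrower interval indicates the tighter error spread.
gbr_width = np.diff(np.percentile(gbr_residuals, [2.5, 97.5]))
lr_width = np.diff(np.percentile(lr_residuals, [2.5, 97.5]))
gbr_width, lr_width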
In [47]:
n_estimators = np.arange(100, 1100, 350)
gbrs = [GBR(n_estimators=n_estimator) for n_estimator in n_estimators]
In [48]:
residuals = {}
for gbr in gbrs:
    gbr.fit(X, y)
    residuals[gbr.n_estimators] = y - gbr.predict(X)
In [54]:
f, ax = plt.subplots(figsize=(7,5))
f.tight_layout()
colors = ['r', 'g', 'b']
for i, gbr in enumerate(gbrs):
    ax.hist(residuals[gbr.n_estimators], color=colors[i], alpha=.333,
            label="n_estimators: {}".format(gbr.n_estimators), bins=25)
ax.set_title("Residuals at various estimators")
ax.legend(loc='best')
Out[54]:
In [56]:
# The graph above should show that as the number of estimators goes up,
# the (training) error goes down.
# Strongly consider tuning max_depth, because each of the weak learners is a
# decision tree and its depth controls how complex each learner can be.
# Also strongly consider tuning the loss function, because it determines how
# the error is computed; the default is least squares ('ls', renamed
# 'squared_error' in newer scikit-learn releases).
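In [ ]:
# A sketch of tuning max_depth and the loss with a small grid search; the grid
# values are arbitrary choices for illustration, and it assumes a scikit-learn
# version with sklearn.model_selection. The loss names follow the version used
# in this notebook ('ls'); newer releases call the same loss 'squared_error'.
from sklearn.model_selection import GridSearchCV

param_grid = {'max_depth': [2, 3, 5],
              'loss': ['ls', 'huber']}
search = GridSearchCV(GBR(n_estimators=100), param_grid,
                      scoring='neg_mean_squared_error', cv=3)
search.fit(X, y)
search.best_params_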
In [ ]: