In [1]:
from preamble import *
%matplotlib notebook

Gradient Boosting


In [2]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
cancer = load_breast_cancer()

X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0)

gbrt = GradientBoostingClassifier(random_state=0)
gbrt.fit(X_train, y_train)

print("accuracy on training set: %f" % gbrt.score(X_train, y_train))
print("accuracy on test set: %f" % gbrt.score(X_test, y_test))


accuracy on training set: 1.000000
accuracy on test set: 0.958042
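
The default model fits the training set perfectly, which suggests it is overfitting. Two common ways to rein in a gradient boosted model are to limit the depth of the individual trees (max_depth) and to lower the learning_rate, as the next two cells show. To watch how accuracy develops as trees are added, staged_predict scores the ensemble after every boosting stage; a minimal sketch (plt and the data come from the cells above, imports repeated for completeness):

In [ ]:
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

gbrt = GradientBoostingClassifier(random_state=0)
gbrt.fit(X_train, y_train)

# test accuracy after each additional tree in the ensemble
test_scores = [accuracy_score(y_test, y_pred)
               for y_pred in gbrt.staged_predict(X_test)]
plt.plot(test_scores)
plt.xlabel("number of trees")
plt.ylabel("test set accuracy")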

In [3]:
gbrt = GradientBoostingClassifier(random_state=0, max_depth=1)
gbrt.fit(X_train, y_train)

print("accuracy on training set: %f" % gbrt.score(X_train, y_train))
print("accuracy on test set: %f" % gbrt.score(X_test, y_test))


accuracy on training set: 0.990610
accuracy on test set: 0.972028

In [4]:
gbrt = GradientBoostingClassifier(random_state=0, learning_rate=0.01)
gbrt.fit(X_train, y_train)

print("accuracy on training set: %f" % gbrt.score(X_train, y_train))
print("accuracy on test set: %f" % gbrt.score(X_test, y_test))


accuracy on training set: 0.988263
accuracy on test set: 0.965035
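
A lower learning rate makes each tree contribute less, so the model usually needs more trees to fit equally well; learning_rate and n_estimators are typically tuned together. A hedged sketch combining a small learning_rate, shallow trees, and a larger n_estimators (the values are illustrative, not tuned):

In [ ]:
gbrt = GradientBoostingClassifier(random_state=0, max_depth=1,
                                  learning_rate=0.01, n_estimators=1000)
gbrt.fit(X_train, y_train)

print("accuracy on training set: %f" % gbrt.score(X_train, y_train))
print("accuracy on test set: %f" % gbrt.score(X_test, y_test))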

In [5]:
gbrt = GradientBoostingClassifier(random_state=0, max_depth=1)
gbrt.fit(X_train, y_train)

# plot the feature importances as a horizontal bar chart
plt.barh(range(cancer.data.shape[1]), gbrt.feature_importances_)
# label each bar with its feature name
plt.yticks(range(cancer.data.shape[1]), cancer.feature_names)
# shift the axes to the right to leave room for the long feature names
ax = plt.gca()
ax.set_position([0.4, .2, .9, .9])



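To list the most influential features directly instead of reading them off the bar chart, the importances can be sorted; a small sketch (np comes from the preamble, imported again here for completeness):

In [ ]:
import numpy as np

# feature indices, sorted from most to least important
order = np.argsort(gbrt.feature_importances_)[::-1]
for i in order[:10]:
    print("%s: %.3f" % (cancer.feature_names[i], gbrt.feature_importances_[i]))
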
In [8]:
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
print("accuracy on training set: %f" % xgb.score(X_train, y_train))
print("accuracy on test set: %f" % xgb.score(X_test, y_test))


accuracy on training set: 1.000000
accuracy on test set: 0.979021

In [10]:
from xgboost import XGBClassifier
xgb = XGBClassifier(n_estimators=1000)
xgb.fit(X_train, y_train)
print("accuracy on training set: %f" % xgb.score(X_train, y_train))
print("accuracy on test set: %f" % xgb.score(X_test, y_test))


accuracy on training set: 1.000000
accuracy on test set: 0.979021
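
Raising n_estimators to 1000 leaves both scores unchanged here. XGBClassifier exposes tuning knobs analogous to the scikit-learn estimator, including learning_rate, max_depth, and n_estimators; a hedged sketch of setting them explicitly (the values are illustrative, not tuned):

In [ ]:
xgb = XGBClassifier(n_estimators=1000, learning_rate=0.01, max_depth=1)
xgb.fit(X_train, y_train)
print("accuracy on training set: %f" % xgb.score(X_train, y_train))
print("accuracy on test set: %f" % xgb.score(X_test, y_test))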

Exercise

Use GradientBoostingRegressor on the Bike dataset. Search over the learning_rate and max_depth using GridSearchCV. What happens if you change n_estimators?
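
A possible starting point for the grid search, assuming X_bike and y_bike already hold the Bike dataset features and target (loading the data is left to the exercise):

In [ ]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

param_grid = {"learning_rate": [0.01, 0.1, 1],
              "max_depth": [1, 3, 5]}
grid = GridSearchCV(GradientBoostingRegressor(random_state=0),
                    param_grid, cv=5)
# X_bike and y_bike are placeholders for the Bike dataset
grid.fit(X_bike, y_bike)
print(grid.best_params_)
print(grid.best_score_)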

Compare the speed of XGBClassifier with GradientBoostingClassifier. How well does XGBRegressor do with defaults on the Bike dataset? Can you make it do better?
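
A minimal way to compare fit times, shown on the cancer data already loaded above; the same pattern applies to the regressors on the Bike dataset:

In [ ]:
import time

for name, model in [("GradientBoostingClassifier", GradientBoostingClassifier(random_state=0)),
                    ("XGBClassifier", XGBClassifier())]:
    start = time.time()
    model.fit(X_train, y_train)
    print("%s: %.2f s, test accuracy %.3f"
          % (name, time.time() - start, model.score(X_test, y_test)))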