In [1]:
%autosave 10
Relevant estimators in scikit-learn:
- sklearn.tree.DecisionTreeClassifier / DecisionTreeRegressor
- sklearn.ensemble.AdaBoostClassifier / AdaBoostRegressor
- sklearn.ensemble.GradientBoostingClassifier / GradientBoostingRegressor (the regressor supports robust losses such as huber)
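As an illustrative sketch (not part of the original notebook; the synthetic data and parameter values are assumptions), the regressor counterpart can be fit with the robust huber loss:

In [ ]:
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

# Synthetic regression data (assumption, for illustration only).
rng = np.random.RandomState(0)
X_r = rng.uniform(size=(1000, 5))
y_r = 10 * X_r[:, 0] + rng.normal(scale=1.0, size=1000)

reg = GradientBoostingRegressor(loss='huber', n_estimators=200, max_depth=3)
reg.fit(X_r, y_r)
reg.predict(X_r[:5])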
In [2]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import make_hastie_10_2
X, y = make_hastie_10_2(n_samples=10000)
est = GradientBoostingClassifier(n_estimators=200, max_depth=3)
est.fit(X, y)
pred = est.predict(X)
est.predict_proba(X)[0] # class probabilities
Out[2]:
In [4]:
import matplotlib.pyplot as plt

for pred in est.staged_predict(X):  # one prediction array per boosting stage
    plt.plot(X[:, 0], pred, color='r', alpha=0.1)
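Staged predictions are easier to read on a 1-D regression toy problem; the following sketch (the sinusoidal data and estimator settings are assumptions, not from the original notebook) overlays the prediction curve after each stage:

In [ ]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor

rng = np.random.RandomState(1)
x = np.sort(rng.uniform(0, 10, size=200))
y_toy = np.sin(x) + rng.normal(scale=0.3, size=200)

reg = GradientBoostingRegressor(n_estimators=100, max_depth=2, learning_rate=0.1)
reg.fit(x[:, np.newaxis], y_toy)

# Early stages give a coarse fit; later stages refine it.
for stage_pred in reg.staged_predict(x[:, np.newaxis]):
    plt.plot(x, stage_pred, color='r', alpha=0.05)
plt.scatter(x, y_toy, s=10, color='k')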
In [ ]:
# X_test / y_test: held-back data
import numpy as np

n_estimators = len(est.estimators_)
test_score = np.empty(n_estimators)
for i, pred in enumerate(est.staged_predict(X_test)):
    test_score[i] = est.loss_(y_test, pred)  # model deviance at stage i
plt.plot(np.arange(n_estimators) + 1, test_score, label='Test')
plt.plot(np.arange(n_estimators) + 1, est.train_score_, label='Train')
plt.legend()
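A natural follow-up (not part of the original cell, but a common pattern) is to read off the boosting stage with the lowest held-out deviance, using the test_score array computed above:

In [ ]:
best_iter = int(np.argmin(test_score)) + 1
print('lowest test deviance at stage %d: %.4f' % (best_iter, test_score[best_iter - 1]))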
Hyperparameter guidance:
- max_depth controls the degree of feature interactions.
- min_samples_leaf requires sufficient samples per leaf; it adds more bias and more constraint, giving a more general model.
- learning_rate: lower values help, but need a higher n_estimators.
- subsample: fit each tree on a random subset of the training set.
- max_features: consider a random subset of features at each split.
- Tuning recipe: set n_estimators as high as possible (e.g. 3000), define a param_grid, run a grid search (parallelised via joblib) and read off gs_cv.best_params_; then push n_estimators even higher and tune learning_rate (see the grid-search sketch below).
- est.feature_importances_ lets you peek into the black box: plot it to see the most relevant features, great for the exploratory phase (see the plotting sketch below).
- partial_dependence (sklearn.ensemble.partial_dependence) for PD plots.
In [ ]:
from sklearn.ensemble import partial_dependence as pd

features = ['foo', 'bar']  # placeholder feature names
fig, axs = pd.plot_partial_dependence(est, X_train, features, feature_names=names)
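The notes above mention param_grid / gs_cv.best_params_ and feature_importances_; the sketches below are illustrative (the parameter grid values and the bar-plot styling are assumptions, not from the original notebook):

In [ ]:
# Grid search over the main regularisation knobs, with a large n_estimators.
# GridSearchCV is in sklearn.model_selection in current scikit-learn releases
# (older releases used sklearn.grid_search); n_jobs=-1 parallelises via joblib.
from sklearn.model_selection import GridSearchCV

param_grid = {'learning_rate': [0.1, 0.05, 0.01],
              'max_depth': [3, 4, 6],
              'min_samples_leaf': [3, 9, 17]}
gs_cv = GridSearchCV(GradientBoostingClassifier(n_estimators=3000),
                     param_grid, n_jobs=-1).fit(X, y)
gs_cv.best_params_

In [ ]:
# Bar plot of feature importances, sorted so the most relevant features are on top.
import numpy as np
import matplotlib.pyplot as plt

importances = est.feature_importances_
order = np.argsort(importances)
plt.barh(np.arange(len(order)), importances[order])
plt.yticks(np.arange(len(order)), order)  # feature indices as labels
plt.xlabel('feature importance')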
The R gbm package is great; it is heavily referenced and a good source of heuristics.