In this notebook, we demonstrate the interoperability of arboretum with sklearn.model_selection for cross-validation and parameter search. We also work through an example involving feature selection inside a pipeline. We will be working with the ALS dataset — a wide, noisy dataset that tree models struggle with.
In [1]:
# Imports: arboretum's RFRegressor is benchmarked head-to-head against
# scikit-learn's RandomForestRegressor on the ALS regression task.
from arboretum.datasets import load_als
from arboretum import RFRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error as mse

# The loader hands back a ready-made train/test split.
xtr, ytr, xte, yte = load_als()

# Fit both forests with matched hyperparameters:
# 100 trees, at least 5 samples per leaf.
rf = RandomForestRegressor(n_estimators=100, min_samples_leaf=5)
rf.fit(xtr, ytr)

myrf = RFRegressor(n_trees=100, min_leaf=5)
myrf.fit(xtr, ytr)
Out[1]:
In [2]:
# Compare hold-out MSE of the two forests side by side (lower is better).
sk_pred = rf.predict(xte)
arb_pred = myrf.predict(xte)
mse(yte, sk_pred), mse(yte, arb_pred)
Out[2]:
In [5]:
# Limit features considered per split (helps on this wide, noisy data),
# then grid-search the leaf-size regularizer via cross-validation.
rf.max_features = 30
params = {'min_samples_leaf': [1, 5, 10, 20]}
# NOTE: `scoring` must be passed by keyword — in modern scikit-learn every
# GridSearchCV parameter after `param_grid` is keyword-only, so passing
# the scorer positionally raises a TypeError.
gcv = GridSearchCV(rf, params, scoring='neg_mean_squared_error')
gcv.fit(xtr, ytr)
pred = gcv.predict(xte)
mse(yte, pred), gcv.best_score_, gcv.best_params_
Out[5]:
In [7]:
# Same search for the arboretum forest; its leaf-size knob is `min_leaf`.
myrf.max_features = 30
myparams = {'min_leaf': [1, 5, 10, 20]}
# NOTE: `scoring` must be passed by keyword — in modern scikit-learn every
# GridSearchCV parameter after `param_grid` is keyword-only, so passing
# the scorer positionally raises a TypeError.
mygcv = GridSearchCV(myrf, myparams, scoring='neg_mean_squared_error')
mygcv.fit(xtr, ytr)
mypred = mygcv.predict(xte)
mse(yte, mypred), mygcv.best_score_, mygcv.best_params_
Out[7]:
In [10]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

# Restore default feature subsampling on the forest and instead let a
# univariate filter pick the 30 strongest columns before fitting.
rf.max_features = None
skb = SelectKBest(f_regression, k=30)
pipe = Pipeline(steps=[('select', skb), ('model', rf)])
pipe.fit(xtr, ytr)
mse(yte, pipe.predict(xte))
Out[10]:
In [11]:
# Same select-then-model pipeline, this time wrapping arboretum's forest —
# demonstrating it slots into sklearn's Pipeline like a native estimator.
myrf.max_features = None
mypipe = Pipeline(steps=[('select', skb), ('model', myrf)])
mypipe.fit(xtr, ytr)
mse(yte, mypipe.predict(xte))
Out[11]: