This notebook illustrates cross-validated predictions on the Boston house-prices dataset. It is a version of the scikit-learn example Plotting Cross-Validated Predictions.
The main point it shows is using pandas structures throughout the code, and how they integrate nicely with seaborn.
In [14]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn import model_selection
import seaborn as sns
sns.set_style('whitegrid')
sns.despine()
from ibex import trans
from ibex.sklearn import linear_model as pd_linear_model
from ibex.sklearn import decomposition as pd_decomposition
from ibex.sklearn import preprocessing as pd_preprocessing
from ibex.sklearn import ensemble as pd_ensemble
from ibex import xgboost as pd_xgboost
from ibex.sklearn import model_selection as pd_model_selection
%pylab inline
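Before loading the data, here is a minimal sketch (not part of the original notebook; the toy frame and names are made up, and it reuses the imports above) of the pandas-through-and-through point: an Ibex-wrapped estimator accepts a DataFrame and a Series, and its predict is expected to return a pandas Series carrying the input's index, so results concatenate cleanly with other columns.
# Hypothetical toy data; the Boston DataFrame below should behave the same way.
toy = pd.DataFrame(
    {'x': [0., 1., 2., 3.], 'y': [0.5, 1.4, 2.6, 3.5]},
    index=['a', 'b', 'c', 'd'])
prd = pd_linear_model.LinearRegression().fit(toy[['x']], toy.y)
y_hat = prd.predict(toy[['x']])
# y_hat should be a Series indexed 'a'..'d', so
# pd.concat([y_hat, toy.y], axis=1) lines the two up by index.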
First we load the dataset into a pandas.DataFrame.
In [9]:
dataset = datasets.load_boston()
boston = pd.DataFrame(dataset.data, columns=dataset.feature_names)
features = dataset.feature_names
boston['price'] = dataset.target
boston.head()
Out[9]:
We will compare three predictors: a linear regressor, a random-forest regressor, and an XGBoost regressor.
In [10]:
linear_y_hat = pd_model_selection.cross_val_predict(
    pd_linear_model.LinearRegression(),
    boston[features],
    boston.price)
linear_y_hat.head()
Out[10]:
In [11]:
linear_cv = pd.concat([linear_y_hat, boston.price], axis=1)
linear_cv['type'] = 'linear'
linear_cv.columns = ['y_hat', 'y', 'regressor']
linear_cv.head()
Out[11]:
In [32]:
rf_y_hat = pd_model_selection.cross_val_predict(
    pd_ensemble.RandomForestRegressor(),
    boston[features],
    boston.price)
rf_cv = pd.concat([rf_y_hat, boston.price], axis=1)
rf_cv['type'] = 'rf'
rf_cv.columns = ['y_hat', 'y', 'regressor']
In [33]:
xgb_rf_y_hat = pd_model_selection.cross_val_predict(
    pd_xgboost.XGBRegressor(),
    boston[features],
    boston.price)
xgb_rf_cv = pd.concat([xgb_rf_y_hat, boston.price], axis=1)
xgb_rf_cv['type'] = 'xgb_rf'
xgb_rf_cv.columns = ['y_hat', 'y', 'regressor']
In [34]:
cvs = pd.concat([linear_cv, rf_cv, xgb_rf_cv])
In [35]:
cvs.regressor.unique()
Out[35]:
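Before plotting, one way to quantify the comparison is the cross-validated R² of each regressor (a sketch; sklearn.metrics is assumed here, as it is not imported above):
from sklearn import metrics
# Cross-validated R^2 per regressor; higher is better.
cvs.groupby('regressor').apply(lambda d: metrics.r2_score(d.y, d.y_hat))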
Finally, we can plot the results:
In [37]:
min_, max_ = cvs[['y_hat', 'y']].min().min(), cvs[['y_hat', 'y']].max().max()
sns.lmplot(
    x='y',
    y='y_hat',
    hue='regressor',
    data=cvs,
    palette={'linear': 'grey', 'rf': 'brown', 'xgb_rf': 'green'});
plot(np.linspace(min_, max_, 100), np.linspace(min_, max_, 100), '--', color='darkgrey');
tick_params(colors='0.6')
xlim((min_, max_))
ylim((min_, max_))
figtext(
    0,
    -0.1,
    'Cross-validated predictions for the linear, random-forest, and XGBoost regressors on the price in the Boston dataset;\n'
    'the linear regressor has inferior performance here, in particular for lower prices');