In [30]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.tree import ExtraTreeRegressor
from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn import ensemble
In [31]:
# Load the boston housing data
boston_house_data = load_boston()
# Create a data frame of samples and feature values.
data_X_df = pd.DataFrame(boston_house_data.data, columns=boston_house_data.feature_names)
data_X_df.head()
Out[31]:
In [32]:
data_scaler = preprocessing.MinMaxScaler()
target_scaler = preprocessing.MinMaxScaler()
data = data_scaler.fit_transform(boston_house_data.data)
# MinMaxScaler expects a 2D array, so reshape the 1D target before scaling.
target = target_scaler.fit_transform(boston_house_data.target.reshape(-1, 1)).ravel()
# data = data_X_df.values
# target = boston_house_data.target
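Because the targets are now on a [0, 1] scale, the MSE values reported later are on that scale too; target_scaler.inverse_transform maps values back to the original prices. A minimal sanity check (not part of the original run):
In [ ]:
# Sanity check: undo the target scaling to recover prices in the original units
# (median home value in $1000s). inverse_transform expects a 2D array.
restored = target_scaler.inverse_transform(target.reshape(-1, 1)).ravel()
print("First five targets, original scale:", restored[:5])
print("First five targets, scaled:", target[:5])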
In [33]:
# Split into train and test sets and print their dimensions.
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.25, random_state=4)
print("Dimension of X_train =", X_train.shape)
print("Dimension of y_train =", y_train.shape)
print("Dimension of X_test =", X_test.shape)
print("Dimension of y_test =", y_test.shape)
In [34]:
class StackedRegressor:
    def __init__(self, base_regressors, meta_regressor):
        """Constructor for StackedRegressor. Takes a list of base (level-0)
        regressors and a meta (level-1) regressor."""
        self.__base_regressors = base_regressors
        self.__meta_regressor = meta_regressor
        self.__kbest = None

    def fit(self, X, y, split=True, kbest=None):
        if kbest:
            # Keep only the kbest features with the highest f_regression scores.
            kb = SelectKBest(f_regression, k=kbest).fit(X, y)
            self.__kbest = kb.scores_.argsort()[-kbest:]
            X = X[:, self.__kbest]
        if split:
            # Split the data so the meta regressor is trained on out-of-sample
            # base predictions and does not overfit.
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=4)
        else:
            X_train, X_test, y_train, y_test = X, X, y, y
        # Fit the level-0 regressors and collect their predictions.
        meta_input = [regressor.fit(X_train, y_train).predict(X_test) for regressor in self.__base_regressors]
        # Fit the meta regressor on the level-0 predictions.
        X_meta = np.array(meta_input).transpose()
        self.__meta_regressor.fit(X_meta, y_test)
        return self

    def predict(self, X):
        if self.__kbest is not None:
            X = X[:, self.__kbest]
        # Predict with the level-0 regressors.
        self.base_regressors_predict_ = [regressor.predict(X) for regressor in self.__base_regressors]
        # Feed the level-0 predictions to the meta regressor for the final values.
        X_meta = np.array(self.base_regressors_predict_).transpose()
        return self.__meta_regressor.predict(X_meta)

    def scores(self, X, y):
        if self.__kbest is not None:
            X = X[:, self.__kbest]
        X_meta = np.array([regressor.predict(X) for regressor in self.__base_regressors]).transpose()
        self.score_base_regressors_ = [regressor.score(X, y) for regressor in self.__base_regressors]
        self.score_meta_regressor_ = self.__meta_regressor.score(X_meta, y)
        self.mse_base_regressors_ = [mean_squared_error(y, X_meta[:, i]) for i in range(X_meta.shape[1])]
        self.mse_meta_regressor_ = mean_squared_error(y, self.__meta_regressor.predict(X_meta))
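Before wiring this into the evaluation helper below, a quick smoke test of the class on the split from above (the regressor choices here are illustrative only, not from the original run):
In [ ]:
# Smoke test: two illustrative base regressors and a linear meta regressor.
demo = StackedRegressor(base_regressors=[Ridge(), DecisionTreeRegressor(max_depth=2)],
                        meta_regressor=LinearRegression())
demo.fit(X_train, y_train, split=True)
print("Meta prediction shape:", demo.predict(X_test).shape)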
In [35]:
def evaluate_model(base_regressors, meta_regressor, names, split=True, kbest=None):
    stacked_regressor = StackedRegressor(base_regressors=base_regressors, meta_regressor=meta_regressor)
    stacked_regressor.fit(X_train, y_train, split, kbest)
    stacked_regressor.scores(X_test, y_test)
    print("Scores of base regressors on test data =", stacked_regressor.score_base_regressors_)
    print("Score of meta regressor on test data =", stacked_regressor.score_meta_regressor_)
    print("Mean squared error of base regressors on test data =", stacked_regressor.mse_base_regressors_)
    print("Mean squared error of meta regressor on test data =", stacked_regressor.mse_meta_regressor_)
    predicted_y = stacked_regressor.predict(X_test)
    # Table of every regressor's predictions next to the true values.
    df = pd.DataFrame(stacked_regressor.base_regressors_predict_ + [predicted_y, y_test],
                      index=names + ["Original"]).T
    # Bar charts of MSE and R^2 score for the base and meta regressors.
    df2 = pd.DataFrame(
        {"MSE": stacked_regressor.mse_base_regressors_ + [stacked_regressor.mse_meta_regressor_],
         "SCORE": stacked_regressor.score_base_regressors_ + [stacked_regressor.score_meta_regressor_]},
        index=names)
    df2.plot(kind='bar', alpha=0.5, grid=True, rot=45, subplots=True, layout=(1, 2), legend=False, figsize=(12, 4))
    return df
Base regressors:
- ExtraTreeRegressor with max_depth=2
- LinearRegression

Meta regressor:
- Ridge with alpha=0.5

Best features:
- kbest=5, selected with SelectKBest and f_regression
In [36]:
base_regressors = [ExtraTreeRegressor(max_depth=2), LinearRegression()]
meta_regressor = Ridge(alpha=0.5)
names = ["Extra Tree (max_depth=2)", "Linear Regression", "Stacked Regressor"]
evaluate_model(base_regressors, meta_regressor, names, split=True, kbest=5).head()
Out[36]:
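For reference, the five features kept by the selection step can be inspected directly; a small sketch using the same f_regression scoring as StackedRegressor.fit (not part of the original run):
In [ ]:
# Inspect which five features survive the kbest=5 selection above.
kb = SelectKBest(f_regression, k=5).fit(X_train, y_train)
print("Selected features:", boston_house_data.feature_names[kb.get_support()])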
Base regressors:
- DecisionTreeRegressor with max_depth=2
- DecisionTreeRegressor with max_depth=3

Meta regressor:
- DecisionTreeRegressor with max_depth=3

Details:
- We split the training data into two sets so that the decision tree regressors cannot simply reproduce their own training targets.
- With default parameters, a DecisionTreeRegressor predicts its training data perfectly, so we must be careful (see the sketch below this list).
- We cap the trees at max_depth=2 and 3 to reduce overfitting.
- Stacking improves the results in this case, but not by much.
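The overfitting claim is easy to verify; a short sketch (not in the original run) showing that an unconstrained tree memorizes the training set:
In [ ]:
# An unconstrained decision tree memorizes the training data (train R^2 = 1.0),
# so its in-sample predictions would be useless as stacking inputs without a split.
deep_tree = DecisionTreeRegressor().fit(X_train, y_train)
print("Train R^2:", deep_tree.score(X_train, y_train))
print("Test R^2:", deep_tree.score(X_test, y_test))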
In [37]:
base_regressors = [DecisionTreeRegressor(max_depth=2), DecisionTreeRegressor(max_depth=3)]
meta_regressor = DecisionTreeRegressor(max_depth=3)
names = ["Decision Tree (max_depth=2)", "Decision Tree (max_depth=3)", "Stacked Regressor"]
evaluate_model(base_regressors, meta_regressor, names).head()
Out[37]:
In [38]:
base_regressors = [DecisionTreeRegressor(max_depth=2), DecisionTreeRegressor(max_depth=3)]
meta_regressor = LinearRegression()
names = ["Decision Tree (max_depth=2)", "Decision Tree (max_depth=3)", "Stacked Regressor"]
evaluate_model(base_regressors, meta_regressor, names).head()
Out[38]:
In [39]:
base_regressors = [DecisionTreeRegressor(max_depth=2), LinearRegression()]
meta_regressor = Ridge()
names = ["Decision Tree (max_depth=2)", "Linear Regression", "Stacked Regressor"]
evaluate_model(base_regressors, meta_regressor, names).head()
Out[39]:
In [40]:
base_regressors = [Ridge(), LinearRegression()]
meta_regressor = LinearRegression()
names = ["Ridge", "Linear", "Stacked"]
evaluate_model(base_regressors, meta_regressor, names).head()
Out[40]:
In [41]:
svr_poly = SVR(kernel='poly', C=1e3, degree=2)
lm_model = LinearRegression()
base_regressors = [svr_poly, lm_model]
# min_samples_split must be >= 2; 'squared_error' is the current name for least-squares loss.
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2, 'learning_rate': 0.01, 'loss': 'squared_error'}
gb_reg = ensemble.GradientBoostingRegressor(**params)
meta_regressor = gb_reg
names = ["SVR", "LR", "Stacked Regressor (GBR)"]
evaluate_model(base_regressors, meta_regressor, names, split=True, kbest=None).head()
Out[41]:
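To judge whether stacking helps here, a plain GradientBoostingRegressor on the raw features makes a natural baseline; an illustrative comparison, not part of the original run:
In [ ]:
# Baseline: the same GBR fit directly on the features, without stacking.
baseline_gbr = ensemble.GradientBoostingRegressor(**params).fit(X_train, y_train)
print("Plain GBR test R^2:", baseline_gbr.score(X_test, y_test))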
In [42]:
# The features are already MinMax-scaled, so Ridge needs no extra normalization.
base_regressors = [Ridge(fit_intercept=True),
                   ensemble.RandomForestRegressor(n_estimators=20, warm_start=True),
                   ensemble.GradientBoostingRegressor(n_estimators=100, warm_start=True)]
meta_regressor = LinearRegression()
names = ["Ridge", "RF", "GBR", "LR (meta)"]
evaluate_model(base_regressors, meta_regressor, names, split=True, kbest=None).head()
Out[42]:
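For comparison, scikit-learn 0.22 and later ships a built-in ensemble.StackingRegressor that trains the meta learner on out-of-fold base predictions, much like the split=True path above. A rough equivalent of the last experiment (a sketch, assuming sklearn >= 0.22):
In [ ]:
from sklearn.ensemble import StackingRegressor
# Same three base learners and linear meta learner as the previous cell.
stack = StackingRegressor(
    estimators=[("ridge", Ridge()),
                ("rf", ensemble.RandomForestRegressor(n_estimators=20)),
                ("gbr", ensemble.GradientBoostingRegressor(n_estimators=100))],
    final_estimator=LinearRegression())
print("Built-in StackingRegressor test R^2:", stack.fit(X_train, y_train).score(X_test, y_test))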