In [325]:
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from statsmodels.tools.eval_measures import rmse
import matplotlib.pylab as plt
In [326]:
# Make pylab inline and set the theme to 'ggplot'
plt.style.use('ggplot')
%pylab inline
In [327]:
# Read Boston Housing Data
data = pd.read_csv('Datasets/Housing.csv')
In [328]:
# Create a data frame with all the independent features
data_indep = data.drop('medv', axis = 1)
# Create a target vector(vector of dependent variable, i.e. 'medv')
data_dep = data['medv']
# Split data into training and test sets
train_X, test_X, train_y, test_y = train_test_split(data_indep, data_dep,
test_size = 0.20,
random_state = 42)
In [329]:
# Now let's fit a GradientBoostingRegressor with a L1(Least Absolute Deviation) loss function
# Set a random seed so that we can reproduce the results
np.random.seed(32767)
# A GradientBoostingRegressor with L1(Least Absolute Deviation) as the loss function
mod = GradientBoostingRegressor(loss='lad')
fit = mod.fit(train_X, train_y)
predict = fit.predict(test_X)
# Root Mean Squared Error
print "RMSE -> %f" % rmse(predict, test_y)
In [330]:
# Suppress printing numpy array in scientific notation
np.set_printoptions(suppress=True)
error = predict - test_y
# Print squared errors in all test samples
np.around(error ** 2, decimals = 2)
Out[330]:
In [331]:
# A GradientBoostingRegressor with L2(Least Squares) as the loss function
mod = GradientBoostingRegressor(loss='ls')
fit = mod.fit(train_X, train_y)
predict = fit.predict(test_X)
# Root Mean Squared Error
print "RMSE -> %f" % rmse(predict, test_y)
In [332]:
error = predict - test_y
# Print squared errors in all test samples
np.around(error ** 2, decimals = 2)
Out[332]:
As apparent from RMSE errors of L1 and L2 loss functions, Least Squares(L2) outperform L1, when there are no outliers in the data.
In [333]:
# Some statistics about the Housing Data
data.describe()
Out[333]:
After looking at the minimum and maximum values of 'medv' column, we can see that the range of values in 'medv' is [5, 50].
Let's add a few Outliers in this Dataset, so that we can see some significant differences with L1 and L2 loss functions.
In [334]:
# Get upper and lower bounds[min, max] of all the features
stats = data.describe()
extremes = stats.loc[['min', 'max'],:].drop('medv', axis = 1)
extremes
Out[334]:
Now, we are going to generate 5 random samples, such that their values lies in the [min, max] range of respective features.
In [335]:
# Set a random seed
np.random.seed(1234)
# Create 5 random values
rands = np.random.rand(5, 1)
rands
Out[335]:
In [336]:
# Get the 'min' and 'max' rows as numpy array
min_array = np.array(extremes.loc[['min'], :])
max_array = np.array(extremes.loc[['max'], :])
# Find the difference(range) of 'max' and 'min'
range = max_array - min_array
range
Out[336]:
In [337]:
# Generate 5 samples with 'rands' value
outliers_X = (rands * range) + min_array
outliers_X
Out[337]:
In [338]:
# We will also create some hard coded outliers for 'medv', i.e. our target
medv_outliers = np.array([0, 0, 600, 700, 600])
In [339]:
# Let's have a look at the data types of all the columns
# so that we can create our new Dataset compatible with the original one
data_indep.dtypes
Out[339]:
In [340]:
# Change the type of 'chas', 'rad' and 'tax' to rounded of Integers
outliers_X[:, [3, 8, 9]] = np.int64(np.round(outliers_X[:, [3, 8, 9]]))
In [341]:
# Finally concatenate our existing 'train_X' and 'train_y' with these outliers
train_X = np.append(train_X, outliers_X, axis = 0)
train_y = np.append(train_y, medv_outliers, axis = 0)
In [342]:
# Plot a histogram of 'medv' in train_y
fig = plt.figure(figsize=(13,7))
plt.hist(train_y, bins=50, range = (-10, 800))
fig.suptitle('medv Count', fontsize = 20)
plt.xlabel('medv', fontsize = 16)
plt.ylabel('count', fontsize = 16)
Out[342]:
You can see there are some clear outliers at 600, 700 and even one or two 'medv' values are 0.
Since, our outliers are in place now, we will once again fit the GradientBoostingRegressor with L1 and L2 Loss function to see the contrast in their performances with outliers.
In [343]:
# So let's fit a GradientBoostingRegressor with a L1(Least Absolute Deviation) loss function
np.random.seed(9876)
# A GradientBoostingRegressor with L1(Least Absolute Deviation) as the loss function
mod = GradientBoostingRegressor(loss='lad')
fit = mod.fit(train_X, train_y)
predict = fit.predict(test_X)
# Root Mean Squared Error
print "RMSE -> %f" % rmse(predict, test_y)
In [344]:
error = predict - test_y
# Print squared errors in all test samples
np.around(error ** 2, decimals = 2)
Out[344]:
In [345]:
# A GradientBoostingRegressor with L2(Least Squares) as the loss function
mod = GradientBoostingRegressor(loss='ls')
fit = mod.fit(train_X, train_y)
predict = fit.predict(test_X)
# Root Mean Squared Error
print "RMSE -> %f" % rmse(predict, test_y)
In [346]:
error = predict - test_y
# Print squared errors in all test samples
np.around(error ** 2, decimals = 2)
Out[346]:
With outliers in the dataset, a L2(Loss function) tries to adjust the coefficients features according to these outliers on the expense of other features, since the squared-error is going to be huge for these outliers(for error > 1). On the other hand L1(Least absolute deviation) is quite robust to outliers.
As a result, L2 loss function may result in huge deviations in some of the samples which results in reduced accuracy.