In [2]:
# Using the sparseness associated with L1 norms to preprocess
# the features, similar to the Lasso regression example.
In [3]:
# Steps:
# -> Use the diabetes dataset to fit a regression.
# -> Fit a basic linear regression model with a ShuffleSplit CV.
# -> Use LassoCV to find the coefficients that become 0 under
#    an L1 penalty.
# -> Use feature selection to remove the uninformative features.
# -> Refit the linear regression and check how well it fits
#    compared with the fully featured model.
In [4]:
import sklearn.datasets as ds
diabetes = ds.load_diabetes()
In [5]:
from sklearn import linear_model
lr = linear_model.LinearRegression()
In [6]:
from sklearn import metrics
from sklearn import model_selection  # formerly cross_validation, removed in 0.20
In [7]:
shuff = model_selection.ShuffleSplit(n_splits=10)
In [8]:
mses = []
for train, test in shuff.split(diabetes.data):
    train_X = diabetes.data[train]
    train_y = diabetes.target[train]
    test_X = diabetes.data[test]
    test_y = diabetes.target[test]
    lr.fit(train_X, train_y)
    mses.append(metrics.mean_squared_error(test_y,
                                           lr.predict(test_X)))
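In [ ]:
# Aside: the loop above can be written more compactly with
# cross_val_score. A minimal sketch, assuming the modern
# scikit-learn scorer name 'neg_mean_squared_error' (note the
# sign flip back to a positive MSE):
scores = model_selection.cross_val_score(
    lr, diabetes.data, diabetes.target,
    cv=shuff, scoring='neg_mean_squared_error')
-scores.mean()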
In [9]:
import numpy as np
In [10]:
np.mean(mses)
Out[10]:
In [11]:
# This number is the mean MSE for the regular fit on all the
# features. Now, test whether eliminating any features helps.
In [12]:
from sklearn import feature_selection
In [14]:
cv = linear_model.LassoCV()
cv.fit(diabetes.data, diabetes.target)
cv.coef_
Out[14]:
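In [ ]:
# LassoCV cross-validates over a path of alphas and refits with the
# best one; the chosen penalty and the number of zeroed coefficients
# can be checked directly (a quick aside, using LassoCV's documented
# fitted attributes):
cv.alpha_, (cv.coef_ == 0).sum()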
In [15]:
# We can remove the first feature, because its coefficient is 0.
In [16]:
# Use a NumPy array to hold the indices of the columns that will
# be kept in the model.
In [17]:
columns = np.arange(diabetes.data.shape[1])[cv.coef_ != 0]
columns
Out[17]:
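In [ ]:
# The feature_selection import above suggests an equivalent route:
# SelectFromModel can wrap the already-fitted LassoCV and drop the
# zero-coefficient columns for us. A sketch, assuming scikit-learn's
# SelectFromModel with prefit=True:
sfm = feature_selection.SelectFromModel(cv, prefit=True)
reduced_X = sfm.transform(diabetes.data)
reduced_X.shape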
In [19]:
l1mses = []
for train, test in shuff.split(diabetes.data):
    train_X = diabetes.data[train][:, columns]
    train_y = diabetes.target[train]
    test_X = diabetes.data[test][:, columns]
    test_y = diabetes.target[test]
    lr.fit(train_X, train_y)
    l1mses.append(metrics.mean_squared_error(test_y,
                                             lr.predict(test_X)))
In [20]:
np.mean(l1mses)
Out[20]:
In [21]:
# A negative difference means the reduced model has the lower
# mean MSE.
np.mean(l1mses) - np.mean(mses)
Out[21]:
In [22]:
# How it works...
In [23]:
# First, create a regression dataset with many uninformative
# features; make_regression defaults to 100 features, of which
# only 10 are informative.
X, y = ds.make_regression(noise=5)
In [24]:
# Fit a normal linear regression on all the features.
In [25]:
mses = []
shuff = model_selection.ShuffleSplit(n_splits=10)
for train, test in shuff.split(X):
    train_X = X[train]
    train_y = y[train]
    test_X = X[test]
    test_y = y[test]
    lr.fit(train_X, train_y)
    mses.append(metrics.mean_squared_error(test_y,
                                           lr.predict(test_X)))
In [26]:
np.mean(mses)
Out[26]:
In [27]:
# Now, do the same process with Lasso regression.
cv.fit(X, y)
Out[27]:
In [28]:
columns = np.arange(X.shape[1])[cv.coef_ != 0]
columns[:5]
Out[28]:
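In [ ]:
# How aggressively did the L1 penalty prune? With make_regression's
# defaults (100 features, only 10 informative), most coefficients
# should be driven to 0, so few columns survive.
len(columns)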
In [29]:
mses = []
shuff = model_selection.ShuffleSplit(n_splits=10)
for train, test in shuff.split(X):
    train_X = X[train][:, columns]
    train_y = y[train]
    test_X = X[test][:, columns]
    test_y = y[test]
    lr.fit(train_X, train_y)
    mses.append(metrics.mean_squared_error(test_y,
                                           lr.predict(test_X)))
In [30]:
np.mean(mses)
Out[30]:
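In [ ]:
# A sketch that consolidates the repeated evaluation loops above
# into one helper; the function name and signature are illustrative,
# not part of the original recipe.
def shuffle_split_mse(model, X, y, n_splits=10):
    # Mean MSE of `model` across ShuffleSplit train/test splits.
    errors = []
    splits = model_selection.ShuffleSplit(n_splits=n_splits).split(X)
    for train, test in splits:
        model.fit(X[train], y[train])
        errors.append(metrics.mean_squared_error(
            y[test], model.predict(X[test])))
    return np.mean(errors)

shuffle_split_mse(lr, X[:, columns], y)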