In [2]:
# Using the spareness associated with L1 norms to preprocess
# the features. Similar to Lasso Regression example.

In [3]:
# Steps:
# -> Use diabetes dataset to fit a regression.
# -> Fit a basic Linear Regression model with a ShuffleSplit CV.
# -> Use LassoRegression to find the coefficients that are 0 when
#    using an L1 penalty.
# -> Use feature selection to remove uninformative features.
# -> Refit the linear regression and check to see how well it fits
#    compared with the fully featured model.

In [4]:
import sklearn.datasets as ds
diabetes = ds.load_diabetes()

In [5]:
from sklearn import linear_model
lr = linear_model.LinearRegression()

In [6]:
from sklearn import metrics
from sklearn import cross_validation

In [7]:
shuff = cross_validation.ShuffleSplit(diabetes.target.size)

In [8]:
mses = []
for train, test in shuff:
    train_X = diabetes.data[train]
    train_y = diabetes.target[train]
    
    test_X = diabetes.data[test]
    test_y = diabetes.target[test]
    
    lr.fit(train_X, train_y)
    mses.append(metrics.mean_squared_error(test_y,
                                           lr.predict(test_X)))

In [9]:
import numpy as np

In [10]:
np.mean(mses)


Out[10]:
2633.9355340514398

In [11]:
# this number is our regular fit on all the features. Now, test
# to see if eliminating any features helps.

In [12]:
from sklearn import feature_selection

In [14]:
cv = linear_model.LassoCV()
cv.fit(diabetes.data, diabetes.target)
cv.coef_


Out[14]:
array([  -0.        , -226.2375274 ,  526.85738059,  314.44026013,
       -196.92164002,    1.48742026, -151.78054083,  106.52846989,
        530.58541123,   64.50588257])

In [15]:
# We can remove the first feature because it's 0

In [16]:
# Use a numpy array to represent the columns that are to be
# included in the model.

In [17]:
columns = np.arange(diabetes.data.shape[1])[cv.coef_ != 0]
columns


Out[17]:
array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [19]:
l1mses = []
for train, test in shuff:
    train_X = diabetes.data[train][:, columns]
    train_y = diabetes.target[train]
    
    test_X = diabetes.data[~train][:, columns]
    test_y = diabetes.target[~train]
    
    lr.fit(train_X, train_y)
    
    l1mses.append(metrics.mean_squared_error(test_y,
                                             lr.predict(test_X)))

In [20]:
np.mean(l1mses)


Out[20]:
2878.0243806427325

In [21]:
np.mean(l1mses) - np.mean(mses)


Out[21]:
244.08884659129262

In [22]:
# How it works...

In [23]:
# first create a regression dataset with many uninformative
# features.
X, y = ds.make_regression(noise=5)

In [24]:
# first fit a normal regression.

In [25]:
mses = []
shuff = cross_validation.ShuffleSplit(y.size)
for train, test in shuff:
    train_X = X[train]
    train_y = y[train]
    
    test_X = X[test]
    test_y = y[test]
    
    lr.fit(train_X, train_y)
    
    mses.append(metrics.mean_squared_error(test_y,
                                           lr.predict(test_X)))

In [26]:
np.mean(mses)


Out[26]:
6078.6396071184108

In [27]:
# Now, do the same process for Lasso Regression.
cv.fit(X, y)


Out[27]:
LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
    max_iter=1000, n_alphas=100, n_jobs=1, normalize=False, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
    verbose=False)

In [28]:
columns = np.arange(X.shape[1])[cv.coef_ != 0]
columns[:5]


Out[28]:
array([ 4,  6, 11, 20, 23])

In [29]:
mses = []
shuff = cross_validation.ShuffleSplit(y.size)
for train, test in shuff:
    train_X = X[train][:, columns]
    train_y = y[train]
    
    test_X = X[test][:, columns]
    test_y = y[test]
    
    lr.fit(train_X, train_y)
    mses.append(metrics.mean_squared_error(test_y,
                                           lr.predict(test_X)))

In [30]:
np.mean(mses)


Out[30]:
22.221804944027149

In [ ]: