notebook.community

Edit and run



In [2]:

    
# Using the spareness associated with L1 norms to preprocess
# the features. Similar to Lasso Regression example.



In [3]:

    
# Steps:
# -> Use diabetes dataset to fit a regression.
# -> Fit a basic Linear Regression model with a ShuffleSplit CV.
# -> Use LassoRegression to find the coefficients that are 0 when
#    using an L1 penalty.
# -> Use feature selection to remove uninformative features.
# -> Refit the linear regression and check to see how well it fits
#    compared with the fully featured model.



In [4]:

    
import sklearn.datasets as ds
diabetes = ds.load_diabetes()



In [5]:

    
from sklearn import linear_model
lr = linear_model.LinearRegression()



In [6]:

    
from sklearn import metrics
from sklearn import cross_validation



In [7]:

    
shuff = cross_validation.ShuffleSplit(diabetes.target.size)



In [8]:

    
mses = []
for train, test in shuff:
    train_X = diabetes.data[train]
    train_y = diabetes.target[train]
    
    test_X = diabetes.data[test]
    test_y = diabetes.target[test]
    
    lr.fit(train_X, train_y)
    mses.append(metrics.mean_squared_error(test_y,
                                           lr.predict(test_X)))



In [9]:

    
import numpy as np



In [10]:

    
np.mean(mses)









    Out[10]:





2633.9355340514398



In [11]:

    
# this number is our regular fit on all the features. Now, test
# to see if eliminating any features helps.



In [12]:

    
from sklearn import feature_selection



In [14]:

    
cv = linear_model.LassoCV()
cv.fit(diabetes.data, diabetes.target)
cv.coef_









    Out[14]:





array([  -0.        , -226.2375274 ,  526.85738059,  314.44026013,
       -196.92164002,    1.48742026, -151.78054083,  106.52846989,
        530.58541123,   64.50588257])



In [15]:

    
# We can remove the first feature because it's 0



In [16]:

    
# Use a numpy array to represent the columns that are to be
# included in the model.



In [17]:

    
columns = np.arange(diabetes.data.shape[1])[cv.coef_ != 0]
columns









    Out[17]:





array([1, 2, 3, 4, 5, 6, 7, 8, 9])



In [19]:

    
l1mses = []
for train, test in shuff:
    train_X = diabetes.data[train][:, columns]
    train_y = diabetes.target[train]
    
    test_X = diabetes.data[~train][:, columns]
    test_y = diabetes.target[~train]
    
    lr.fit(train_X, train_y)
    
    l1mses.append(metrics.mean_squared_error(test_y,
                                             lr.predict(test_X)))



In [20]:

    
np.mean(l1mses)









    Out[20]:





2878.0243806427325



In [21]:

    
np.mean(l1mses) - np.mean(mses)









    Out[21]:





244.08884659129262



In [22]:

    
# How it works...



In [23]:

    
# first create a regression dataset with many uninformative
# features.
X, y = ds.make_regression(noise=5)



In [24]:

    
# first fit a normal regression.



In [25]:

    
mses = []
shuff = cross_validation.ShuffleSplit(y.size)
for train, test in shuff:
    train_X = X[train]
    train_y = y[train]
    
    test_X = X[test]
    test_y = y[test]
    
    lr.fit(train_X, train_y)
    
    mses.append(metrics.mean_squared_error(test_y,
                                           lr.predict(test_X)))



In [26]:

    
np.mean(mses)









    Out[26]:





6078.6396071184108



In [27]:

    
# Now, do the same process for Lasso Regression.
cv.fit(X, y)









    Out[27]:





LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
    max_iter=1000, n_alphas=100, n_jobs=1, normalize=False, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
    verbose=False)



In [28]:

    
columns = np.arange(X.shape[1])[cv.coef_ != 0]
columns[:5]









    Out[28]:





array([ 4,  6, 11, 20, 23])



In [29]:

    
mses = []
shuff = cross_validation.ShuffleSplit(y.size)
for train, test in shuff:
    train_X = X[train][:, columns]
    train_y = y[train]
    
    test_X = X[test][:, columns]
    test_y = y[test]
    
    lr.fit(train_X, train_y)
    mses.append(metrics.mean_squared_error(test_y,
                                           lr.predict(test_X)))



In [30]:

    
np.mean(mses)









    Out[30]:





22.221804944027149



In [ ]: