In [1]:
import pandas as pd
import numpy as np
import sklearn.linear_model as sl

In [31]:
# Load the whitespace-delimited NCAA data set straight from the course site.
# sep=r"\s+" is the documented equivalent of the deprecated
# delim_whitespace=True and works on old and new pandas alike.
ncaa = pd.read_csv("http://www4.stat.ncsu.edu/~boos/var.select/ncaa.data2.txt", 
                   sep=r"\s+")

# Split positionally: every column but the last goes to x, the last column
# is y. `.iloc` replaces `.ix`, which was removed in pandas 1.0.
x = ncaa.iloc[:, :-1]
y = ncaa.iloc[:, -1]
x.head()


Out[31]:
x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 x11 x12 x13 x14 x15 x16 x17 x18 x19
0 13 17 9 15 28.0 0 -1.14045 3.660 4.490 3409 65.8 18 81 42.2 660000 77 100 59 1
1 28 20 32 18 18.4 18 -0.13719 2.594 3.610 7258 66.3 17 82 40.5 150555 88 94 41 25
2 32 20 20 20 34.8 18 1.55358 2.060 4.930 6405 75.0 19 71 46.5 415400 94 81 25 36
3 32 21 24 21 14.5 20 2.05712 2.887 3.876 18294 66.0 16 84 42.2 211000 93 88 26 13
4 24 20 16 20 21.8 13 -0.77082 2.565 4.960 8259 63.5 16 91 41.2 44000 90 92 32 31

In [33]:
def lasso_fit(x, y, alphas=None):
    """Fit lasso models over a grid of penalties and summarise the selected one.

    One ``sl.Lasso`` model is fitted per penalty value, in increasing order of
    penalty.  The model with the largest penalty that still has at least one
    non-zero coefficient is selected.  If every model has all of its
    coefficients equal to zero, an intercept-only fit (the mean of ``y``) is
    used instead and no predictor is reported as selected.

    Parameters
    ----------
    x : pandas.DataFrame or 2-D array
        Predictors, one row per observation (must expose ``.shape``).
    y : pandas.Series or 1-D array
        Response, one value per row of ``x``.
    alphas : iterable of float, optional
        Penalty grid to try.  Defaults to the ten values
        ``1e-15, 1e-10, 1e-8, 1e-5, 1e-4, 1e-3, 1e-2, 1, 5, 10``.

    Returns
    -------
    dict
        ``'fitted'``      : array of fitted values,
        ``'residual'``    : residual sum of squares ``sum((y - fitted)**2)``,
        ``'df_residual'`` : ``n_rows - size - 1``,
        ``'size'``        : number of non-zero coefficients in the selected model,
        ``'index'``       : 0-based positions of those coefficients (empty
                            array when nothing is selected).
    """
    if alphas is None:
        alphas = (1e-15, 1e-10, 1e-8, 1e-5, 1e-4, 1e-3, 1e-2, 1, 5, 10)

    # Walk the grid from weakest to strongest penalty and remember the last
    # model that keeps at least one predictor; the fitted estimator is kept so
    # that no second fit is needed afterwards.
    selected = None
    for alpha in sorted(alphas):
        # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and
        # removed in 1.2 -- this call needs an older scikit-learn, or a
        # scaling pipeline with a correspondingly rescaled alpha. Kept as-is
        # so the numerical results do not change.
        # max_iter is passed as an int: newer scikit-learn rejects floats.
        model = sl.Lasso(alpha=alpha, normalize=True, max_iter=100000)
        model.fit(x, y)
        if np.any(model.coef_ != 0):
            selected = model

    n_obs = x.shape[0]
    if selected is None:
        # Every penalty shrank all coefficients to zero: fall back to the
        # intercept-only model, whose least-squares fit is the mean of y.
        index = np.array([], dtype=int)
        size = 0
        fitted = np.full(n_obs, np.mean(y))
    else:
        index = np.flatnonzero(selected.coef_)
        size = len(index)
        fitted = selected.predict(x)

    # Residual sum of squares and its degrees of freedom (the extra 1
    # accounts for the intercept).
    residual = np.sum((np.asarray(y) - fitted) ** 2)
    df_residual = n_obs - size - 1

    return {'fitted': fitted, 'residual': residual, 'df_residual': df_residual,
            'size': size, 'index': index}

In [34]:
# Run the lasso selection on the predictors/response split off from `ncaa`;
# the returned dict is the cell's displayed output.
lasso_fit(x,y)


Out[34]:
{'df_residual': 92,
 'fitted': array([ 52.92355824,  57.0009211 ,  57.0009211 ,  58.36004205,
         57.0009211 ,  54.28267919,  58.36004205,  66.51476775,
         55.64180014,  63.79652585,  55.64180014,  59.719163  ,
         61.07828395,  59.719163  ,  69.23300966,  55.64180014,
         54.28267919,  62.4374049 ,  58.36004205,  51.56443729,
         62.4374049 ,  62.4374049 ,  57.0009211 ,  54.28267919,
         63.79652585,  58.36004205,  59.719163  ,  59.719163  ,
         58.36004205,  57.0009211 ,  55.64180014,  59.719163  ,
         54.28267919,  58.36004205,  52.92355824,  61.07828395,
         55.64180014,  59.719163  ,  62.4374049 ,  63.79652585,
         58.36004205,  59.719163  ,  57.0009211 ,  57.0009211 ,
         62.4374049 ,  57.0009211 ,  58.36004205,  58.36004205,
         55.64180014,  62.4374049 ,  67.87388871,  59.719163  ,
         58.36004205,  58.36004205,  59.719163  ,  55.64180014,
         57.0009211 ,  61.07828395,  59.719163  ,  59.719163  ,
         67.87388871,  62.4374049 ,  55.64180014,  52.92355824,
         51.56443729,  58.36004205,  55.64180014,  67.87388871,
         62.4374049 ,  59.719163  ,  59.719163  ,  58.36004205,
         51.56443729,  59.719163  ,  61.07828395,  57.0009211 ,
         54.28267919,  61.07828395,  61.07828395,  65.1556468 ,
         62.4374049 ,  62.4374049 ,  55.64180014,  57.0009211 ,
         65.1556468 ,  65.1556468 ,  59.719163  ,  57.0009211 ,
         65.1556468 ,  59.719163  ,  58.36004205,  55.64180014,
         61.07828395,  57.0009211 ]),
 'index': array([1]),
 'residual': 15913.861386451388,
 'size': 1}

In [ ]: