Modified from the github repo: https://github.com/JWarmenhoven/ISLR-python which is based on the book by James et al. Intro to Statistical Learning.
In [2]:
# %load ../standard_import.txt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import LeaveOneOut
%matplotlib inline
plt.style.use('ggplot')
datafolder = "../data/"
def loo_risk(X, y, regmod):
    """
    Construct the leave-one-out square error risk for a regression model
    Input: design matrix, X, response vector, y, a regression model, regmod
    Output: scalar LOO risk
    """
    loo = LeaveOneOut()
    loo_losses = []
    for train_index, test_index in loo.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        regmod.fit(X_train, y_train)
        y_hat = regmod.predict(X_test)
        loss = np.sum((y_hat - y_test)**2)
        loo_losses.append(loss)
    return np.mean(loo_losses)

def emp_risk(X, y, regmod):
    """
    Return the empirical risk for square error loss
    Input: design matrix, X, response vector, y, a regression model, regmod
    Output: scalar empirical risk
    """
    regmod.fit(X, y)
    y_hat = regmod.predict(X)
    return np.mean((y_hat - y)**2)
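As a quick sanity check, these helpers can be run on a small synthetic regression problem; this is a minimal sketch (the data here are made up for illustration and are not part of the Hitters analysis):
rng = np.random.RandomState(0)
X_demo = rng.randn(50, 3)                            # 50 samples, 3 features
y_demo = X_demo.dot([1.0, -2.0, 0.5]) + rng.randn(50)
lin = LinearRegression()
print(loo_risk(X_demo, y_demo, lin))                 # out-of-sample (LOO) risk...
print(emp_risk(X_demo, y_demo, lin))                 # ...should exceed the in-sample empirical risk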
In [3]:
# In R, I exported the dataset from package 'ISLR' to a csv file.
df = pd.read_csv(datafolder+'Hitters.csv', index_col=0).dropna()
df.index.name = 'Player'
df.info()
In [4]:
df.head()
Out[4]:
In [5]:
dummies = pd.get_dummies(df[['League', 'Division', 'NewLeague']])
dummies.info()
print(dummies.head())
In [6]:
y = df.Salary
# Drop the column with the dependent variable (Salary), and the columns for which we created dummy variables
X_ = df.drop(['Salary', 'League', 'Division', 'NewLeague'], axis=1).astype('float64')
# Define the feature set X.
X = pd.concat([X_, dummies[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)
X.info()
In [7]:
X.head(5)
Out[7]:
In [8]:
alphas = 10**np.linspace(10, -2, 100)*0.5
ridge = Ridge()
coefs = []
for a in alphas:
    ridge.set_params(alpha=a)
    ridge.fit(scale(X), y)
    coefs.append(ridge.coef_)

ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
plt.axis('tight')
plt.xlabel('lambda')
plt.ylabel('weights')
plt.title('Ridge coefficients as a function of the regularization');
The plot above shows that the ridge coefficients grow in magnitude as lambda decreases: a smaller penalty means less shrinkage toward zero.
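For reference, this is the standard ridge objective and its closed-form solution (a well-known result, not derived in this notebook; sklearn's `alpha` plays the role of $\lambda$):

$$\hat{\beta}_{\text{ridge}} = \arg\min_{\beta}\ \|y - X\beta\|_2^2 + \lambda \|\beta\|_2^2 = (X^\top X + \lambda I)^{-1} X^\top y,$$

so as $\lambda \to 0$ the estimate approaches the least-squares fit, and as $\lambda \to \infty$ every coefficient is shrunk toward zero.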
Exercise 1 Plot the LOO risk and the empirical risk as a function of lambda.
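One direct (if slow) way to get the LOO curve is to reuse the loo_risk helper defined at the top; the sketch below shows that approach (the lambda grid and the use of scale here are my own choices for illustration), whereas the notebook's own solution below uses RidgeCV's built-in LOO values:
lam_grid = np.linspace(30, 1, 100)
Xs, yv = scale(X), y.values
loo_grid = [loo_risk(Xs, yv, Ridge(alpha=a)) for a in lam_grid]
plt.plot(lam_grid, loo_grid)
plt.xlabel('lambda')
plt.ylabel('LOO risk');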
In [34]:
alphas = np.linspace(30, 1, 100)
rcv = RidgeCV(alphas=alphas, store_cv_values=True, normalize=True)
rcv.fit(X, y)
cv_vals = rcv.cv_values_       # squared LOO errors, shape (n_samples, n_alphas)
LOOr = cv_vals.mean(axis=0)    # average over samples gives the LOO risk per alpha
In [37]:
EMPr = []
for a in alphas:
    ridge.set_params(alpha=a)
    # emp_risk refits the model internally; pass the scaled design so the
    # penalty acts on the same coefficient scale as in the cells above
    EMPr.append(emp_risk(scale(X), y, ridge))
In [43]:
plt.plot(alphas,LOOr)
plt.xlabel('lambda')
plt.ylabel('Risk')
plt.title('LOO Risk for Ridge');
plt.show()
In [44]:
plt.plot(alphas,EMPr)
plt.xlabel('lambda')
plt.ylabel('Risk')
plt.title('Emp Risk for Ridge');
plt.show()
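To see the two curves together (an extra view, not in the cells above), they can be overlaid on a single figure:
plt.plot(alphas, LOOr, label='LOO risk')
plt.plot(alphas, EMPr, label='empirical risk')
plt.xlabel('lambda')
plt.ylabel('Risk')
plt.legend()
plt.title('LOO vs empirical risk for Ridge');
plt.show()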
Exercise 2 Implement and test forward stagewise regression (recall that stagewise and stepwise are different).
In [9]:
n,p = X.shape
Xsc = scale(X)
ysc = scale(y)
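For comparison, the textbook incremental forward stagewise procedure takes a small fixed step in the sign direction of the largest correlation; a minimal sketch using the scaled Xsc and ysc above (the step size eps and iteration count are illustrative assumptions, not values from this notebook):
eps = 0.01       # illustrative step size (assumption)
n_steps = 2000   # illustrative number of iterations (assumption)
beta_fs = np.zeros(p)
res_fs = ysc.copy()
for _ in range(n_steps):
    corr = Xsc.T.dot(res_fs) / n          # correlation of each predictor with the residual
    j = np.argmax(np.abs(corr))           # pick the most correlated predictor
    beta_fs[j] += eps * np.sign(corr[j])  # take a small step in its sign direction
    res_fs = ysc - Xsc.dot(beta_fs)       # refresh the residual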
I'll implement a different variant of forward stagewise: at each step, the correlation between the current residual and the most-correlated predictor is added directly to that predictor's coefficient, rather than taking a small fixed step.
In [35]:
MSEiter = []
res = ysc
beta = np.zeros(p)
tol = 1e-2
corrmax = 1.
while corrmax > tol:
    res_corr = Xsc.T.dot(scale(res)) / n   # correlation of each predictor with the (re-scaled) residual
    jmax, corrmax = max(enumerate(np.abs(res_corr)), key=lambda x: x[1])  # most correlated predictor
    beta[jmax] = beta[jmax] + res_corr[jmax]   # add its correlation to that coefficient
    res = ysc - Xsc.dot(beta)                  # update the residual
    MSE = np.sum(res**2.)                      # track the sum of squared residuals
    MSEiter.append(MSE)
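A quick way to check that the iterations are converging is to plot the recorded residual trace (an extra diagnostic, not part of the exercise itself):
plt.plot(MSEiter)
plt.xlabel('iteration')
plt.ylabel('sum of squared residuals')
plt.title('Forward stagewise convergence');
plt.show()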
In [34]:
beta
Out[34]:
In [38]:
lm = LinearRegression()
lm.fit(Xsc,ysc)
Out[38]:
In [39]:
lm.coef_
Out[39]:
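To compare the stagewise solution with the least-squares fit more directly (an extra check), the two coefficient vectors can be placed side by side:
coef_compare = pd.DataFrame({'stagewise': beta, 'OLS': lm.coef_}, index=X.columns)
print(coef_compare)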