Ridge regression and model selection

Modified from the GitHub repo https://github.com/JWarmenhoven/ISLR-python, which is based on the book An Introduction to Statistical Learning by James et al.

Loading data


In [2]:
# %load ../standard_import.txt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import scale
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import LeaveOneOut  # needed by loo_risk below

%matplotlib inline
plt.style.use('ggplot')
datafolder = "../data/"

def loo_risk(X,y,regmod):
    """
    Construct the leave-one-out square error risk for a regression model
    
    Input: design matrix, X, response vector, y, a regression model, regmod
    Output: scalar LOO risk
    """
    loo = LeaveOneOut()
    loo_losses = []
    for train_index, test_index in loo.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        regmod.fit(X_train,y_train)
        y_hat = regmod.predict(X_test)
        loss = np.sum((y_hat - y_test)**2)
        loo_losses.append(loss)
    return np.mean(loo_losses)

def emp_risk(X,y,regmod):
    """
    Return the empirical risk for square error loss
    
    Input: design matrix, X, response vector, y, a regression model, regmod
    Output: scalar empirical risk
    """
    regmod.fit(X,y)
    y_hat = regmod.predict(X)
    return np.mean((y_hat - y)**2)
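
As a quick sanity check, the two risk estimates can be compared for ordinary least squares once X and y are defined below; the LOO risk should typically exceed the empirical (in-sample) risk, which is optimistic. A minimal sketch (the variable name lin is arbitrary):

In [ ]:
# Sanity-check sketch, assuming X and y as constructed in the cells below:
# the in-sample risk should understate the LOO risk.
lin = LinearRegression()
print('LOO risk:       %.1f' % loo_risk(scale(X), y.values, lin))
print('Empirical risk: %.1f' % emp_risk(scale(X), y.values, lin))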

In [3]:
# In R, I exported the dataset from package 'ISLR' to a csv file.
df = pd.read_csv(datafolder+'Hitters.csv', index_col=0).dropna()
df.index.name = 'Player'
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 263 entries, -Alan Ashby to -Willie Wilson
Data columns (total 20 columns):
AtBat        263 non-null int64
Hits         263 non-null int64
HmRun        263 non-null int64
Runs         263 non-null int64
RBI          263 non-null int64
Walks        263 non-null int64
Years        263 non-null int64
CAtBat       263 non-null int64
CHits        263 non-null int64
CHmRun       263 non-null int64
CRuns        263 non-null int64
CRBI         263 non-null int64
CWalks       263 non-null int64
League       263 non-null object
Division     263 non-null object
PutOuts      263 non-null int64
Assists      263 non-null int64
Errors       263 non-null int64
Salary       263 non-null float64
NewLeague    263 non-null object
dtypes: float64(1), int64(16), object(3)
memory usage: 43.1+ KB

In [4]:
df.head()


Out[4]:
AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI CWalks League Division PutOuts Assists Errors Salary NewLeague
Player
-Alan Ashby 315 81 7 24 38 39 14 3449 835 69 321 414 375 N W 632 43 10 475.0 N
-Alvin Davis 479 130 18 66 72 76 3 1624 457 63 224 266 263 A W 880 82 14 480.0 A
-Andre Dawson 496 141 20 65 78 37 11 5628 1575 225 828 838 354 N E 200 11 3 500.0 N
-Andres Galarraga 321 87 10 39 42 30 2 396 101 12 48 46 33 N E 805 40 4 91.5 N
-Alfredo Griffin 594 169 4 74 51 35 11 4408 1133 19 501 336 194 A W 282 421 25 750.0 A

In [5]:
dummies = pd.get_dummies(df[['League', 'Division', 'NewLeague']])
dummies.info()
print(dummies.head())


<class 'pandas.core.frame.DataFrame'>
Index: 263 entries, -Alan Ashby to -Willie Wilson
Data columns (total 6 columns):
League_A       263 non-null float64
League_N       263 non-null float64
Division_E     263 non-null float64
Division_W     263 non-null float64
NewLeague_A    263 non-null float64
NewLeague_N    263 non-null float64
dtypes: float64(6)
memory usage: 14.4+ KB
                   League_A  League_N  Division_E  Division_W  NewLeague_A  \
Player                                                                       
-Alan Ashby             0.0       1.0         0.0         1.0          0.0   
-Alvin Davis            1.0       0.0         0.0         1.0          1.0   
-Andre Dawson           0.0       1.0         1.0         0.0          0.0   
-Andres Galarraga       0.0       1.0         1.0         0.0          0.0   
-Alfredo Griffin        1.0       0.0         0.0         1.0          1.0   

                   NewLeague_N  
Player                          
-Alan Ashby                1.0  
-Alvin Davis               0.0  
-Andre Dawson              1.0  
-Andres Galarraga          1.0  
-Alfredo Griffin           0.0  

In [6]:
y = df.Salary

# Drop the column with the dependent variable (Salary), and the columns for which we created dummy variables
X_ = df.drop(['Salary', 'League', 'Division', 'NewLeague'], axis=1).astype('float64')
# Define the feature set X.
X = pd.concat([X_, dummies[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)
X.info()


<class 'pandas.core.frame.DataFrame'>
Index: 263 entries, -Alan Ashby to -Willie Wilson
Data columns (total 19 columns):
AtBat          263 non-null float64
Hits           263 non-null float64
HmRun          263 non-null float64
Runs           263 non-null float64
RBI            263 non-null float64
Walks          263 non-null float64
Years          263 non-null float64
CAtBat         263 non-null float64
CHits          263 non-null float64
CHmRun         263 non-null float64
CRuns          263 non-null float64
CRBI           263 non-null float64
CWalks         263 non-null float64
PutOuts        263 non-null float64
Assists        263 non-null float64
Errors         263 non-null float64
League_N       263 non-null float64
Division_W     263 non-null float64
NewLeague_N    263 non-null float64
dtypes: float64(19)
memory usage: 41.1+ KB

In [7]:
X.head(5)


Out[7]:
AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI CWalks PutOuts Assists Errors League_N Division_W NewLeague_N
Player
-Alan Ashby 315.0 81.0 7.0 24.0 38.0 39.0 14.0 3449.0 835.0 69.0 321.0 414.0 375.0 632.0 43.0 10.0 1.0 1.0 1.0
-Alvin Davis 479.0 130.0 18.0 66.0 72.0 76.0 3.0 1624.0 457.0 63.0 224.0 266.0 263.0 880.0 82.0 14.0 0.0 1.0 0.0
-Andre Dawson 496.0 141.0 20.0 65.0 78.0 37.0 11.0 5628.0 1575.0 225.0 828.0 838.0 354.0 200.0 11.0 3.0 1.0 0.0 1.0
-Andres Galarraga 321.0 87.0 10.0 39.0 42.0 30.0 2.0 396.0 101.0 12.0 48.0 46.0 33.0 805.0 40.0 4.0 1.0 0.0 1.0
-Alfredo Griffin 594.0 169.0 4.0 74.0 51.0 35.0 11.0 4408.0 1133.0 19.0 501.0 336.0 194.0 282.0 421.0 25.0 0.0 1.0 0.0

Ridge Regression


In [8]:
alphas = 10**np.linspace(10,-2,100)*0.5

ridge = Ridge()
coefs = []

for a in alphas:
    ridge.set_params(alpha=a)
    ridge.fit(scale(X), y)
    coefs.append(ridge.coef_)

ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
plt.axis('tight')
plt.xlabel('lambda')
plt.ylabel('weights')
plt.title('Ridge coefficients as a function of the regularization');


The above plot shows that the ridge coefficients shrink toward zero as lambda increases; equivalently, their magnitudes grow as we decrease lambda.
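
For reference, ridge regression solves the penalized least squares problem

$$\hat{\beta}^{\text{ridge}} = \arg\min_{\beta} \; \|y - X\beta\|_2^2 + \lambda \|\beta\|_2^2,$$

so a larger lambda (the `alpha` parameter in scikit-learn) pulls the coefficient estimates toward zero, while lambda approaching zero recovers ordinary least squares.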

Exercises

Exercise 1 Plot the LOO risk and the empirical risk as a function of lambda.


In [34]:
alphas = np.linspace(30,1,100)
# normalize=True was removed from RidgeCV in recent scikit-learn releases,
# so we standardize X explicitly instead.  With the default cv=None, RidgeCV
# uses efficient leave-one-out CV, and cv_values_ holds the squared LOO error
# of each observation for each alpha (renamed store_cv_results / cv_results_
# in scikit-learn 1.5+).
rcv = RidgeCV(alphas=alphas, store_cv_values=True)
rcv.fit(scale(X), y)
cv_vals = rcv.cv_values_
LOOr = cv_vals.mean(axis=0)  # average over observations: LOO risk per alpha

In [37]:
EMPr = []
for a in alphas:
    ridge.set_params(alpha=a)
    # emp_risk refits the model internally, so no separate fit is needed;
    # use the standardized X for comparability with the LOO computation above.
    EMPr.append(emp_risk(scale(X), y, ridge))

In [43]:
plt.plot(alphas,LOOr)
plt.xlabel('lambda')
plt.ylabel('Risk')
plt.title('LOO Risk for Ridge');
plt.show()



In [44]:
plt.plot(alphas,EMPr)
plt.xlabel('lambda')
plt.ylabel('Risk')
plt.title('Emp Risk for Ridge');
plt.show()
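
The lambda minimizing the LOO risk can also be read directly off the fitted RidgeCV object; a small addition to the solution above:

In [ ]:
# RidgeCV stores the alpha (lambda) that minimized the LOO risk.
print(rcv.alpha_)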


Exercise 2 Implement and test forward stagewise regression (recall that stagewise and stepwise are different).


In [9]:
n,p = X.shape
Xsc = scale(X)
ysc = scale(y)

Unlike stepwise selection, which adds or removes whole predictors and refits the model at each step, stagewise regression only nudges one coefficient at a time without refitting. I'll implement a variant of forward stagewise in which the coefficient of the predictor most correlated with the current residual is updated by adding that correlation to it.


In [35]:
MSEiter = []
res = ysc                 # current residual vector
beta = np.zeros(p)
tol = 1e-2                # stop once the largest correlation falls below tol
corrmax = 1.
while corrmax > tol:
    # correlation of each standardized predictor with the current residual
    res_corr = Xsc.T.dot(scale(res)) / n
    # index and magnitude of the most correlated predictor
    jmax, corrmax = max(enumerate(np.abs(res_corr)), key=lambda x: x[1])
    beta[jmax] = beta[jmax] + res_corr[jmax]
    res = ysc - Xsc.dot(beta)
    MSEiter.append(np.mean(res**2.))  # track the training MSE at each iteration
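
For comparison, the textbook version of forward stagewise takes many tiny steps of a fixed size eps in the sign of the largest correlation, rather than adding the correlation itself. A minimal sketch, with eps and the iteration cap chosen arbitrarily:

In [ ]:
# Classic epsilon-stagewise (sketch): nudge the coefficient of the predictor
# most correlated with the residual by a small fixed amount each iteration.
eps = 0.01                      # step size: arbitrary choice
beta_fs = np.zeros(p)
res_fs = ysc.copy()
for _ in range(2000):           # iteration cap: arbitrary choice
    corr = Xsc.T.dot(res_fs) / n
    j = np.argmax(np.abs(corr))
    beta_fs[j] += eps * np.sign(corr[j])
    res_fs = ysc - Xsc.dot(beta_fs)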

In [34]:
beta


Out[34]:
array([-0.62212321,  0.68773196,  0.03909558, -0.07212508, -0.02225624,
        0.27844298, -0.09305252, -0.42239296,  0.        ,  0.        ,
        0.86104284,  0.5182802 , -0.44004932,  0.16924078,  0.09478295,
       -0.04392054,  0.06021227, -0.13394125, -0.01738431])

In [38]:
lm = LinearRegression()
lm.fit(Xsc,ysc)


Out[38]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [39]:
lm.coef_


Out[39]:
array([-0.64650293,  0.75030052,  0.08407102, -0.13452771, -0.05995418,
        0.29999074, -0.03707491, -0.86847257,  0.19252781, -0.03149673,
        1.06770954,  0.57897605, -0.47504143,  0.17492395,  0.11933652,
       -0.0492179 ,  0.06940156, -0.12973401, -0.02742594])
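
The stagewise coefficients track the least squares fit closely, but the heavily correlated career totals (e.g. CAtBat, CHits, CHmRun) are shrunk toward or left at zero; stopping the stagewise updates early acts as a form of regularization, much as ridge does.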

In [ ]: