Modified from the github repo: https://github.com/JWarmenhoven/ISLR-python which is based on the book by James et al. Intro to Statistical Learning.
In [2]:
# %load ../standard_import.txt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import LeaveOneOut
%matplotlib inline
plt.style.use('ggplot')
datafolder = "../data/"
def loo_risk(X, y, regmod):
    """
    Construct the leave-one-out square error risk for a regression model
    Input: design matrix, X, response vector, y, a regression model, regmod
    Output: scalar LOO risk
    """
    loo = LeaveOneOut()
    loo_losses = []
    for train_index, test_index in loo.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        regmod.fit(X_train, y_train)
        y_hat = regmod.predict(X_test)
        loss = np.sum((y_hat - y_test)**2)
        loo_losses.append(loss)
    return np.mean(loo_losses)

def emp_risk(X, y, regmod):
    """
    Return the empirical risk for square error loss
    Input: design matrix, X, response vector, y, a regression model, regmod
    Output: scalar empirical risk
    """
    regmod.fit(X, y)
    y_hat = regmod.predict(X)
    return np.mean((y_hat - y)**2)
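As a quick sanity check, these helpers can be run on a small synthetic regression problem; this is a minimal sketch (the data here are made up for illustration and are not part of the Hitters analysis):
rng = np.random.RandomState(0)
X_demo = rng.randn(50, 3)                            # 50 samples, 3 features
y_demo = X_demo.dot([1.0, -2.0, 0.5]) + rng.randn(50)
lin = LinearRegression()
print(loo_risk(X_demo, y_demo, lin))                 # out-of-sample (LOO) risk...
print(emp_risk(X_demo, y_demo, lin))                 # ...should exceed the in-sample empirical risk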
In [3]:
# In R, I exported the dataset from package 'ISLR' to a csv file.
df = pd.read_csv(datafolder+'Hitters.csv', index_col=0).dropna()
df.index.name = 'Player'
df.info()
In [4]:
df.head()
Out[4]:
In [5]:
dummies = pd.get_dummies(df[['League', 'Division', 'NewLeague']])
dummies.info()
print(dummies.head())
In [6]:
y = df.Salary
# Drop the column with the dependent variable (Salary), and the columns for which we created dummy variables
X_ = df.drop(['Salary', 'League', 'Division', 'NewLeague'], axis=1).astype('float64')
# Define the feature set X.
X = pd.concat([X_, dummies[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)
X.info()
In [7]:
X.head(5)
Out[7]:
In [8]:
alphas = 10**np.linspace(10, -2, 100)*0.5
ridge = Ridge()
coefs = []
for a in alphas:
    ridge.set_params(alpha=a)
    ridge.fit(scale(X), y)
    coefs.append(ridge.coef_)

ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
plt.axis('tight')
plt.xlabel('lambda')
plt.ylabel('weights')
plt.title('Ridge coefficients as a function of the regularization');
The plot above shows that the ridge coefficients grow in magnitude as lambda decreases: a smaller penalty means less shrinkage toward zero.
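For reference, this is the standard ridge objective and its closed-form solution (a well-known result, not derived in this notebook; sklearn's `alpha` plays the role of $\lambda$):

$$\hat{\beta}_{\text{ridge}} = \arg\min_{\beta}\ \|y - X\beta\|_2^2 + \lambda \|\beta\|_2^2 = (X^\top X + \lambda I)^{-1} X^\top y,$$

so as $\lambda \to 0$ the estimate approaches the least-squares fit, and as $\lambda \to \infty$ every coefficient is shrunk toward zero.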
Exercise 1 Plot the LOO risk and the empirical risk as a function of lambda.
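One direct (if slow) way to get the LOO curve is to reuse the loo_risk helper defined at the top; the sketch below shows that approach (the lambda grid and the use of scale here are my own choices for illustration), whereas the notebook's own solution below uses RidgeCV's built-in LOO values:
lam_grid = np.linspace(30, 1, 100)
Xs, yv = scale(X), y.values
loo_grid = [loo_risk(Xs, yv, Ridge(alpha=a)) for a in lam_grid]
plt.plot(lam_grid, loo_grid)
plt.xlabel('lambda')
plt.ylabel('LOO risk');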
In [34]:
alphas = np.linspace(30, 1, 100)
rcv = RidgeCV(alphas=alphas, store_cv_values=True, normalize=True)
rcv.fit(X, y)
cv_vals = rcv.cv_values_       # squared LOO errors, shape (n_samples, n_alphas)
LOOr = cv_vals.mean(axis=0)    # average over samples gives the LOO risk per alpha
In [37]:
EMPr = []
for a in alphas:
    ridge.set_params(alpha=a)
    # emp_risk refits the model internally; pass the scaled design so the
    # penalty acts on the same coefficient scale as in the cells above
    EMPr.append(emp_risk(scale(X), y, ridge))
In [43]:
plt.plot(alphas,LOOr)
plt.xlabel('lambda')
plt.ylabel('Risk')
plt.title('LOO Risk for Ridge');
plt.show()
In [44]:
plt.plot(alphas,EMPr)
plt.xlabel('lambda')
plt.ylabel('Risk')
plt.title('Emp Risk for Ridge');
plt.show()
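To see the two curves together (an extra view, not in the cells above), they can be overlaid on a single figure:
plt.plot(alphas, LOOr, label='LOO risk')
plt.plot(alphas, EMPr, label='empirical risk')
plt.xlabel('lambda')
plt.ylabel('Risk')
plt.legend()
plt.title('LOO vs empirical risk for Ridge');
plt.show()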
Exercise 2 Implement and test forward stagewise regression (recall that stagewise and stepwise are different).
In [9]:
n,p = X.shape
Xsc = scale(X)
ysc = scale(y)
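For comparison, the textbook incremental forward stagewise procedure takes a small fixed step in the sign direction of the largest correlation; a minimal sketch using the scaled Xsc and ysc above (the step size eps and iteration count are illustrative assumptions, not values from this notebook):
eps = 0.01       # illustrative step size (assumption)
n_steps = 2000   # illustrative number of iterations (assumption)
beta_fs = np.zeros(p)
res_fs = ysc.copy()
for _ in range(n_steps):
    corr = Xsc.T.dot(res_fs) / n          # correlation of each predictor with the residual
    j = np.argmax(np.abs(corr))           # pick the most correlated predictor
    beta_fs[j] += eps * np.sign(corr[j])  # take a small step in its sign direction
    res_fs = ysc - Xsc.dot(beta_fs)       # refresh the residual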
I'll implement a different variant of forward stagewise: at each step, the correlation between the current residual and the most-correlated predictor is added directly to that predictor's coefficient, rather than taking a small fixed step.
In [35]:
MSEiter = []
res = ysc
beta = np.zeros(p)
tol = 1e-2
corrmax = 1.
while corrmax > tol:
    res_corr = Xsc.T.dot(scale(res)) / n   # correlation of each predictor with the (re-scaled) residual
    jmax, corrmax = max(enumerate(np.abs(res_corr)), key=lambda x: x[1])  # most correlated predictor
    beta[jmax] = beta[jmax] + res_corr[jmax]   # add its correlation to that coefficient
    res = ysc - Xsc.dot(beta)                  # update the residual
    MSE = np.sum(res**2.)                      # track the sum of squared residuals
    MSEiter.append(MSE)
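A quick way to check that the iterations are converging is to plot the recorded residual trace (an extra diagnostic, not part of the exercise itself):
plt.plot(MSEiter)
plt.xlabel('iteration')
plt.ylabel('sum of squared residuals')
plt.title('Forward stagewise convergence');
plt.show()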
In [34]:
beta
Out[34]:
In [38]:
lm = LinearRegression()
lm.fit(Xsc,ysc)
Out[38]:
In [39]:
lm.coef_
Out[39]:
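To compare the stagewise solution with the least-squares fit more directly (an extra check), the two coefficient vectors can be placed side by side:
coef_compare = pd.DataFrame({'stagewise': beta, 'OLS': lm.coef_}, index=X.columns)
print(coef_compare)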