Modified from the GitHub repo https://github.com/JWarmenhoven/ISLR-python, which is based on *An Introduction to Statistical Learning* by James et al.
In [2]:
# %load ../standard_import.txt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LinearRegression, lars_path, Lasso, LassoCV
%matplotlib inline
In [3]:
n = 100   # observations
p = 1000  # features (p >> n, the high-dimensional regime)
X = np.random.randn(n, p)
X = scale(X)  # center each column and give it unit variance
In [74]:
sprob = 0.02                      # each feature is active independently with probability 0.02
Sbool = np.random.rand(p) < sprob
s = np.sum(Sbool)                 # size of the true support (about p * sprob = 20)
print("Number of non-zeros: {}".format(s))
In [75]:
mu = 100.
beta = np.zeros(p)
beta[Sbool] = mu * np.random.randn(s)  # large N(0, mu^2) signals on the support
In [76]:
eps = np.random.randn(n)
y = X.dot(beta) + eps  # standard Gaussian noise
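This draws from the sparse linear model $y = X\beta + \epsilon$ with $\epsilon \sim N(0, I_n)$: only the $s$ coordinates in the support of $\beta$ carry signal, and with $\mu = 100$ that signal is strong relative to the unit-variance noise.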
In [77]:
larper = lars_path(X, y, method="lasso")
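For reference, `lars_path` returns a tuple of the penalty sequence, the order in which variables enter the active set, and the coefficient paths (one row per feature). A quick check of the shapes:
In [ ]:
alphas_path, active, coefs = larper
print(alphas_path.shape, coefs.shape)  # one column of coefs per penalty value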
In [81]:
S = set(np.where(Sbool)[0])  # indices of the truly nonzero coefficients
In [92]:
for j in S:
    _ = plt.plot(larper[0], larper[2][j, :], 'r')
for j in set(range(p)) - S:
    _ = plt.plot(larper[0], larper[2][j, :], 'k', linewidth=.5)
_ = plt.title('Lasso path for simulated data')
_ = plt.xlabel('lambda')
_ = plt.ylabel('Coef')
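In the resulting plot, the red curves are the coefficient paths of the truly nonzero variables and the thin black curves belong to the noise variables; with a signal this strong, the red paths should separate from the black ones as the penalty decreases.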
Let's load the dataset from the previous lab.
In [2]:
# In R, I exported the dataset from package 'ISLR' to a csv file.
df = pd.read_csv('Data/Hitters.csv', index_col=0).dropna()  # drop rows with missing values (the unrecorded salaries)
df.index.name = 'Player'
df.info()
In [3]:
df.head()
Out[3]:
In [4]:
dummies = pd.get_dummies(df[['League', 'Division', 'NewLeague']])
dummies.info()
print(dummies.head())
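Each of these two-level factors yields a redundant pair of indicator columns, so only one column per factor (`League_N`, `Division_W`, `NewLeague_N`) is kept when assembling the feature set below.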
In [5]:
y = df.Salary
# Drop the column with the response variable (Salary), and columns for which we created dummy variables
X_ = df.drop(['Salary', 'League', 'Division', 'NewLeague'], axis=1).astype('float64')
# Define the feature set X.
X = pd.concat([X_, dummies[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)
X.info()
In [6]:
X.head(5)
Out[6]:
Exercise: Compare the previous methods to the Lasso on this dataset. Tune $\lambda$ and compare the LOO risk to that of the other methods (ridge regression, forward selection, etc.). One possible starting point is sketched below.
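A minimal sketch of one way to begin, assuming a log-spaced penalty grid (the grid bounds and `max_iter` are illustrative choices, not part of the lab): tune the Lasso by leave-one-out cross-validation with `LassoCV`, then compute the matching LOO risk for ridge on the same grid so the two are directly comparable.
In [ ]:
# A sketch only: the alpha grid and max_iter below are illustrative
# assumptions, not the lab's prescribed solution.
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

Xs = scale(X.astype('float64'))  # the Lasso penalty is scale-sensitive, so standardize
alphas = np.logspace(-2, 4, 50)

lassocv = LassoCV(alphas=alphas, cv=LeaveOneOut(), max_iter=10000)
lassocv.fit(Xs, y)
# mse_path_ has one column per left-out observation; averaging over
# columns gives the LOO risk at each alpha.
lasso_loo = lassocv.mse_path_.mean(axis=1).min()
print('Lasso: alpha = {:.3f}, LOO MSE = {:.0f}'.format(lassocv.alpha_, lasso_loo))

# The same LOO risk for ridge over the same grid, for comparison.
ridge_loo = min(
    -cross_val_score(Ridge(alpha=a), Xs, y, cv=LeaveOneOut(),
                     scoring='neg_mean_squared_error').mean()
    for a in alphas)
print('Ridge: best LOO MSE = {:.0f}'.format(ridge_loo))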