Chapter 5: Cross-Validation and Bootstrap



In [1]:

    
from __future__ import division
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import LeaveOneOut
from sklearn.cross_validation import KFold
from sklearn.cross_validation import Bootstrap
from sklearn.metrics import mean_squared_error
%matplotlib inline



In [2]:

    
auto_df = pd.read_csv("../data/Auto.csv", na_values="?")
auto_df.dropna(inplace=True)
auto_df.head()









    Out[2]:






  
    
      
      mpg
      cylinders
      displacement
      horsepower
      weight
      acceleration
      year
      origin
      name
    
  
  
    
      0
       18
       8
       307
       130
       3504
       12.0
       70
       1
       chevrolet chevelle malibu
    
    
      1
       15
       8
       350
       165
       3693
       11.5
       70
       1
               buick skylark 320
    
    
      2
       18
       8
       318
       150
       3436
       11.0
       70
       1
              plymouth satellite
    
    
      3
       16
       8
       304
       150
       3433
       12.0
       70
       1
                   amc rebel sst
    
    
      4
       17
       8
       302
       140
       3449
       10.5
       70
       1
                     ford torino
    
  

5 rows × 9 columns



In [3]:

    
ax = auto_df.plot(x="horsepower", y="mpg", style="o")
ax.set_ylabel("mpg")









    Out[3]:





<matplotlib.text.Text at 0x57dccd0>

Leave One Out Cross Validation (LOOCV)

Instead of R's glm, we use Scikit-Learn's LinearRegression to arrive at very similar results.



In [4]:

    
clf = LinearRegression()
loo = LeaveOneOut(len(auto_df))
X = auto_df[["horsepower"]].values
y = auto_df["mpg"].values
n = np.shape(X)[0]
mses = []
for train, test in loo:
    Xtrain, ytrain, Xtest, ytest = X[train], y[train], X[test], y[test]
    clf.fit(Xtrain, ytrain)
    ypred = clf.predict(Xtest)
    mses.append(mean_squared_error(ytest, ypred))
np.mean(mses)









    Out[4]:





24.231513517929265



In [18]:

    
def loo_shortcut(X, y):
    """ implement one-pass LOOCV calculation for linear models from ISLR Page 180 (Eqn 5.2) """
    clf = LinearRegression()
    clf.fit(X, y)
    ypred = clf.predict(X)
    xbar = np.mean(X, axis=0)
    xsum = np.sum(np.power(X - xbar, 2))
    nrows = np.shape(X)[0]
    mses = []
    for row in range(0, nrows):
        hi = (1 / nrows) + (np.sum(X[row] - xbar) ** 2 / xsum)
        mse = (y[row] - ypred[row]) ** 2 / (1 - hi)
        mses.append(mse)
    return np.mean(mses)

loo_shortcut(auto_df[["horsepower"]].values, auto_df["mpg"].values)









    Out[18]:





24.086699242104974



In [6]:

    
# LOOCV against models of different degrees
auto_df["horsepower^2"] = auto_df["horsepower"] * auto_df["horsepower"]
auto_df["horsepower^3"] = auto_df["horsepower^2"] * auto_df["horsepower"]
auto_df["horsepower^4"] = auto_df["horsepower^3"] * auto_df["horsepower"]
auto_df["horsepower^5"] = auto_df["horsepower^4"] * auto_df["horsepower"]
auto_df["unit"] = 1
colnames = ["unit", "horsepower", "horsepower^2", "horsepower^3", "horsepower^4", "horsepower^5"]
cv_errors = []
for ncols in range(2, 6):
    X = auto_df[colnames[0:ncols]]
    y = auto_df["mpg"]
    clf = LinearRegression()
    clf.fit(X, y)
    cv_errors.append(loo_shortcut(X.values, y.values))
plt.plot(range(1,5), cv_errors)
plt.xlabel("degree")
plt.ylabel("cv.error")









    Out[6]:





<matplotlib.text.Text at 0x5d52b50>

K-Fold Cross Validation



In [7]:

    
cv_errors = []
for ncols in range(2, 6):
    # each ncol corresponds to a polynomial model
    X = auto_df[colnames[0:ncols]].values
    y = auto_df["mpg"].values
    kfold = KFold(len(auto_df), n_folds=10)
    mses = []
    for train, test in kfold:
        # each model is cross validated 10 times
        Xtrain, ytrain, Xtest, ytest = X[train], y[train], X[test], y[test]   
        clf = LinearRegression()
        clf.fit(X, y)
        ypred = clf.predict(Xtest)
        mses.append(mean_squared_error(ypred, ytest))
    cv_errors.append(np.mean(mses))
plt.plot(range(1,5), cv_errors)
plt.xlabel("degree")
plt.ylabel("cv.error")









    Out[7]:





<matplotlib.text.Text at 0x5f7c810>

Bootstrap



In [8]:

    
cv_errors = []
for ncols in range(2, 6):
    # each ncol corresponds to a polynomial model
    X = auto_df[colnames[0:ncols]].values
    y = auto_df["mpg"].values
    n = len(auto_df)
    bs = Bootstrap(n, train_size=int(0.9*n), test_size=int(0.1*n), n_iter=10, random_state=0)
    mses = []
    for train, test in bs:
        # each model is resampled 10 times
        Xtrain, ytrain, Xtest, ytest = X[train], y[train], X[test], y[test]   
        clf = LinearRegression()
        clf.fit(X, y)
        ypred = clf.predict(Xtest)
        mses.append(mean_squared_error(ypred, ytest))
    cv_errors.append(np.mean(mses))
plt.plot(range(1,5), cv_errors)
plt.xlabel("degree")
plt.ylabel("cv.error")









    Out[8]:





<matplotlib.text.Text at 0x5f9f810>



In [9]:

    
def alpha(x, y):
    """ allocate alpha of your assets to x and (1-alpha) to y for optimum """
    vx = np.var(x)
    vy = np.var(y)
    cxy = np.cov(x, y)
    return ((vy - cxy) / (vx + vy - 2 * cxy))[0, 1]

# From ISLR package, retrieved with write.csv(Portfolio, "portfolio.csv", row.names=FALSE)
portfolio_df = pd.read_csv("../data/Portfolio.csv")
portfolio_df.head()









    Out[9]:






  
    
      
      X
      Y
    
  
  
    
      0
      -0.895251
      -0.234924
    
    
      1
      -1.562454
      -0.885176
    
    
      2
      -0.417090
       0.271888
    
    
      3
       1.044356
      -0.734198
    
    
      4
      -0.315568
       0.841983
    
  

5 rows × 2 columns



In [10]:

    
alpha(portfolio_df["X"].values, portfolio_df["Y"].values)









    Out[10]:





0.57665115161041181



In [11]:

    
# Find the variance of alpha - shows that bootstrapping results in a near-normal distribution
X = portfolio_df["X"].values
Y = portfolio_df["Y"].values
bs = Bootstrap(len(portfolio_df), n_iter=1000, train_size=99, random_state=0)
alphas = []
for train, test in bs:
    xtrain, ytrain = X[train], Y[train]
    alphas.append(alpha(xtrain, ytrain))
plt.hist(alphas)









    Out[11]:





(array([   1.,    4.,   27.,  101.,  214.,  265.,  231.,  111.,   30.,   16.]),
 array([ 0.23030675,  0.2930067 ,  0.35570664,  0.41840658,  0.48110652,
        0.54380646,  0.6065064 ,  0.66920634,  0.73190628,  0.79460622,
        0.85730616]),
 <a list of 10 Patch objects>)

	mpg	cylinders	displacement	horsepower	weight	acceleration	year	origin	name
0	18	8	307	130	3504	12.0	70	1	chevrolet chevelle malibu
1	15	8	350	165	3693	11.5	70	1	buick skylark 320
2	18	8	318	150	3436	11.0	70	1	plymouth satellite
3	16	8	304	150	3433	12.0	70	1	amc rebel sst
4	17	8	302	140	3449	10.5	70	1	ford torino

	X	Y
0	-0.895251	-0.234924
1	-1.562454	-0.885176
2	-0.417090	0.271888
3	1.044356	-0.734198
4	-0.315568	0.841983