Chapter 5 - Resampling methods

Concepts and data from "An Introduction to Statistical Learning, with applications in R" (Springer, 2013) with permission from the authors: G. James, D. Witten, T. Hastie and R. Tibshirani, available at www.StatLearning.com.

For Tables reference see http://data8.org/datascience/tables.html


In [1]:
# HIDDEN
# For Tables reference see http://data8.org/datascience/tables.html
# This useful nonsense should just go at the top of your notebook.
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
from sklearn import linear_model
plots.style.use('fivethirtyeight')
plots.rc('lines', linewidth=1, color='r')
from ipywidgets import interact, interactive, fixed
import ipywidgets as widgets
# datascience version number of last run of this notebook
version.__version__


import sys
sys.path.append("..")
from ml_table import ML_Table

import locale
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')


Out[1]:
'en_US.UTF-8'

5.1 Cross validation

5.1.1 The Validation Set Approach

Train on a random half of the data and test on the other half; the MSE on the held-out half estimates the test error.
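
For reference, the same idea can be sketched directly with pandas and scikit-learn (a minimal sketch of our own, not the approach used below; the rest of this notebook works through ML_Table instead):

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the Auto data, dropping rows with missing horsepower.
df = pd.read_csv("data/Auto.csv")
df = df[df["horsepower"] != "?"].astype({"horsepower": int})

# Train on a random half, test on the other half.
X_train, X_test, y_train, y_test = train_test_split(
    df[["horsepower"]].values, df["mpg"].values, test_size=0.5)

# Fit a degree-2 polynomial on the training half; score the test half.
poly = PolynomialFeatures(degree=2)
model = LinearRegression().fit(poly.fit_transform(X_train), y_train)
test_mse = mean_squared_error(y_test, model.predict(poly.transform(X_test)))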


In [2]:
# Drop rows with missing horsepower ('?') and convert the column to int.
raw_auto = ML_Table.read_table("data/Auto.csv")
auto = raw_auto.where(raw_auto['horsepower'] != '?')
auto['horsepower'] = auto.apply(int, 'horsepower')

In [3]:
auto


Out[3]:
mpg cylinders displacement horsepower weight acceleration year origin name
18 8 307 130 3,504 12 70 1 chevrolet chevelle malibu
15 8 350 165 3,693 11.5 70 1 buick skylark 320
18 8 318 150 3,436 11 70 1 plymouth satellite
16 8 304 150 3,433 12 70 1 amc rebel sst
17 8 302 140 3,449 10.5 70 1 ford torino
15 8 429 198 4,341 10 70 1 ford galaxie 500
14 8 454 220 4,354 9 70 1 chevrolet impala
14 8 440 215 4,312 8.5 70 1 plymouth fury iii
14 8 455 225 4,425 10 70 1 pontiac catalina
15 8 390 190 3,850 8.5 70 1 amc ambassador dpl

... (382 rows omitted)


In [4]:
auto_poly = auto.poly('mpg', 'horsepower', 2)

In [5]:
# training error

auto.MSE_model('mpg', auto_poly, 'horsepower')


Out[5]:
18.984768907617219

In [6]:
Table().with_columns([
    ('degree', range(1,10)),
    ('Training error', [auto.MSE_model('mpg', auto.poly('mpg', 'horsepower', degree), 'horsepower') for degree in range(1,10)])
]).plot('degree', height=3, width=4)



In [7]:
def poly_cross(tbl, output_label, input_label, max_degree):
    # Split the table in half, fit polynomials of degree 1..max_degree-1
    # on one half, and report the test MSE on the other half.
    train, test = tbl.split(tbl.num_rows//2)
    return [test.MSE_model(output_label, train.poly(output_label, input_label, deg), input_label)
            for deg in range(1, max_degree)]

In [8]:
poly_cross(auto, 'mpg', 'horsepower', 10)


Out[8]:
[22.153907306205539,
 18.143980839929544,
 18.280279403692781,
 18.194886137328933,
 18.275325687861066,
 18.213835217121328,
 18.017134265766948,
 18.020639985782477,
 18.352172677432851]

In [9]:
auto_poly = Table().with_column('degree', range(1,10))
for k in range(10):
    auto_poly = auto_poly.with_column("run "+str(k), poly_cross(auto, 'mpg', 'horsepower', 10))
auto_poly


Out[9]:
degree run 0 run 1 run 2 run 3 run 4 run 5 run 6 run 7 run 8 run 9
1 24.799 23.466 22.4868 23.7759 25.478 25.8399 24.3811 24.2911 23.5398 22.1828
2 20.075 20.4249 18.1171 17.2199 19.6713 19.1799 19.7684 18.433 21.0931 16.5412
3 20.2118 21.5568 18.0704 17.1147 19.7064 19.2911 19.7174 18.5302 22.0849 16.4889
4 20.2583 21.5555 18.0927 18.941 19.7881 20.202 19.6307 18.8266 22.7447 16.5492
5 19.6832 21.012 18.2573 17.935 19.84 19.3934 19.0186 18.2932 22.5502 15.8957
6 19.6378 20.711 18.323 19.372 19.7009 20.1134 18.8055 18.5826 22.4656 16.142
7 19.5533 20.2027 18.6918 18.5706 19.5829 19.1824 18.5362 18.0202 22.0488 16.2082
8 21.1997 22.8126 19.6871 17.8499 19.8478 23.1287 18.5616 18.076 22.4275 17.3321
9 25.0362 23.2401 20.4802 68.0624 20.1855 24.3317 19.0554 18.1496 22.8569 18.3696

In [68]:
auto_poly.plot('degree')
plots.ylim(10,30)


Out[68]:
(10, 30)

Leave-one-out cross validation - LOOCV

$CV_n = \frac{1}{n} \sum_{i=1}^{n}MSE_i$

where $MSE_i$ is the test error on the $i$-th row when trained on the remaining $n-1$ rows. It is slow: the model must be fit $n$ times.
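
A minimal sketch of this formula using scikit-learn's LeaveOneOut splitter (loocv_mse is a name of our own choosing; the notebook's own ML_Table implementation follows below):

import numpy as np
from sklearn.model_selection import LeaveOneOut
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

def loocv_mse(X, y, degree):
    # CV_n: average the squared error on each held-out row when the
    # model is fit on the remaining n-1 rows.
    poly = PolynomialFeatures(degree=degree)
    errors = []
    for train_idx, test_idx in LeaveOneOut().split(X):
        model = LinearRegression().fit(poly.fit_transform(X[train_idx]), y[train_idx])
        pred = model.predict(poly.transform(X[test_idx]))
        errors.append((y[test_idx][0] - pred[0]) ** 2)
    return np.mean(errors)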


In [23]:
auto.take(13)


Out[23]:
mpg cylinders displacement horsepower weight acceleration year origin name
14 8 455 225 3,086 10 70 1 buick estate wagon (sw)

In [36]:
def LOOCV_poly(tbl, output_label, input_labels, degree):
    # Leave-one-out: for each row i, fit on the other n-1 rows and
    # measure the squared error on row i alone.
    n = tbl.num_rows
    def split(i):
        return tbl.exclude(i), tbl.take(i)
    MSEs = [test.MSE_model(output_label,
                           train.poly(output_label, input_labels, degree),
                           input_labels)
            for train, test in [split(i) for i in range(n)]]
    return np.sum(MSEs)/n

In [37]:
LOOCV_poly(auto, 'mpg', 'horsepower', 1)


Out[37]:
24.231513517929233

In [38]:
# This takes a while with 392 * 9 poly fits
auto_poly_loocv = Table().with_column('degree', range(1,10))
auto_poly_loocv['LOOCV'] = [LOOCV_poly(auto, 'mpg', 'horsepower', deg) for deg in auto_poly_loocv['degree'] ]
auto_poly_loocv


Out[38]:
degree LOOCV
1 24.2315
2 19.2482
3 19.335
4 19.4244
5 19.0332
6 18.9786
7 18.833
8 18.9612
9 19.0686

In [70]:
auto_poly_loocv.plot('degree')



In [40]:
auto.num_rows


Out[40]:
392

k-fold cross validation

$CV_k = \frac{1}{k} \sum_{i=1}^{k}MSE_i$

where the data is broken into $k$ folds and $MSE_i$ is the test error on the $i$-th fold when trained on the other $k-1$ folds.
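
The same estimator can be sketched with scikit-learn's KFold, which (like the k_split helper defined below) holds out consecutive blocks of rows when shuffling is off; kfold_mse is an illustrative name of our own:

import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

def kfold_mse(X, y, degree, k=10):
    # CV_k: average the held-out MSE over k consecutive folds.
    poly = PolynomialFeatures(degree=degree)
    fold_mses = []
    for train_idx, test_idx in KFold(n_splits=k).split(X):
        model = LinearRegression().fit(poly.fit_transform(X[train_idx]), y[train_idx])
        pred = model.predict(poly.transform(X[test_idx]))
        fold_mses.append(np.mean((y[test_idx] - pred) ** 2))
    return np.mean(fold_mses)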


In [41]:
auto.take(range(9,13))


Out[41]:
mpg cylinders displacement horsepower weight acceleration year origin name
15 8 390 190 3,850 8.5 70 1 amc ambassador dpl
15 8 383 170 3,563 10 70 1 dodge challenger se
14 8 340 160 3,609 8 70 1 plymouth 'cuda 340
15 8 400 150 3,761 9.5 70 1 chevrolet monte carlo

In [63]:
def k_split(tbl, i, k):
    # Return (train, test): the i-th of k consecutive folds is held out
    # as the test set; the remaining rows form the training set.
    n = tbl.num_rows
    nk = n//k
    fold = range(i*nk, (i+1)*nk)
    return tbl.exclude(fold), tbl.take(fold)

def k_fold_poly(tbl, output_label, input_labels, degree, k):
    # Average the held-out MSE over the k folds.
    MSEs = [test.MSE_model(output_label,
                           train.poly(output_label, input_labels, degree),
                           input_labels)
            for train, test in [k_split(tbl, i, k) for i in range(k)]]
    return np.sum(MSEs)/k

In [64]:
k_split(auto, 0, 10)


Out[64]:
(mpg  | cylinders | displacement | horsepower | weight | acceleration | year | origin | name
 14   | 8         | 351          | 153        | 4,154  | 13.5         | 71   | 1      | ford galaxie 500
 14   | 8         | 318          | 150        | 4,096  | 13           | 71   | 1      | plymouth fury iii
 12   | 8         | 383          | 180        | 4,955  | 11.5         | 71   | 1      | dodge monaco (sw)
 13   | 8         | 400          | 170        | 4,746  | 12           | 71   | 1      | ford country squire (sw)
 13   | 8         | 400          | 175        | 5,140  | 12           | 71   | 1      | pontiac safari (sw)
 18   | 6         | 258          | 110        | 2,962  | 13.5         | 71   | 1      | amc hornet sportabout (sw)
 22   | 4         | 140          | 72         | 2,408  | 19           | 71   | 1      | chevrolet vega (sw)
 19   | 6         | 250          | 100        | 3,282  | 15           | 71   | 1      | pontiac firebird
 18   | 6         | 250          | 88         | 3,139  | 14.5         | 71   | 1      | ford mustang
 23   | 4         | 122          | 86         | 2,220  | 14           | 71   | 1      | mercury capri 2000
 ... (343 rows omitted),
 mpg  | cylinders | displacement | horsepower | weight | acceleration | year | origin | name
 18   | 8         | 307          | 130        | 3,504  | 12           | 70   | 1      | chevrolet chevelle malibu
 15   | 8         | 350          | 165        | 3,693  | 11.5         | 70   | 1      | buick skylark 320
 18   | 8         | 318          | 150        | 3,436  | 11           | 70   | 1      | plymouth satellite
 16   | 8         | 304          | 150        | 3,433  | 12           | 70   | 1      | amc rebel sst
 17   | 8         | 302          | 140        | 3,449  | 10.5         | 70   | 1      | ford torino
 15   | 8         | 429          | 198        | 4,341  | 10           | 70   | 1      | ford galaxie 500
 14   | 8         | 454          | 220        | 4,354  | 9            | 70   | 1      | chevrolet impala
 14   | 8         | 440          | 215        | 4,312  | 8.5          | 70   | 1      | plymouth fury iii
 14   | 8         | 455          | 225        | 4,425  | 10           | 70   | 1      | pontiac catalina
 15   | 8         | 390          | 190        | 3,850  | 8.5          | 70   | 1      | amc ambassador dpl
 ... (29 rows omitted))

In [65]:
k_fold_poly(auto, 'mpg', 'horsepower', 1, 10)


Out[65]:
27.453742881045894

In [66]:
auto_poly_k_fold = Table().with_column('degree', range(1,10))
auto_poly_k_fold['k fold'] = [k_fold_poly(auto, 'mpg', 'horsepower', deg, 10) for deg in auto_poly_k_fold['degree'] ]
auto_poly_k_fold


Out[66]:
degree k fold
1 27.4537
2 21.2252
3 21.3465
4 21.3772
5 20.943
6 20.8014
7 20.6286
8 20.9376
9 20.8231

In [67]:
auto_poly_k_fold.plot('degree')


Cross validation with classification


In [100]:
n = 400
eps = 0.1
# Synthetic two-class data: the true class is determined by the sign of
# the latent inputs ix + iy; the observed x and y add Gaussian noise.
test2 = ML_Table.runiform('ix', n)
test2['iy'] = np.random.rand(n)
test2['Cat'] = test2.apply(lambda x, y: 'A' if x + y < 0 else 'B', ['ix', 'iy'])
test2['Class A'] = test2.apply(lambda x: 1 if x == 'A' else 0, 'Cat')
test2['x'] = test2['ix'] + eps*np.random.normal(size=n)
test2['y'] = test2['iy'] + eps*np.random.normal(size=n)

In [101]:
logit2d = test2.logit_regression('Class A', ['x', 'y'])
model_2d = logit2d.model
ax = test2.plot_cut_2d('Class A', 'x', 'y', model_2d, n_grid=50)
test2.classification_error_model('Class A', model_2d, ['x', 'y'])


Out[101]:
0.0425

In [102]:
train, test = test2.split(n//2)
classifier = train.logit_regression('Class A', ['x', 'y'])
ax = test.plot_cut_2d('Class A', 'x', 'y', classifier.model, n_grid=50)
test.classification_error_model('Class A', classifier.model, ['x', 'y'])


Out[102]:
0.035

In [103]:
train, test = test2.split(n//2)
classifier = train.knn_regression('Class A', ['x', 'y'], n_neighbors=5)
ax = test.plot_cut_2d('Class A', 'x', 'y', classifier.model, n_grid=50)
test.classification_error_model('Class A', classifier.model, ['x', 'y'])


Out[103]:
0.14

In [104]:
train, test = test2.split(n//2)
classifier = train.LDA('Class A', ['x', 'y'])
ax = test.plot_cut_2d('Class A', 'x', 'y', classifier.model, n_grid=50)
test.classification_error_model('Class A', classifier.model, ['x', 'y'])


Out[104]:
0.045

K-fold cross validation with classification


In [116]:
def k_split(tbl, i, k):
    # As above: hold out the i-th of k consecutive folds as the test set.
    n = tbl.num_rows
    nk = n//k
    fold = range(i*nk, (i+1)*nk)
    return tbl.exclude(fold), tbl.take(fold)

def k_error(i, k, tbl, classifier, output_label, input_labels, **kwargs):
    # Fit the given classifier on all but the i-th fold and return its
    # classification error on the held-out fold.
    train, test = k_split(tbl, i, k)
    return test.classification_error_model(output_label,
                                           classifier(train, output_label, input_labels, **kwargs).model,
                                           input_labels)

def k_fold(k, tbl, classifier, output_label, input_labels, **kwargs):
    # One held-out error estimate per fold.
    return [k_error(i, k, tbl, classifier, output_label, input_labels, **kwargs) for i in range(k)]

In [119]:
k_error(0, 10, test2, ML_Table.LDA, 'Class A', ['x', 'y'])


Out[119]:
0.05

In [120]:
k_fold(10, test2, ML_Table.logit_regression, 'Class A', ['x', 'y'])


Out[120]:
[0.05, 0.05, 0.025, 0.025, 0.075, 0.025, 0.075, 0.05, 0.025, 0.0]

In [121]:
k_fold(10, test2, ML_Table.knn_regression, 'Class A', ['x', 'y'])


Out[121]:
[0.075, 0.175, 0.075, 0.075, 0.1, 0.15, 0.2, 0.15, 0.125, 0.075]

In [122]:
k_fold(10, test2, ML_Table.LDA, 'Class A', ['x', 'y'])


Out[122]:
[0.05, 0.075, 0.025, 0.025, 0.05, 0.0, 0.05, 0.025, 0.0, 0.05]

In [123]:
[np.mean(k_fold(10, test2, ML_Table.logit_regression, 'Class A', ['x', 'y'])),
np.mean(k_fold(10, test2, ML_Table.knn_regression, 'Class A', ['x', 'y'])),
np.mean(k_fold(10, test2, ML_Table.LDA, 'Class A', ['x', 'y']))]


Out[123]:
[0.040000000000000001, 0.12, 0.034999999999999996]

Bootstrap

Using the bootstrap, we can compare the accuracy of our various classifiers: each one is refit on many resamples of the data, yielding a distribution of error estimates rather than a single number.
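
The underlying idea, sketched in plain numpy (bootstrap_se and statistic are illustrative names of our own): resample the rows with replacement, recompute a quantity of interest on each resample, and use the spread of those recomputations as a measure of its variability.

import numpy as np

def bootstrap_se(data, statistic, n_boot=1000):
    # Standard error of a statistic, estimated from n_boot resamples
    # drawn with replacement from the data.
    data = np.asarray(data)
    n = len(data)
    stats = [statistic(data[np.random.randint(0, n, n)]) for _ in range(n_boot)]
    return np.std(stats)

# e.g. the bootstrap standard error of the mean mpg:
# bootstrap_se(auto['mpg'], np.mean)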


In [138]:
def boot(tbl, classifier, output_label, input_labels, **kwargs):
    # Draw a bootstrap resample (same size, with replacement), fit the
    # classifier on it, and measure its error on that same resample.
    test = tbl.sample(tbl.num_rows, with_replacement=True)
    return test.classification_error_model(output_label,
                                           classifier(test, output_label, input_labels, **kwargs).model,
                                           input_labels)

def bootstrap_classifier(k, tbl, classifier, output_label, input_labels, **kwargs):
    # Repeat the bootstrap fit k times to obtain a distribution of errors.
    return [boot(tbl, classifier, output_label, input_labels, **kwargs) for i in range(k)]

In [139]:
boot(test2, ML_Table.logit_regression, 'Class A', ['x', 'y'])


Out[139]:
0.0525

In [142]:
test2_classifiers = Table()
test2_classifiers['Logit Error'] = bootstrap_classifier(100, test2, ML_Table.logit_regression, 'Class A', ['x', 'y'])
test2_classifiers['LDA Error'] = bootstrap_classifier(100, test2, ML_Table.LDA, 'Class A', ['x', 'y'])
test2_classifiers['KNN Error'] = bootstrap_classifier(100, test2, ML_Table.knn_regression, 'Class A', ['x', 'y'])
test2_classifiers.hist()



In [143]:
test2_classifiers.stats(ops=[min, np.mean, np.median, max])


Out[143]:
statistic Logit Error LDA Error KNN Error
min 0.02 0.015 0.04
mean 0.037925 0.034625 0.073375
median 0.0375 0.03375 0.0725
max 0.065 0.0625 0.115
