Concepts and data from "An Introduction to Statistical Learning, with applications in R" (Springer, 2013), with permission from the authors: G. James, D. Witten, T. Hastie and R. Tibshirani, available at www.StatLearning.com.
For Tables reference see http://data8.org/datascience/tables.html
In [1]:
    
# HIDDEN
# This useful nonsense should just go at the top of your notebook.
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
from sklearn import linear_model
plots.style.use('fivethirtyeight')
plots.rc('lines', linewidth=1, color='r')
from ipywidgets import interact, interactive, fixed
import ipywidgets as widgets
# datascience version number of last run of this notebook
version.__version__
import sys
sys.path.append("..")
from ml_table import ML_Table
import locale
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
    
    Out[1]:
In [2]:
    
# Read the Auto data, drop the rows where horsepower is the '?' missing-value
# placeholder, and convert the remaining horsepower strings to ints.
raw_auto = ML_Table.read_table("data/Auto.csv")
auto = raw_auto.where(raw_auto['horsepower'] != '?')
auto['horsepower'] = auto.apply(int, 'horsepower')
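    
A quick sanity check, added here and not part of the original notebook: the cleaned column should now be numeric, and we can count the dropped rows.
In [ ]:
    
# Added check: horsepower is now an integer column and the '?' rows are gone.
auto['horsepower'].dtype, raw_auto.num_rows - auto.num_rows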
    
In [3]:
    
auto
    
    Out[3]:
In [4]:
    
auto_poly = auto.poly('mpg', 'horsepower', 2)
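    
For reference, auto.poly('mpg', 'horsepower', 2) fits a degree-2 polynomial in horsepower, i.e. a model of the form

$$\widehat{\text{mpg}} = \hat{\beta}_0 + \hat{\beta}_1\,\text{horsepower} + \hat{\beta}_2\,\text{horsepower}^2.$$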
    
In [5]:
    
# Training MSE of the quadratic fit: the model is evaluated
# on the same data it was fit to.
auto.MSE_model('mpg', auto_poly, 'horsepower')
    
    Out[5]:
In [6]:
    
# Training MSE as a function of polynomial degree; since the models are
# nested, training error tends to fall as the degree grows.
Table().with_columns([
    ('degree', range(1, 10)),
    ('Training error', [auto.MSE_model('mpg', auto.poly('mpg', 'horsepower', degree), 'horsepower')
                        for degree in range(1, 10)])
]).plot('degree', height=3, width=4)
    
    
In [7]:
    
def poly_cross(tbl, output_label, input_label, max_degree):
    # Validation set approach: fit on a random half of the rows and report
    # the test MSE on the other half, for degrees 1 .. max_degree-1.
    train, test = tbl.split(tbl.num_rows//2)
    return [test.MSE_model(output_label, train.poly(output_label, input_label, deg), input_label)
            for deg in range(1, max_degree)]
    
In [8]:
    
poly_cross(auto, 'mpg', 'horsepower', 10)
    
    Out[8]:
In [9]:
    
# Repeat the random half-and-half split ten times to see how much the
# validation-set error curve varies from split to split.
auto_poly = Table().with_column('degree', range(1, 10))
for k in range(10):
    auto_poly = auto_poly.with_column("run " + str(k), poly_cross(auto, 'mpg', 'horsepower', 10))
auto_poly
    
    Out[9]:
In [68]:
    
auto_poly.plot('degree')
plots.ylim(10,30)
    
    Out[68]:
    
In [23]:
    
auto.take(13)
    
    Out[23]:
In [36]:
    
def LOOCV_poly(tbl, output_label, input_labels, degree):
    # Leave-one-out cross-validation: for each row i, fit the polynomial on
    # the other n-1 rows and compute the squared error on row i alone.
    n = tbl.num_rows
    def split(i):
        return tbl.exclude(i), tbl.take(i)
    MSEs = [test.MSE_model(output_label,
                           train.poly(output_label, input_labels, degree),
                           input_labels)
            for train, test in [split(i) for i in range(n)]]
    return np.sum(MSEs)/n
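    
This is the LOOCV estimate of ISLR equation (5.1),

$$\mathrm{CV}_{(n)} = \frac{1}{n}\sum_{i=1}^{n} \mathrm{MSE}_i, \qquad \mathrm{MSE}_i = (y_i - \hat{y}_i)^2,$$

where the prediction for row i comes from the model fit without row i.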
    
In [37]:
    
LOOCV_poly(auto, 'mpg', 'horsepower', 1)
    
    Out[37]:
In [38]:
    
# This takes a while: 392 rows * 9 degrees = 3,528 polynomial fits
auto_poly_loocv = Table().with_column('degree', range(1,10))
auto_poly_loocv['LOOCV'] = [LOOCV_poly(auto, 'mpg', 'horsepower', deg) for deg in auto_poly_loocv['degree'] ]
auto_poly_loocv
    
    Out[38]:
In [70]:
    
auto_poly_loocv.plot('degree')
    
    
In [40]:
    
auto.num_rows
    
    Out[40]:
In [41]:
    
auto.take(range(9,13))
    
    Out[41]:
In [63]:
    
def k_split(tbl, i, k):
    # Return (train, test), where test is the i-th of k contiguous folds.
    # Any remainder rows beyond k*(n//k) never appear in a test fold.
    n = tbl.num_rows
    nk = n//k
    fold = range(i*nk, (i+1)*nk)
    return tbl.exclude(fold), tbl.take(fold)

def k_fold_poly(tbl, output_label, input_labels, degree, k):
    # k-fold cross-validation: average the test MSE over the k held-out folds.
    MSEs = [test.MSE_model(output_label,
                           train.poly(output_label, input_labels, degree),
                           input_labels)
            for train, test in [k_split(tbl, i, k) for i in range(k)]]
    return np.sum(MSEs)/k
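    
This is the k-fold CV estimate of ISLR equation (5.3),

$$\mathrm{CV}_{(k)} = \frac{1}{k}\sum_{i=1}^{k} \mathrm{MSE}_i,$$

where MSE_i is the test MSE on the i-th held-out fold. LOOCV is the special case k = n.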
    
In [64]:
    
k_split(auto, 0, 10)
    
    Out[64]:
In [65]:
    
k_fold_poly(auto, 'mpg', 'horsepower', 1, 10)
    
    Out[65]:
In [66]:
    
auto_poly_k_fold = Table().with_column('degree', range(1,10))
auto_poly_k_fold['k fold'] = [k_fold_poly(auto, 'mpg', 'horsepower', deg, 10) for deg in auto_poly_k_fold['degree'] ]
auto_poly_k_fold
    
    Out[66]:
In [67]:
    
auto_poly_k_fold.plot('degree')
    
    
In [100]:
    
n = 400
eps = 0.1
# Synthetic two-class data: the latent inputs (ix, iy) determine the true
# class by the sign of ix + iy; the observed features (x, y) are the latent
# inputs plus Gaussian noise of scale eps.
test2 = ML_Table.runiform('ix', n)
test2['iy'] = np.random.rand(n)
test2['Cat'] = test2.apply(lambda x, y: 'A' if x + y < 0 else 'B', ['ix', 'iy'])
test2['Class A'] = test2.apply(lambda x: 1 if x == 'A' else 0, 'Cat')
test2['x'] = test2['ix'] + eps*np.random.normal(size=n)
test2['y'] = test2['iy'] + eps*np.random.normal(size=n)
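    
To see what the classifiers face, here is a quick scatter of the noisy features colored by true class. This cell is an addition, using plain matplotlib rather than ML_Table's plotting helpers.
In [ ]:
    
# Added sketch: scatter the observed (noisy) features by true category.
for cat, color in [('A', 'C0'), ('B', 'C1')]:
    pts = test2.where('Cat', cat)
    plots.scatter(pts['x'], pts['y'], color=color, label=cat, s=10)
plots.legend()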
    
In [101]:
    
# Fit a logistic regression on the full data set. The classification error
# reported here is training error: we test on the data we trained on.
logit2d = test2.logit_regression('Class A', ['x', 'y'])
model_2d = logit2d.model
ax = test2.plot_cut_2d('Class A', 'x', 'y', model_2d, n_grid=50)
test2.classification_error_model('Class A', model_2d, ['x', 'y'])
    
    Out[101]:
    
In [102]:
    
# Validation set approach: train on a random half, test on the other half.
train, test = test2.split(n//2)
classifier = train.logit_regression('Class A', ['x', 'y'])
ax = test.plot_cut_2d('Class A', 'x', 'y', classifier.model, n_grid=50)
test.classification_error_model('Class A', classifier.model, ['x', 'y'])
    
    Out[102]:
    
In [103]:
    
# The same approach with a 5-nearest-neighbor classifier (fresh random split).
train, test = test2.split(n//2)
classifier = train.knn_regression('Class A', ['x', 'y'], n_neighbors=5)
ax = test.plot_cut_2d('Class A', 'x', 'y', classifier.model, n_grid=50)
test.classification_error_model('Class A', classifier.model, ['x', 'y'])
    
    Out[103]:
    
In [104]:
    
# And with linear discriminant analysis (again on a fresh random split).
train, test = test2.split(n//2)
classifier = train.LDA('Class A', ['x', 'y'])
ax = test.plot_cut_2d('Class A', 'x', 'y', classifier.model, n_grid=50)
test.classification_error_model('Class A', classifier.model, ['x', 'y'])
    
    Out[104]:
    
In [116]:
    
# k_split is as defined above (In [63]); the new helpers make k-fold CV
# generic over any ML_Table classifier method.
def k_error(i, k, tbl, classifier, output_label, input_labels, **kwargs):
    # Classification error on the i-th held-out fold. `classifier` is an
    # unbound ML_Table method, e.g. ML_Table.LDA or ML_Table.logit_regression.
    train, test = k_split(tbl, i, k)
    return test.classification_error_model(output_label,
                                           classifier(train, output_label, input_labels, **kwargs).model,
                                           input_labels)

def k_fold(k, tbl, classifier, output_label, input_labels, **kwargs):
    # The list of per-fold test errors for k-fold cross-validation.
    return [k_error(i, k, tbl, classifier, output_label, input_labels, **kwargs) for i in range(k)]
    
In [119]:
    
k_error(0, 10, test2, ML_Table.LDA, 'Class A', ['x', 'y'])
    
    Out[119]:
In [120]:
    
k_fold(10, test2, ML_Table.logit_regression, 'Class A', ['x', 'y'])
    
    Out[120]:
In [121]:
    
k_fold(10, test2, ML_Table.knn_regression, 'Class A', ['x', 'y'])
    
    Out[121]:
In [122]:
    
k_fold(10, test2, ML_Table.LDA, 'Class A', ['x', 'y'])
    
    Out[122]:
In [123]:
    
# Mean 10-fold CV error for each of the three classifiers.
[np.mean(k_fold(10, test2, ML_Table.logit_regression, 'Class A', ['x', 'y'])),
 np.mean(k_fold(10, test2, ML_Table.knn_regression, 'Class A', ['x', 'y'])),
 np.mean(k_fold(10, test2, ML_Table.LDA, 'Class A', ['x', 'y']))]
    
    Out[123]:
In [138]:
    
def boot(tbl, classifier, output_label, input_labels, **kwargs):
    # Draw a bootstrap resample, fit the classifier to it, and score it on
    # that same resample. This summarizes training error on the resample,
    # not out-of-bag error, so it tends to be optimistic.
    test = tbl.sample(tbl.num_rows, with_replacement=True)
    return test.classification_error_model(output_label,
                                           classifier(test, output_label, input_labels, **kwargs).model,
                                           input_labels)

def bootstrap_classifier(k, tbl, classifier, output_label, input_labels, **kwargs):
    # Repeat the bootstrap k times, returning the list of error estimates.
    return [boot(tbl, classifier, output_label, input_labels, **kwargs) for i in range(k)]
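    
A common alternative scores each model on the out-of-bag rows that the resample never drew. The sketch below is an addition (the helper name boot_oob is ours), relying only on Table.take, Table.exclude, and classification_error_model as used above.
In [ ]:
    
# Added sketch: out-of-bag variant of boot. Train on the bootstrap
# resample; score on the rows that were never drawn into it.
def boot_oob(tbl, classifier, output_label, input_labels, **kwargs):
    n = tbl.num_rows
    idx = np.random.choice(n, n, replace=True)
    train = tbl.take(idx)
    oob = tbl.exclude(list(set(idx)))    # rows absent from the resample
    model = classifier(train, output_label, input_labels, **kwargs).model
    return oob.classification_error_model(output_label, model, input_labels)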
    
In [139]:
    
boot(test2, ML_Table.logit_regression, 'Class A', ['x', 'y'])
    
    Out[139]:
In [142]:
    
# Bootstrap error distributions for the three classifiers, 100 resamples each.
test2_classifiers = Table()
test2_classifiers['Logit Error'] = bootstrap_classifier(100, test2, ML_Table.logit_regression, 'Class A', ['x', 'y'])
test2_classifiers['LDA Error'] = bootstrap_classifier(100, test2, ML_Table.LDA, 'Class A', ['x', 'y'])
test2_classifiers['KNN Error'] = bootstrap_classifier(100, test2, ML_Table.knn_regression, 'Class A', ['x', 'y'])
test2_classifiers.hist()
    
    
In [143]:
    
test2_classifiers.stats(ops=[min, np.mean, np.median, max])
    
    Out[143]: