Concepts and data from "An Introduction to Statistical Learning, with applications in R" (Springer, 2013) with permission from the authors: G. James, D. Witten, T. Hastie and R. Tibshirani, available at www.StatLearning.com.
For Tables reference see http://data8.org/datascience/tables.html
In [1]:
# HIDDEN
# Standard setup boilerplate; it goes at the top of the notebook.
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
from sklearn import linear_model
plots.style.use('fivethirtyeight')
plots.rc('lines', linewidth=1, color='r')
from ipywidgets import interact, interactive, fixed
import ipywidgets as widgets
# datascience version number of last run of this notebook
version.__version__
import sys
sys.path.append("..")
from ml_table import ML_Table
import locale
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
Out[1]:
In [2]:
# Missing horsepower values are coded as '?' in Auto.csv; drop those
# rows and convert the remaining entries to integers.
raw_auto = ML_Table.read_table("data/Auto.csv")
auto = raw_auto.where(raw_auto['horsepower'] != '?')
auto['horsepower'] = auto.apply(int, 'horsepower')
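The cleaning step above silently drops rows; a quick sanity check of how many were removed:

In [ ]:
# Number of rows dropped because horsepower was '?'.
raw_auto.num_rows - auto.num_rows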
In [3]:
auto
Out[3]:
In [4]:
# Fit a degree-2 polynomial model of mpg as a function of horsepower.
auto_poly = auto.poly('mpg', 'horsepower', 2)
In [5]:
# Training error: MSE of the degree-2 fit on the data it was fit to.
auto.MSE_model('mpg', auto_poly, 'horsepower')
Out[5]:
In [6]:
Table().with_columns([
    ('degree', range(1, 10)),
    ('Training error', [auto.MSE_model('mpg', auto.poly('mpg', 'horsepower', degree), 'horsepower')
                        for degree in range(1, 10)])
]).plot('degree', height=3, width=4)
In [7]:
def poly_cross(tbl, output_label, input_label, max_degree):
    # Validation-set approach: fit polynomials of degree 1 through
    # max_degree - 1 on a random half of the rows and report each
    # fit's MSE on the held-out half.
    train, test = tbl.split(tbl.num_rows//2)
    return [test.MSE_model(output_label, train.poly(output_label, input_label, deg), input_label)
            for deg in range(1, max_degree)]
In [8]:
poly_cross(auto, 'mpg', 'horsepower', 10)
Out[8]:
In [9]:
# Repeat the validation-set estimate ten times to see how much it
# varies with the random split.
auto_poly = Table().with_column('degree', range(1, 10))
for k in range(10):
    auto_poly = auto_poly.with_column("run " + str(k), poly_cross(auto, 'mpg', 'horsepower', 10))
auto_poly
Out[9]:
In [68]:
auto_poly.plot('degree')
plots.ylim(10,30)
Out[68]:
In [23]:
auto.take(13)
Out[23]:
In [36]:
def LOOCV_poly(tbl, output_label, input_labels, degree):
    # Leave-one-out cross-validation: fit on all rows but one, test on
    # the held-out row, and average the n single-row MSEs.
    n = tbl.num_rows
    def split(i):
        return tbl.exclude(i), tbl.take(i)
    MSEs = [test.MSE_model(output_label,
                           train.poly(output_label, input_labels, degree),
                           input_labels)
            for train, test in [split(i) for i in range(n)]]
    return np.sum(MSEs)/n
In [37]:
LOOCV_poly(auto, 'mpg', 'horsepower', 1)
Out[37]:
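For a least-squares fit, LOOCV does not actually require n refits: ISLR's equation (5.2) computes it from a single full-data fit using the leverage values. A minimal numpy sketch, assuming ML_Table.poly fits by ordinary least squares, in which case the result should match LOOCV_poly(auto, 'mpg', 'horsepower', 1) above:

In [ ]:
# LOOCV shortcut for OLS: CV_n = mean(((y - yhat) / (1 - h))**2),
# where h holds the diagonal (leverage) entries of the hat matrix.
x = auto['horsepower'].astype(float)
y = auto['mpg']
X = np.vander(x, 2)                    # design matrix for a degree-1 fit
H = X @ np.linalg.inv(X.T @ X) @ X.T   # hat matrix
yhat = H @ y                           # fitted values from one full-data fit
h = np.diag(H)                         # leverage of each observation
np.mean(((y - yhat) / (1 - h))**2)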
In [38]:
# This takes a while with 392 * 9 poly fits
auto_poly_loocv = Table().with_column('degree', range(1,10))
auto_poly_loocv['LOOCV'] = [LOOCV_poly(auto, 'mpg', 'horsepower', deg) for deg in auto_poly_loocv['degree']]
auto_poly_loocv
Out[38]:
In [70]:
auto_poly_loocv.plot('degree')
In [40]:
auto.num_rows
Out[40]:
In [41]:
auto.take(range(9,13))
Out[41]:
In [63]:
def k_split(tbl, i, k):
    # Split tbl into the training rows and the i-th of k contiguous folds.
    n = tbl.num_rows
    nk = n//k
    fold = range(i*nk, (i+1)*nk)
    return tbl.exclude(fold), tbl.take(fold)

def k_fold_poly(tbl, output_label, input_labels, degree, k):
    # k-fold cross-validation: average the held-out MSE over the k folds.
    MSEs = [test.MSE_model(output_label,
                           train.poly(output_label, input_labels, degree),
                           input_labels)
            for train, test in [k_split(tbl, i, k) for i in range(k)]]
    return np.sum(MSEs)/k
In [64]:
k_split(auto, 0, 10)
Out[64]:
In [65]:
k_fold_poly(auto, 'mpg', 'horsepower', 1, 10)
Out[65]:
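Note that k_split takes contiguous blocks of rows, so the folds inherit any row ordering in the table. A sketch that shuffles the rows first, assuming Table.sample without replacement returns an ML_Table (the notebook already relies on take and exclude doing so):

In [ ]:
# Shuffle the rows so each fold is a random subset, then cross-validate.
shuffled = auto.sample(auto.num_rows, with_replacement=False)
k_fold_poly(shuffled, 'mpg', 'horsepower', 2, 10)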
In [66]:
auto_poly_k_fold = Table().with_column('degree', range(1,10))
auto_poly_k_fold['k fold'] = [k_fold_poly(auto, 'mpg', 'horsepower', deg, 10) for deg in auto_poly_k_fold['degree']]
auto_poly_k_fold
Out[66]:
In [67]:
auto_poly_k_fold.plot('degree')
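Overlaying the LOOCV and 10-fold curves makes it easy to see how closely the two estimates agree; Table.join on 'degree' is a standard datascience operation:

In [ ]:
# Both cross-validation estimates of test MSE on one set of axes.
auto_poly_loocv.join('degree', auto_poly_k_fold).plot('degree')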
In [100]:
n = 400
eps = 0.1
# Synthetic 2D classification data: latent inputs (ix, iy) with a linear
# class boundary at ix + iy = 0, observed through noisy features (x, y).
test2 = ML_Table.runiform('ix', n)
test2['iy'] = np.random.rand(n)
test2['Cat'] = test2.apply(lambda x, y: 'A' if x + y < 0 else 'B', ['ix', 'iy'])
test2['Class A'] = test2.apply(lambda c: 1 if c == 'A' else 0, 'Cat')
test2['x'] = test2['ix'] + eps*np.random.normal(size=n)
test2['y'] = test2['iy'] + eps*np.random.normal(size=n)
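A quick look at the synthetic classes before fitting anything; this uses plain matplotlib rather than any ML_Table plotting helper:

In [ ]:
# Observed features colored by class membership.
plots.figure(figsize=(4, 4))
plots.scatter(test2['x'], test2['y'], c=test2['Class A'], cmap='coolwarm', s=10)
plots.xlabel('x')
plots.ylabel('y')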
In [101]:
# Fit logistic regression on the full data set and show its decision
# boundary; the error reported here is training error.
logit2d = test2.logit_regression('Class A', ['x', 'y'])
model_2d = logit2d.model
ax = test2.plot_cut_2d('Class A', 'x', 'y', model_2d, n_grid=50)
test2.classification_error_model('Class A', model_2d, ['x', 'y'])
Out[101]:
In [102]:
# Validation-set approach: fit logistic regression on half the rows
# and evaluate its classification error on the held-out half.
train, test = test2.split(n//2)
classifier = train.logit_regression('Class A', ['x', 'y'])
ax = test.plot_cut_2d('Class A', 'x', 'y', classifier.model, n_grid=50)
test.classification_error_model('Class A', classifier.model, ['x', 'y'])
Out[102]:
In [103]:
# Same validation split, k-nearest-neighbors classifier with 5 neighbors.
train, test = test2.split(n//2)
classifier = train.knn_regression('Class A', ['x', 'y'], n_neighbors=5)
ax = test.plot_cut_2d('Class A', 'x', 'y', classifier.model, n_grid=50)
test.classification_error_model('Class A', classifier.model, ['x', 'y'])
Out[103]:
In [104]:
# Same validation split, linear discriminant analysis.
train, test = test2.split(n//2)
classifier = train.LDA('Class A', ['x', 'y'])
ax = test.plot_cut_2d('Class A', 'x', 'y', classifier.model, n_grid=50)
test.classification_error_model('Class A', classifier.model, ['x', 'y'])
Out[104]:
In [116]:
def k_split(tbl, i, k):
    # As above: the training rows and the i-th of k contiguous folds.
    n = tbl.num_rows
    nk = n//k
    fold = range(i*nk, (i+1)*nk)
    return tbl.exclude(fold), tbl.take(fold)

def k_error(i, k, tbl, classifier, output_label, input_labels, **kwargs):
    # Classification error on fold i for a model trained on the other folds.
    train, test = k_split(tbl, i, k)
    return test.classification_error_model(output_label,
                                           classifier(train, output_label, input_labels, **kwargs).model,
                                           input_labels)

def k_fold(k, tbl, classifier, output_label, input_labels, **kwargs):
    # Per-fold classification errors for any ML_Table classifier method.
    return [k_error(i, k, tbl, classifier, output_label, input_labels, **kwargs)
            for i in range(k)]
In [119]:
k_error(0, 10, test2, ML_Table.LDA, 'Class A', ['x', 'y'])
Out[119]:
In [120]:
k_fold(10, test2, ML_Table.logit_regression, 'Class A', ['x', 'y'])
Out[120]:
In [121]:
k_fold(10, test2, ML_Table.knn_regression, 'Class A', ['x', 'y'])
Out[121]:
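Because k_fold forwards **kwargs to the classifier, it can also tune classifier settings; a sketch comparing a few neighborhood sizes for KNN (n_neighbors is the same keyword used in the validation-split cell above):

In [ ]:
# Mean 10-fold CV error as the KNN neighborhood grows.
[np.mean(k_fold(10, test2, ML_Table.knn_regression, 'Class A', ['x', 'y'], n_neighbors=k))
 for k in (1, 5, 15)]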
In [122]:
k_fold(10, test2, ML_Table.LDA, 'Class A', ['x', 'y'])
Out[122]:
In [123]:
# Mean 10-fold CV error for each of the three classifiers.
[np.mean(k_fold(10, test2, ML_Table.logit_regression, 'Class A', ['x', 'y'])),
np.mean(k_fold(10, test2, ML_Table.knn_regression, 'Class A', ['x', 'y'])),
np.mean(k_fold(10, test2, ML_Table.LDA, 'Class A', ['x', 'y']))]
Out[123]:
In [138]:
def boot(tbl, classifier, output_label, input_labels, **kwargs):
    # Fit and score the classifier on a single bootstrap resample of tbl.
    # Note that the model is evaluated on the same resample it was fit to.
    test = tbl.sample(tbl.num_rows, with_replacement=True)
    return test.classification_error_model(output_label,
                                           classifier(test, output_label, input_labels, **kwargs).model,
                                           input_labels)

def bootstrap_classifier(k, tbl, classifier, output_label, input_labels, **kwargs):
    # The distribution of that error over k bootstrap replications.
    return [boot(tbl, classifier, output_label, input_labels, **kwargs) for i in range(k)]
In [139]:
boot(test2, ML_Table.logit_regression, 'Class A', ['x', 'y'])
Out[139]:
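Since boot scores each model on the very resample it was fit to, it reports a resampled training error. A sketch of the more conventional out-of-bag variant, which scores on the rows the resample never drew; like k_split, it assumes take preserves the ML_Table class:

In [ ]:
def boot_oob(tbl, classifier, output_label, input_labels, **kwargs):
    # Fit on a bootstrap resample, test on the out-of-bag rows.
    n = tbl.num_rows
    rows = np.random.randint(n, size=n)        # bootstrap row indices
    oob = np.setdiff1d(np.arange(n), rows)     # rows never drawn
    train, test = tbl.take(rows), tbl.take(oob)
    return test.classification_error_model(output_label,
                                           classifier(train, output_label, input_labels, **kwargs).model,
                                           input_labels)

boot_oob(test2, ML_Table.logit_regression, 'Class A', ['x', 'y'])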
In [142]:
# Bootstrap error distributions for the three classifiers, 100 replications each.
test2_classifiers = Table()
test2_classifiers['Logit Error'] = bootstrap_classifier(100, test2, ML_Table.logit_regression, 'Class A', ['x', 'y'])
test2_classifiers['LDA Error'] = bootstrap_classifier(100, test2, ML_Table.LDA, 'Class A', ['x', 'y'])
test2_classifiers['KNN Error'] = bootstrap_classifier(100, test2, ML_Table.knn_regression, 'Class A', ['x', 'y'])
test2_classifiers.hist()
In [143]:
test2_classifiers.stats(ops=[min, np.mean, np.median, max])
Out[143]: