Chapter 2

Concepts and data from "An Introduction to Statistical Learning, with applications in R" (Springer, 2013) with permission from the authors: G. James, D. Witten, T. Hastie and R. Tibshirani " available at www.StatLearning.com.

For Tables reference see http://data8.org/datascience/tables.html

http://jeffskinnerbox.me/notebooks/matplotlib-2d-and-3d-plotting-in-ipython.html


In [1]:
# HIDDEN
# For Tables reference see http://data8.org/datascience/tables.html
# This useful nonsense should just go at the top of your notebook.
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
from sklearn import linear_model
plots.style.use('fivethirtyeight')
plots.rc('lines', linewidth=2, color='r')
from ipywidgets import interact, interactive, fixed
import ipywidgets as widgets
# datascience version number of last run of this notebook
version.__version__


import sys
sys.path.append("..")
from ml_table import ML_Table

import locale
locale.setlocale( locale.LC_ALL, 'en_US.UTF-8' )


Out[1]:
'en_US.UTF-8'

Acquiring and seeing trends in multidimensional data


In [2]:
advertising = ML_Table.read_table("./data/Advertising.csv")
advertising.relabel(0, "id")


Out[2]:
id TV Radio Newspaper Sales
1 230.1 37.8 69.2 22.1
2 44.5 39.3 45.1 10.4
3 17.2 45.9 69.3 9.3
4 151.5 41.3 58.5 18.5
5 180.8 10.8 58.4 12.9
6 8.7 48.9 75 7.2
7 57.5 32.8 23.5 11.8
8 120.2 19.6 11.6 13.2
9 8.6 2.1 1 4.8
10 199.8 2.6 21.2 10.6

... (190 rows omitted)

Figure 2.1


In [3]:
for col in ['TV', 'Radio', 'Newspaper']:
    advertising.select([col, 'Sales']).scatter(col, fit_line=True)


Model and irreducible error


In [4]:
# Create synthetic data with output value variying around a
# model given by a function by a noise term
def simdata(n, f, eps, low, high):
    x_tbl = ML_Table().sequence('x', n, low, high)
    s_tbl = ML_Table().with_column('i', range(n))
    s_tbl['x'] = x_tbl.sample()['x']+ (np.random.rand(n)-0.5)/(2*n)                
    s_tbl['f'] = s_tbl.apply(f, 'x')
    s_tbl['Y'] = s_tbl['f'] + eps*(np.random.rand(n)-0.5)
    return s_tbl

In [5]:
def sigmoid(x):
    return x/((1+x**2)**0.5)
plots.plot(np.arange(-2,2, .1), list(map(sigmoid, np.arange(-2,2, .1))))


Out[5]:
[<matplotlib.lines.Line2D at 0x1150d4d68>]

Figure 2.2


In [6]:
# Complication function modeling how income varies with years of education
def edfun(year):
    x = (year-15)/3
    return (sigmoid(x)+1)*35+20

In [7]:
ed_data = simdata(40, edfun, 10, 10, 22).select(['x', 'Y'])
ed_data.relabel('x', 'Years of Education')
ed_data.relabel('Y', 'Income')
ed_data


Out[7]:
Years of Education Income
11.2037 23.7493
19.0028 86.1182
17.7953 82.5638
15.1029 55.0108
17.1989 79.4269
13.3033 42.3989
12.7023 30.5353
14.8014 50.4985
10.9032 25.3556
14.1967 45.4556

... (30 rows omitted)


In [8]:
# What wold this data look like?
ed_data.scatter('Years of Education')



In [9]:
# Visualize the noise relative to the underlying model
ed_data.plot_fit('Income', edfun)


Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x114d55908>

Bias - Variance Tradeoff


In [10]:
# Fit a linear model and plot the goodness of fit
ed_data.plot_fit('Income', ed_data.linear_regression('Income').model)


Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x115909d68>

In [11]:
def show_mse_fit(deg):
    model = ed_data.poly('Income', 'Years of Education', deg)
    ed_data.plot_fit('Income', model)
    data_mse = ed_data.MSE('Income', ed_data.apply(model, 'Years of Education'))
    true_mse = ed_data.MSE(ed_data.apply(edfun, 'Years of Education'), ed_data.apply(model, 'Years of Education'))
    return data_mse, true_mse

In [12]:
show_mse_fit(2)


Out[12]:
(32.65937083625095, 18.097232880882125)

In [13]:
interact(show_mse_fit, deg=widgets.IntSlider(min=1,max=20,step=1,value=1))


(43.624516686884078, 24.393502485472233)
Out[13]:
<function __main__.show_mse_fit>

Multidimensional model and data


In [14]:
def simdata2D(n, f, eps, low, high):
    x_tbl = ML_Table().sequence('x', n, low, high)
    s_tbl = ML_Table().with_column('i', range(n))
    s_tbl['x'] = x_tbl.sample()['x']+ (np.random.rand(n)-0.5)/(2*n) 
    s_tbl['y'] = x_tbl.sample()['x']+ (np.random.rand(n)-0.5)/(2*n)    
    s_tbl['f'] = s_tbl.apply(f, ['x', 'y'])
    s_tbl['Z'] = s_tbl['f'] + eps*(np.random.rand(n)-0.5)
    return s_tbl

In [15]:
# Complication function modeling how income varies with years of education
def edfun2(year, seniority):
    # year in [10, 22], seniority in [0-30]
    x = (year-16)/3
    y = (seniority-15)/7.5
    return (sigmoid(x)+1)*35+20 + (sigmoid(y)+1)*25+20

In [16]:
def show_income_per_ed(ed_years):
    ts = ML_Table().with_column('Seniority', np.arange(0,30))
    ts['Education'] = ed_years
    ts['Income'] = ts.apply(edfun2, ['Education', 'Seniority'])
    ts.scatter('Seniority')
    
_ = interact(show_income_per_ed, ed_years=widgets.IntSlider(min=10,max=22,step=1,value=10))



In [17]:
# Create some two variable data using this model
n = 50
eps = .2
year_tbl = ML_Table().sequence('x', n, 10, 22)
seniority_tbl = ML_Table().sequence('x', n, 1, 20)
sy_table = ML_Table().with_columns(['Years of Education',
                                year_tbl.sample()['x'],
                                'Seniority',
                                seniority_tbl.sample()['x']])
sy_table['Ideal Income'] = sy_table.apply(edfun2, ['Years of Education', 'Seniority'])
sy_table['Income'] = sy_table['Ideal Income']*(1 + 2*eps*(np.random.rand(n)-0.5))
sy_table


Out[17]:
Years of Education Seniority Ideal Income Income
21.52 5.94 111.494 127.272
19.6 16.96 133.209 126.16
21.76 14.3 128.719 136.152
21.04 15.06 130.275 142.387
13.6 7.08 59.9832 48.887
12.16 14.68 71.3534 61.1289
19.84 6.32 108.664 116.376
11.44 10.88 58.7237 50.7375
12.88 5.18 54.9027 47.1641
18.64 6.7 104.573 121.806

... (40 rows omitted)


In [18]:
# Where are the sample points in the input space?
sy_table.select(['Years of Education', 'Seniority']).scatter('Seniority')



In [19]:
# Looking at only one input, either just the model (ideal) or what a sample miight produce
sy_table.select(['Income', 'Ideal Income', 'Years of Education']).scatter('Years of Education', fit_line=True)



In [20]:
# The data in 3D
sy_table.plot_fit_2d("Income", "Years of Education", "Seniority", width=10, height=6)


Out[20]:
<matplotlib.axes._subplots.Axes3DSubplot at 0x115cae0f0>

Figure 2.3


In [21]:
# The data and its relation to the model
sy_table.plot_fit_2d("Income", "Years of Education", "Seniority", edfun2, width=10,height=6)


Out[21]:
<matplotlib.axes._subplots.Axes3DSubplot at 0x115cc5a20>

Expected value of squared difference of actual and expected value: $$E(Y - \hat{Y})^2 = E[ f(X) + \epsilon - \hat{f}(X) ]^2$$ $$ = [f(X) - \hat{f}(X)]^2 + Var(\epsilon)$$

Figure 2.4

Linear model fit to the (simulated) education data in sy_table.


In [22]:
sim_ed = sy_table.drop('Ideal Income')
sim_lin_model = sim_ed.linear_regression('Income').model
sim_ed.plot_fit_2d('Income', 'Years of Education', 'Seniority', sim_lin_model, width=10, height=6)


Out[22]:
<matplotlib.axes._subplots.Axes3DSubplot at 0x117211080>

In [23]:
sim_ed["F^"] = sim_ed.apply(sim_lin_model, ['Years of Education', 'Seniority'])
sim_ed.MSE('Income', 'F^')


Out[23]:
97.741710431862558

In [24]:
sim_ed["Model"] = sim_ed.apply(edfun2, ['Years of Education', 'Seniority'])
sim_ed.MSE('Income', 'Model')


Out[24]:
101.89761602094578

In [25]:
sim_ridge_model = sim_ed.select(['Years of Education', 'Seniority', 'Income']).ridge_regression('Income').model
sim_ed.plot_fit_2d('Income', 'Years of Education', 'Seniority', sim_ridge_model , width=10, height=6)


Out[25]:
<matplotlib.axes._subplots.Axes3DSubplot at 0x117407320>

In [ ]: