In [23]:
import matplotlib
%matplotlib inline
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.models import HoverTool
from bokeh.io import output_notebook, save
from clean_data import (get_models, predict, explain_model, LATEST_DATA as results_2016, get_df, 
                        get_features, regression_target, get_training_data, predict_winner, predict_scores)
import numpy as np
import numpy.random as nr
from scipy.special import logit, expit
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
pd.options.display.float_format = '{:,.1f}'.format

output_notebook()
WIDTH = 800
HEIGHT = 768


Loading BokehJS ...

In [2]:
reg, clf = get_models(cv=False)
reg_cv, clf_cv = get_models(cv=True)


/Users/colin/.venv/ai_talk_code/lib/python3.6/site-packages/scipy/linalg/basic.py:884: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.
  warnings.warn(mesg, RuntimeWarning)

In [26]:
def plot_scores(reg, n=1000):
    sub = get_training_data(results_2016).sample(n=n)
    df_in = get_features(sub)
    preds = reg.predict(df_in)
    sub['predicted'] = preds[:, 0]  # column 0: first team's predicted score, column 1: second team's
    sub['predicted_first'] = preds[:, 0].round().astype(int)
    sub['predicted_second'] = preds[:, 1].round().astype(int)


    source = ColumnDataSource(data=sub)

    dot = figure(title="", tools="", toolbar_location=None, width=WIDTH, x_axis_label='Actual Score', y_axis_label='Predicted Score')
    dot.circle(x='score_first', y='predicted', size=15, fill_alpha=0.3, source=source)
    # Reference line y = x, drawn only over the range where actual and predicted scores overlap
    min_coord = max(sub.score_first.min(), sub.predicted.min())
    max_coord = min(sub.score_first.max(), sub.predicted.max())
    dot.segment(min_coord, min_coord, max_coord, max_coord, line_width=5, color='black', line_cap="round")

    dot.xaxis.major_label_text_font_size = "20pt"
    dot.yaxis.major_label_text_font_size = "20pt"
    dot.xaxis.axis_label_text_font_size="20pt"
    dot.yaxis.axis_label_text_font_size="20pt"
    
    save(dot, 'scores.html')
    show(dot)

In [28]:
def plot_residuals(reg):
    sub = get_training_data(results_2016)
    df_in = get_features(sub)
    preds = reg.predict(df_in)
    sub['error'] = preds[:, 0] - sub['score_first']


    hist, edges = np.histogram(sub.error, density=True, bins=50)
    bars = figure(title="", tools="", toolbar_location=None, width=WIDTH, x_axis_label='Error', y_axis_label='')
    bars.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
        fill_color="#036564", line_color="#033649")
    
    bars.xaxis.major_label_text_font_size = "20pt"
    bars.xaxis.axis_label_text_font_size="20pt"
    bars.yaxis.visible = False
    save(bars, 'residuals.html')

    show(bars)

In [40]:
def plot_logit():
    x = np.linspace(1e-3, 1-1e-3, 100)
    y = logit(x)
    p1 = figure(title="", tools="")
    p1.line(x, y)
    p1.segment(x, 0, x, y)
    save(p1, 'logit.html')

    show(p1)

In [39]:
def plot_sigmoid():
    x = np.linspace(-4, 4, 100)
    y = expit(x)
    p1 = figure(title="", tools="")
    p1.line(x, y)
    p1.segment(0, y, x, y)
    save(p1, 'sigmoid.html')

    show(p1)

In [31]:
def plot_overfitting(line=False):
    nr.seed(42)
    x = nr.random(30)
    y = 2 * x + 0.1 * nr.randn(x.shape[0])
    dot = figure(title="", tools="", toolbar_location=None, width=WIDTH, x_axis_label='x', y_axis_label='y', x_range=(0, 1), y_range=(-0.5, 2.5))
    dot.circle(x=x, y=y, size=15, fill_alpha=0.3)
    if line:
        # A degree-25 polynomial on 30 points is flexible enough to chase the noise
        pipe = make_pipeline(PolynomialFeatures(degree=25), LinearRegression(fit_intercept=False))
        model = pipe.fit(np.atleast_2d(x).T, y)
        new_x = np.linspace(0, 1, 1000)
        preds = model.predict(np.atleast_2d(new_x).T)
        dot.line(x=new_x, y=preds, color='red')
    save(dot, 'overfitting.html')

    show(dot)

In [32]:
def plot_orings(line=False):
    df = pd.read_csv('orings.csv')
    df.Temperature += nr.randn(df.Temperature.size)  # jitter so overlapping points stay visible
    dot = figure(title="O-Ring Failures", tools="", toolbar_location=None, width=WIDTH,
                 x_axis_label='Temperature', y_axis_label='O-Ring Problems')
    dot.circle(x='Temperature', y='Failure', size=15, fill_alpha=0.3, source=df)
    save(dot, 'orings.html')

    show(dot)

Machine Learning and Probabilistic Programming

Colin Carroll, Kensho

Raw Data


In [9]:
get_df(2016).head()


Out[9]:
ast blk day_num dr fga fga3 fgm fgm3 fta ftm ... num_ot or pf score season stl team team_name to won
0 13 6 11 28 57 17 29 4 27 15 ... 0 10 21 77 2016 11 1104 Alabama 12 True
1 6 4 11 27 55 19 19 7 26 19 ... 0 12 25 64 2016 7 1244 Kennesaw 16 False
2 10 1 11 23 64 29 25 8 17 10 ... 1 14 25 68 2016 12 1105 Alabama A&M 15 True
3 11 7 11 30 57 27 22 7 26 16 ... 1 18 21 67 2016 6 1408 Tulane 19 False
4 12 6 11 34 61 20 24 6 32 25 ... 0 17 21 79 2016 4 1112 Arizona 14 True

5 rows × 22 columns

Model


In [10]:
predict(reg, clf, 'North Carolina', 'Connecticut')


North Carolina has a 80% chance of beating Connecticut
Predicted Score:
	North Carolina 75 Connecticut 66

What is going on here?

  • Two models:
    • Classification
    • Regression

In [11]:
predict_winner(clf, 'North Carolina', 'Connecticut')


Out[11]:
'North Carolina has a 80% chance of beating Connecticut'

In [12]:
predict_scores(reg, 'North Carolina', 'Connecticut')


Out[12]:
'North Carolina 75 Connecticut 66'

  • Transform raw data to features
  • Train a model
  • Measure how accurate you expect the model to be

Turning raw data into features

Surprisingly hard not to peek at the future.
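Season-to-date averages, for instance, must be built from games played before the one being predicted. A minimal sketch of the idea, assuming a single team's games sit in a DataFrame sorted by date with a score column (illustrative names, not necessarily what get_features does):

def season_to_date_average(team_games):
    # expanding().mean() averages rows 0..i; shift(1) pushes that down one row,
    # so each game's feature only uses games strictly before it (no peeking).
    return team_games['score'].expanding().mean().shift(1)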


In [13]:
get_features(get_training_data(results_2016)).head()


Out[13]:
avg_score_first avg_score_second avg_or_first avg_or_second avg_dr_first avg_dr_second avg_to_first avg_to_second avg_stl_first avg_stl_second home_game_first home_game_second logit_fg_pct_first logit_fg_pct_second logit_fg3_pct_first logit_fg3_pct_second logit_ft_pct_first logit_ft_pct_second logit_avg_won_first logit_avg_won_second
0 85.6 61.3 12.7 14.4 29.1 25.4 12.6 16.5 5.9 5.1 True False -0.2 -0.6 -0.4 -0.8 1.1 0.6 0.3 -13.8
1 63.5 75.9 7.4 7.5 22.1 28.0 12.8 13.6 5.5 5.2 True False -0.3 0.0 -0.6 -0.5 1.1 0.9 -0.2 0.9
2 54.8 70.3 9.6 12.7 20.6 24.6 16.4 14.3 7.7 4.3 False True -0.5 -0.2 -0.8 -0.7 0.4 0.5 -1.4 -0.5
3 69.5 71.5 13.3 10.3 23.8 25.0 12.2 11.2 5.9 7.5 True False -0.4 -0.2 -0.8 -0.6 0.8 0.7 -0.8 0.6
4 67.1 65.2 15.1 9.0 21.6 22.8 13.9 12.7 8.4 6.2 False True -0.3 -0.3 -0.6 -0.8 0.6 0.6 -0.2 -0.6

Nonlinear models

Your features may depend nonlinearly on the raw data!
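Shooting percentages, for example, live in $(0, 1)$, which is why the feature table above stores them on the logit scale. A rough sketch using the logit already imported from scipy.special (the clipping is only a guard against 0% or 100% games; get_features may handle this differently):

games = get_df(2016)
fg_pct = (games['fgm'] / games['fga']).clip(1e-6, 1 - 1e-6)
logit_fg_pct = logit(fg_pct)  # maps (0, 1) onto the whole real line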


In [41]:
plot_logit()



Turning features into a model

When using linear regression we assume that $$ \mathbf{score} = w_1 \cdot \mathbf{avg\_score} + w_2 \cdot \mathbf{fg\_pct} + \cdots + w_m \cdot \mathbf{win\_pct} $$

Try to find $(w_1, w_2, \ldots, w_m)$.

More concisely

Try to find a $\mathbf{w}$ satisfying $X\mathbf{w} = y$.
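With more games than features there is usually no exact solution, so "satisfying" really means least squares. A minimal sketch on synthetic data (not the basketball features), reusing the NumPy imports from the first cell:

X = nr.randn(200, 20)          # 200 "games", 20 features
true_w = nr.randn(20)
y = X @ true_w + 0.5 * nr.randn(200)

w_hat, *_ = np.linalg.lstsq(X, y, rcond=None)  # least-squares solution to Xw = y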


In [15]:
explain_model(reg)


predicted_score_first = 
	+0.51 * avg_score_first
	+0.74 * avg_score_second
	+0.24 * logit_fg_pct_first
	-0.75 * logit_fg_pct_second
	-0.03 * logit_fg3_pct_first
	-0.80 * logit_fg3_pct_second
	-0.32 * logit_ft_pct_first
	+0.98 * logit_ft_pct_second
	+0.20 * avg_or_first
	-0.94 * avg_or_second
	+2.93 * avg_dr_first
	-1.21 * avg_dr_second
	+4.13 * avg_to_first
	-24.27 * avg_to_second
	+0.03 * avg_stl_first
	-1.31 * avg_stl_second
	+0.13 * logit_avg_won_first
	-4.15 * logit_avg_won_second
	+0.08 * home_game_first
	-0.24 * home_game_second

What can we say about linear regression?

Linear regression minimizes the sum of squared errors

$$\sum (\mathbf{x}_j \cdot \mathbf{w} - y_j)^2$$

Linear regression finds the most likely weights

Given our data, $(X, \mathbf{y})$, Bayes' rule says that $$ P(\mathbf{w} | X, \mathbf{y}) = \frac{P(X, \mathbf{y} | \mathbf{w}) p(\mathbf{w})}{P(X, \mathbf{y})} $$

With a flat prior $p(\mathbf{w})$ and Gaussian noise on the scores, the $\mathbf{w}$ that maximizes this posterior is exactly the least-squares $\mathbf{w}$ above.

Linear regression is geometrically pleasant

(and syntactically terrifying)

$X\mathbf{w}$ is the nearest point to $\mathbf{y}$ in the $m$-dimensional subspace of $\mathbb{R}^n$ spanned by the columns of $X$.
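A quick numerical check of that claim on synthetic data: the least-squares fit $X\hat{\mathbf{w}}$ coincides with the orthogonal projection of $\mathbf{y}$ onto the columns of $X$.

X = nr.randn(200, 20)
y = nr.randn(200)
w_hat, *_ = np.linalg.lstsq(X, y, rcond=None)

projection = X @ np.linalg.inv(X.T @ X) @ X.T @ y  # project y onto the column space of X
assert np.allclose(X @ w_hat, projection)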

More guarantees*:

  • If there is no noise, the true $\mathbf{w}$ will be recovered
  • $\mathbf{w}$ is unique
  • $\mathbf{w}$ exists

(*not actually guaranteed)

Evaluating Fit


In [27]:
plot_scores(reg, 1000)



How wrong will I be?


In [34]:
plot_residuals(reg)



Cross validation, testing, overfitting...
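The degree-25 fit below looks perfect on its own training points; held-out data is what exposes it. A minimal sketch with scikit-learn's cross_val_score, using the same kind of pipeline as plot_overfitting (the 5-fold split and $R^2$ scoring are illustrative choices):

from sklearn.model_selection import cross_val_score

x = nr.random(30)
y = 2 * x + 0.1 * nr.randn(30)

pipe = make_pipeline(PolynomialFeatures(degree=25), LinearRegression(fit_intercept=False))
print(cross_val_score(pipe, np.atleast_2d(x).T, y, cv=5, scoring='r2').mean())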


In [35]:
plot_overfitting()



In [36]:
plot_overfitting(line=True)



Logistic regression, briefly


In [20]:
predict_winner(clf, 'North Carolina', 'Connecticut')


Out[20]:
'North Carolina has a 80% chance of beating Connecticut'

Instead of $$ \mathbf{y} = X\mathbf{w}, $$

$$ \mathbf{y} = \sigma \left(X\mathbf{w}\right) $$

What is $\sigma$?

$$ \sigma(x) = \frac{1}{1 + e^{-x}} $$
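This is the expit imported from scipy.special above; it squashes any real number into $(0, 1)$, which is what lets us read $\sigma(X\mathbf{w})$ as a probability:

x = np.linspace(-4, 4, 9)
assert np.allclose(expit(x), 1 / (1 + np.exp(-x)))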

In [42]:
plot_sigmoid()



Challenger Disaster

Challenger dataset


In [38]:
plot_orings()


