In [23]:
import matplotlib
%matplotlib inline
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.models import HoverTool
from bokeh.io import output_notebook, save
from clean_data import (get_models, predict, explain_model, LATEST_DATA as results_2016, get_df, 
                        get_features, regression_target, get_training_data, predict_winner, predict_scores)
import numpy as np
import numpy.random as nr
from scipy.special import logit, expit
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
pd.options.display.float_format = '{:,.1f}'.format

output_notebook()
WIDTH = 800
HEIGHT = 768


Loading BokehJS ...

In [2]:
reg, clf = get_models(cv=False)
reg_cv, clf_cv = get_models(cv=True)


/Users/colin/.venv/ai_talk_code/lib/python3.6/site-packages/scipy/linalg/basic.py:884: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.
  warnings.warn(mesg, RuntimeWarning)

In [26]:
def plot_scores(reg, n=1000):
    sub = get_training_data(results_2016).sample(n=n)
    df_in = get_features(sub)
    preds = reg.predict(df_in)
    sub['predicted'] = preds[:, 0]  # column 0: first team's predicted score, column 1: second team's
    sub['predicted_first'] = preds[:, 0].round().astype(int)
    sub['predicted_second'] = preds[:, 1].round().astype(int)


    source = ColumnDataSource(data=sub)

    dot = figure(title="", tools="", toolbar_location=None, width=WIDTH, x_axis_label='Actual Score', y_axis_label='Predicted Score')
    dot.circle(x='score_first', y='predicted', size=15, fill_alpha=0.3, source=source)
    # Reference line y = x, drawn only over the range where actual and predicted scores overlap
    min_coord = max(sub.score_first.min(), sub.predicted.min())
    max_coord = min(sub.score_first.max(), sub.predicted.max())
    dot.segment(min_coord, min_coord, max_coord, max_coord, line_width=5, color='black', line_cap="round")

    dot.xaxis.major_label_text_font_size = "20pt"
    dot.yaxis.major_label_text_font_size = "20pt"
    dot.xaxis.axis_label_text_font_size="20pt"
    dot.yaxis.axis_label_text_font_size="20pt"
    
    save(dot, 'scores.html')
    show(dot)

In [28]:
def plot_residuals(reg):
    sub = get_training_data(results_2016)
    df_in = get_features(sub)
    preds = reg.predict(df_in)
    sub['error'] = preds[:, 0] - sub['score_first']


    hist, edges = np.histogram(sub.error, density=True, bins=50)
    bars = figure(title="", tools="", toolbar_location=None, width=WIDTH, x_axis_label='Error', y_axis_label='')
    bars.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
        fill_color="#036564", line_color="#033649")
    
    bars.xaxis.major_label_text_font_size = "20pt"
    bars.xaxis.axis_label_text_font_size="20pt"
    bars.yaxis.visible = False
    save(bars, 'residuals.html')

    show(bars)

In [40]:
def plot_logit():
    x = np.linspace(1e-3, 1-1e-3, 100)
    y = logit(x)
    p1 = figure(title="", tools="")
    p1.line(x, y)
    p1.segment(x, 0, x, y)
    save(p1, 'logit.html')

    show(p1)

In [39]:
def plot_sigmoid():
    x = np.linspace(-4, 4, 100)
    y = expit(x)
    p1 = figure(title="", tools="")
    p1.line(x, y)
    p1.segment(0, y, x, y)
    save(p1, 'sigmoid.html')

    show(p1)

In [31]:
def plot_overfitting(line=False):
    nr.seed(42)
    x = nr.random(30)
    y = 2 * x + 0.1 * nr.randn(x.shape[0])
    dot = figure(title="", tools="", toolbar_location=None, width=WIDTH, x_axis_label='x', y_axis_label='y', x_range=(0, 1), y_range=(-0.5, 2.5))
    dot.circle(x=x, y=y, size=15, fill_alpha=0.3)
    if line:
        # A degree-25 polynomial on 30 points is flexible enough to chase the noise
        pipe = make_pipeline(PolynomialFeatures(degree=25), LinearRegression(fit_intercept=False))
        model = pipe.fit(np.atleast_2d(x).T, y)
        new_x = np.linspace(0, 1, 1000)
        preds = model.predict(np.atleast_2d(new_x).T)
        dot.line(x=new_x, y=preds, color='red')
    save(dot, 'overfitting.html')

    show(dot)

In [32]:
def plot_orings(line=False):
    df = pd.read_csv('orings.csv')
    df.Temperature += nr.randn(df.Temperature.size)  # jitter so overlapping points stay visible
    dot = figure(title="O-Ring Failures", tools="", toolbar_location=None, width=WIDTH,
                 x_axis_label='Temperature', y_axis_label='O-Ring Problems')
    dot.circle(x='Temperature', y='Failure', size=15, fill_alpha=0.3, source=df)
    save(dot, 'orings.html')

    show(dot)

Machine Learning and Probabilistic Programming

Colin Carroll, Kensho

Raw Data


In [9]:
get_df(2016).head()


Out[9]:
ast blk day_num dr fga fga3 fgm fgm3 fta ftm ... num_ot or pf score season stl team team_name to won
0 13 6 11 28 57 17 29 4 27 15 ... 0 10 21 77 2016 11 1104 Alabama 12 True
1 6 4 11 27 55 19 19 7 26 19 ... 0 12 25 64 2016 7 1244 Kennesaw 16 False
2 10 1 11 23 64 29 25 8 17 10 ... 1 14 25 68 2016 12 1105 Alabama A&M 15 True
3 11 7 11 30 57 27 22 7 26 16 ... 1 18 21 67 2016 6 1408 Tulane 19 False
4 12 6 11 34 61 20 24 6 32 25 ... 0 17 21 79 2016 4 1112 Arizona 14 True

5 rows × 22 columns

Model


In [10]:
predict(reg, clf, 'North Carolina', 'Connecticut')


North Carolina has a 80% chance of beating Connecticut
Predicted Score:
	North Carolina 75 Connecticut 66

What is going on here?

  • Two models:
    • Classification
    • Regression

In [11]:
predict_winner(clf, 'North Carolina', 'Connecticut')


Out[11]:
'North Carolina has a 80% chance of beating Connecticut'

In [12]:
predict_scores(reg, 'North Carolina', 'Connecticut')


Out[12]:
'North Carolina 75 Connecticut 66'

  • Transform raw data to features
  • Train a model
  • Measure how accurate you expect the model to be

Turning raw data into features

Surprisingly hard not to peek at the future.
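Season-to-date averages, for instance, must be built from games played before the one being predicted. A minimal sketch of the idea, assuming a single team's games sit in a DataFrame sorted by date with a score column (illustrative names, not necessarily what get_features does):

def season_to_date_average(team_games):
    # expanding().mean() averages rows 0..i; shift(1) pushes that down one row,
    # so each game's feature only uses games strictly before it (no peeking).
    return team_games['score'].expanding().mean().shift(1)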


In [13]:
get_features(get_training_data(results_2016)).head()


Out[13]:
avg_score_first avg_score_second avg_or_first avg_or_second avg_dr_first avg_dr_second avg_to_first avg_to_second avg_stl_first avg_stl_second home_game_first home_game_second logit_fg_pct_first logit_fg_pct_second logit_fg3_pct_first logit_fg3_pct_second logit_ft_pct_first logit_ft_pct_second logit_avg_won_first logit_avg_won_second
0 85.6 61.3 12.7 14.4 29.1 25.4 12.6 16.5 5.9 5.1 True False -0.2 -0.6 -0.4 -0.8 1.1 0.6 0.3 -13.8
1 63.5 75.9 7.4 7.5 22.1 28.0 12.8 13.6 5.5 5.2 True False -0.3 0.0 -0.6 -0.5 1.1 0.9 -0.2 0.9
2 54.8 70.3 9.6 12.7 20.6 24.6 16.4 14.3 7.7 4.3 False True -0.5 -0.2 -0.8 -0.7 0.4 0.5 -1.4 -0.5
3 69.5 71.5 13.3 10.3 23.8 25.0 12.2 11.2 5.9 7.5 True False -0.4 -0.2 -0.8 -0.6 0.8 0.7 -0.8 0.6
4 67.1 65.2 15.1 9.0 21.6 22.8 13.9 12.7 8.4 6.2 False True -0.3 -0.3 -0.6 -0.8 0.6 0.6 -0.2 -0.6

Nonlinear models

Your features may depend nonlinearly on the raw data!
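Shooting percentages, for example, live in $(0, 1)$, which is why the feature table above stores them on the logit scale. A rough sketch using the logit already imported from scipy.special (the clipping is only a guard against 0% or 100% games; get_features may handle this differently):

games = get_df(2016)
fg_pct = (games['fgm'] / games['fga']).clip(1e-6, 1 - 1e-6)
logit_fg_pct = logit(fg_pct)  # maps (0, 1) onto the whole real line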


In [41]:
plot_logit()



Turning features into a model

When using linear regression we assume that $$ \mathbf{score} = w_1 \cdot \mathbf{avg\_score} + w_2 \cdot \mathbf{fg\_pct} + \cdots + w_m \cdot \mathbf{win\_pct} $$

Try to find $(w_1, w_2, \ldots, w_m)$.

More concisely

Try to find a $\mathbf{w}$ satisfying $X\mathbf{w} = y$.
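With more games than features there is usually no exact solution, so "satisfying" really means least squares. A minimal sketch on synthetic data (not the basketball features), reusing the NumPy imports from the first cell:

X = nr.randn(200, 20)          # 200 "games", 20 features
true_w = nr.randn(20)
y = X @ true_w + 0.5 * nr.randn(200)

w_hat, *_ = np.linalg.lstsq(X, y, rcond=None)  # least-squares solution to Xw = y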


In [15]:
explain_model(reg)


predicted_score_first = 
	+0.51 * avg_score_first
	+0.74 * avg_score_second
	+0.24 * logit_fg_pct_first
	-0.75 * logit_fg_pct_second
	-0.03 * logit_fg3_pct_first
	-0.80 * logit_fg3_pct_second
	-0.32 * logit_ft_pct_first
	+0.98 * logit_ft_pct_second
	+0.20 * avg_or_first
	-0.94 * avg_or_second
	+2.93 * avg_dr_first
	-1.21 * avg_dr_second
	+4.13 * avg_to_first
	-24.27 * avg_to_second
	+0.03 * avg_stl_first
	-1.31 * avg_stl_second
	+0.13 * logit_avg_won_first
	-4.15 * logit_avg_won_second
	+0.08 * home_game_first
	-0.24 * home_game_second

What can we say about linear regression?

Linear regression minimizes the sum of squared errors

$$\sum (\mathbf{x}_j \cdot \mathbf{w} - y_j)^2$$

Linear regression finds the most likely weights

Given our data, $(X, \mathbf{y})$, Bayes' rule says that $$ P(\mathbf{w} | X, \mathbf{y}) = \frac{P(X, \mathbf{y} | \mathbf{w}) p(\mathbf{w})}{P(X, \mathbf{y})} $$

With a flat prior $p(\mathbf{w})$ and Gaussian noise on the scores, the $\mathbf{w}$ that maximizes this posterior is exactly the least-squares $\mathbf{w}$ above.

Linear regression is geometrically pleasant

(and syntactically terrifying)

$X\mathbf{w}$ is the nearest point to $\mathbf{y}$ in the $m$-dimensional subspace of $\mathbb{R}^n$ spanned by the columns of $X$.
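A quick numerical check of that claim on synthetic data: the least-squares fit $X\hat{\mathbf{w}}$ coincides with the orthogonal projection of $\mathbf{y}$ onto the columns of $X$.

X = nr.randn(200, 20)
y = nr.randn(200)
w_hat, *_ = np.linalg.lstsq(X, y, rcond=None)

projection = X @ np.linalg.inv(X.T @ X) @ X.T @ y  # project y onto the column space of X
assert np.allclose(X @ w_hat, projection)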

More guarantees*:

  • If there is no noise, the true $\mathbf{w}$ will be recovered
  • $\mathbf{w}$ is unique
  • $\mathbf{w}$ exists

(*not actually guaranteed)

Evaluating Fit


In [27]:
plot_scores(reg, 1000)



How wrong will I be?


In [34]:
plot_residuals(reg)



Cross validation, testing, overfitting...
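The degree-25 fit below looks perfect on its own training points; held-out data is what exposes it. A minimal sketch with scikit-learn's cross_val_score, using the same kind of pipeline as plot_overfitting (the 5-fold split and $R^2$ scoring are illustrative choices):

from sklearn.model_selection import cross_val_score

x = nr.random(30)
y = 2 * x + 0.1 * nr.randn(30)

pipe = make_pipeline(PolynomialFeatures(degree=25), LinearRegression(fit_intercept=False))
print(cross_val_score(pipe, np.atleast_2d(x).T, y, cv=5, scoring='r2').mean())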


In [35]:
plot_overfitting()



In [36]:
plot_overfitting(line=True)



Logistic regression, briefly


In [20]:
predict_winner(clf, 'North Carolina', 'Connecticut')


Out[20]:
'North Carolina has a 80% chance of beating Connecticut'

Instead of $$ \mathbf{y} = X\mathbf{w}, $$

$$ \mathbf{y} = \sigma \left(X\mathbf{w}\right) $$

What is $\sigma$?

$$ \sigma(x) = \frac{1}{1 + e^{-x}} $$
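This is the expit imported from scipy.special above; it squashes any real number into $(0, 1)$, which is what lets us read $\sigma(X\mathbf{w})$ as a probability:

x = np.linspace(-4, 4, 9)
assert np.allclose(expit(x), 1 / (1 + np.exp(-x)))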

In [42]:
plot_sigmoid()



Challenger Disaster

Challenger dataset


In [38]:
plot_orings()


