In [23]:
import matplotlib
%matplotlib inline
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.models import HoverTool
from bokeh.io import output_notebook, save
from clean_data import (get_models, predict, explain_model, LATEST_DATA as results_2016, get_df,
get_features, regression_target, get_training_data, predict_winner, predict_scores)
import numpy as np
import numpy.random as nr
from scipy.special import logit, expit
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
# Render floats in pandas output with thousands separators and one decimal place.
pd.options.display.float_format = '{:,.1f}'.format
# Configure Bokeh so that show(...) renders inline in the notebook.
output_notebook()
# Shared plot dimensions; WIDTH is passed as `width=` to several figures below.
# NOTE(review): HEIGHT is not referenced by any visible cell - confirm it is still needed.
WIDTH = 800
HEIGHT = 768
In [2]:
# Build two model pairs via clean_data.get_models, one per value of `cv`.
# NOTE(review): presumably `cv` toggles cross-validation - confirm in clean_data.
# NOTE(review): reg_cv / clf_cv are not used by any visible cell.
reg, clf = get_models(cv=False)
reg_cv, clf_cv = get_models(cv=True)
In [26]:
def plot_scores(reg, n=1000):
    """Scatter `score_first` against the model's first predicted column.

    Samples rows from ``get_training_data(results_2016)``, runs
    ``reg.predict`` on their features and plots the actual ``score_first``
    value (x) against column 0 of the predictions (y). A black diagonal marks
    y = x over the range shared by the actual and predicted values. The
    figure is written to ``scores.html`` and then shown.

    Args:
        reg: Model whose ``predict`` returns a 2-D array; columns 0 and 1
            are read.
        n: Number of rows to sample. Capped at the number of available rows,
            because ``DataFrame.sample`` raises ``ValueError`` when asked for
            more rows than exist (sampling without replacement).
    """
    games = get_training_data(results_2016)
    sub = games.sample(n=min(n, len(games)))
    df_in = get_features(sub)
    preds = reg.predict(df_in)
    sub['predicted'] = preds[:, 0]
    # Rounded integer predictions travel with the data source even though the
    # scatter itself only reads 'score_first' and 'predicted'.
    sub['predicted_first'] = preds[:, 0].round().astype(int)
    sub['predicted_second'] = preds[:, 1].round().astype(int)
    source = ColumnDataSource(data=sub)
    dot = figure(title="", tools="", toolbar_location=None, width=WIDTH, x_axis_label='Actual Score', y_axis_label='Predicted Score')
    dot.circle(x='score_first', y='predicted', size=15, fill_alpha=0.3, source=source)
    # Reference diagonal, clipped to the interval covered by both axes' data.
    min_coord = max(sub.score_first.min(), sub.predicted.min())
    max_coord = min(sub.score_first.max(), sub.predicted.max())
    dot.segment(min_coord, min_coord, max_coord, max_coord, line_width=5, color='black', line_cap="round")
    for axis in (dot.xaxis, dot.yaxis):
        axis.major_label_text_font_size = "20pt"
        axis.axis_label_text_font_size = "20pt"
    save(dot, 'scores.html')
    show(dot)
In [28]:
def plot_residuals(reg):
    """Plot a density histogram of prediction errors on the training data.

    The error is column 0 of ``reg.predict`` minus the ``score_first``
    column, computed over every row of ``get_training_data(results_2016)``
    and binned into 50 bins. The figure is written to ``residuals.html`` and
    then shown.

    Args:
        reg: Model whose ``predict`` returns a 2-D array; column 0 is read.
    """
    games = get_training_data(results_2016)
    preds = reg.predict(get_features(games))
    # Keep the residuals in a local Series instead of writing an 'error'
    # column into the frame: it comes straight from get_training_data with no
    # copy, so mutating it could leak into data other cells rely on.
    errors = preds[:, 0] - games['score_first']
    hist, edges = np.histogram(errors, density=True, bins=50)
    bars = figure(title="", tools="", toolbar_location=None, width=WIDTH, x_axis_label='Error', y_axis_label='')
    # One rectangle per bin: consecutive edges give the left/right sides.
    bars.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
              fill_color="#036564", line_color="#033649")
    bars.xaxis.major_label_text_font_size = "20pt"
    bars.xaxis.axis_label_text_font_size = "20pt"
    bars.yaxis.visible = False
    save(bars, 'residuals.html')
    show(bars)
In [40]:
def plot_logit():
    """Draw the logit function on (0, 1), save it to ``logit.html`` and show it.

    The curve is evaluated at 100 evenly spaced points between 0.001 and
    0.999, and a vertical segment joins each point to the y = 0 line.
    """
    probabilities = np.linspace(1e-3, 1 - 1e-3, 100)
    log_odds = logit(probabilities)

    fig = figure(title="", tools="")
    fig.line(x=probabilities, y=log_odds)
    # Vertical drop lines from the horizontal axis up (or down) to the curve.
    fig.segment(x0=probabilities, y0=0, x1=probabilities, y1=log_odds)

    save(fig, 'logit.html')
    show(fig)
In [39]:
def plot_sigmoid():
    """Draw the sigmoid (``expit``) on [-4, 4], save it to ``sigmoid.html`` and show it.

    The curve is evaluated at 100 evenly spaced points, and a horizontal
    segment joins each point to the x = 0 line.
    """
    inputs = np.linspace(-4, 4, 100)
    outputs = expit(inputs)

    fig = figure(title="", tools="")
    fig.line(x=inputs, y=outputs)
    # Horizontal lines from the vertical axis out to the curve.
    fig.segment(x0=0, y0=outputs, x1=inputs, y1=outputs)

    save(fig, 'sigmoid.html')
    show(fig)
In [31]:
def plot_overfitting(line=False):
    """Plot 30 noisy samples of y = 2x, optionally with a degree-25 polynomial fit.

    The global NumPy random state is reseeded with 42 on every call, so the
    points are the same each time. The figure is written to
    ``overfitting.html`` and then shown.

    Args:
        line: When true, fit a degree-25 polynomial regression to the points
            and draw its predictions over [0, 1] in red.
    """
    nr.seed(42)
    xs = nr.random(30)
    ys = 2 * xs + 0.1 * nr.randn(xs.shape[0])

    scatter = figure(
        title="", tools="", toolbar_location=None, width=WIDTH,
        x_axis_label='x', y_axis_label='y',
        x_range=(0, 1), y_range=(-0.5, 2.5),
    )
    scatter.circle(x=xs, y=ys, size=15, fill_alpha=0.3)

    if line:
        # No intercept in the regressor: PolynomialFeatures already emits the
        # constant (degree-0) column.
        model = make_pipeline(
            PolynomialFeatures(degree=25),
            LinearRegression(fit_intercept=False),
        )
        model.fit(xs[:, np.newaxis], ys)
        grid = np.linspace(0, 1, 1000)
        fitted = model.predict(grid[:, np.newaxis])
        scatter.line(x=grid, y=fitted, color='red')

    save(scatter, 'overfitting.html')
    show(scatter)
In [32]:
def plot_orings(line=False):
    """Scatter O-ring failures against (jittered) temperature from ``orings.csv``.

    Standard-normal noise is added to the ``Temperature`` column before
    plotting. The figure is written to ``orings.html`` and then shown.

    Args:
        line: When true, fit a logistic regression of failure on the
            un-jittered temperature and overlay the predicted failure
            probability in red. Previously this flag was accepted but
            ignored. Fitting raises ``ValueError`` if the data contains only
            one outcome class.
    """
    df = pd.read_csv('orings.csv')
    # Keep the recorded temperatures for fitting; the jitter is display-only.
    recorded_temperature = df['Temperature'].to_numpy(dtype=float)
    df['Temperature'] = df['Temperature'] + nr.randn(len(df))
    dot = figure(title="O-Ring Failures", tools="", toolbar_location=None, width=WIDTH,
                 x_axis_label='Temperature', y_axis_label='O-Ring Problems')
    dot.circle(x='Temperature', y='Failure', size=15, fill_alpha=0.3, source=df)
    if line:
        # NOTE(review): assumes 'Failure' is a 0/1 indicator or a count;
        # any positive value is treated as a failure - confirm against the CSV.
        failed = (df['Failure'] > 0).astype(int)
        model = LogisticRegression().fit(recorded_temperature.reshape(-1, 1), failed)
        grid = np.linspace(recorded_temperature.min(), recorded_temperature.max(), 200)
        # Column 1 of predict_proba is the probability of class 1 (failure).
        failure_probability = model.predict_proba(grid.reshape(-1, 1))[:, 1]
        dot.line(x=grid, y=failure_probability, color='red')
    save(dot, 'orings.html')
    show(dot)
In [9]:
# Preview the first rows of what get_df returns for 2016.
get_df(2016).head()
Out[9]:
In [10]:
# Combined prediction for North Carolina vs. Connecticut, using both `reg` and `clf`.
predict(reg, clf, 'North Carolina', 'Connecticut')
In [11]:
# Winner prediction for North Carolina vs. Connecticut, using only `clf`.
predict_winner(clf, 'North Carolina', 'Connecticut')
Out[11]:
In [12]:
# Score prediction for North Carolina vs. Connecticut, using only `reg`.
predict_scores(reg, 'North Carolina', 'Connecticut')
Out[12]:
In [13]:
# Preview the first rows of get_features applied to the training data.
get_features(get_training_data(results_2016)).head()
Out[13]:
In [41]:
# Draw the logit curve (also writes logit.html).
plot_logit()
In [15]:
# Delegate to clean_data.explain_model for `reg`.
# NOTE(review): what it displays is not visible from this notebook - see clean_data.
explain_model(reg)
In [27]:
# Actual-vs-predicted scatter on a random sample of 1000 rows (also writes scores.html).
plot_scores(reg, 1000)
In [34]:
# Histogram of prediction errors for `reg` (also writes residuals.html).
plot_residuals(reg)
In [35]:
# Synthetic noisy-linear data, points only (writes overfitting.html).
plot_overfitting()
In [36]:
# Same seeded data with the degree-25 polynomial fit overlaid.
# Gotcha: this writes overfitting.html too, replacing the points-only version.
plot_overfitting(line=True)
In [20]:
# Winner prediction for North Carolina vs. Connecticut using `clf`
# (the same call already appears in an earlier cell).
predict_winner(clf, 'North Carolina', 'Connecticut')
Out[20]:
Instead of the linear model $$ \mathbf{y} = X\mathbf{w}, $$
we apply the sigmoid function $\sigma$ to the linear predictor:
$$ \mathbf{y} = \sigma \left(X\mathbf{w}\right) $$
In [42]:
# Draw the sigmoid (expit) curve (also writes sigmoid.html).
plot_sigmoid()
In [38]:
# O-ring scatter with the default line=False (also writes orings.html).
plot_orings()
In [ ]: