```
In [23]:
```import matplotlib
%matplotlib inline
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.models import HoverTool
from bokeh.io import output_notebook, save
from clean_data import (get_models, predict, explain_model, LATEST_DATA as results_2016, get_df,
get_features, regression_target, get_training_data, predict_winner, predict_scores)
import numpy as np
import numpy.random as nr
from scipy.special import logit, expit
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
# Shared figure dimensions. WIDTH is passed as `width=` to the bokeh figures
# built by the plotting helpers below; HEIGHT is not referenced by any cell
# visible in this notebook.
WIDTH = 800
HEIGHT = 768

# Render floats in DataFrame output with thousands separators and one decimal.
pd.set_option('display.float_format', '{:,.1f}'.format)

# Route bokeh `show()` output into the notebook.
output_notebook()

```
```

```
In [2]:
```reg, clf = get_models(cv=False)  # reg feeds predict_scores/explain_model/plot_*; clf feeds predict_winner
reg_cv, clf_cv = get_models(cv=True)  # same pair requested with cv=True; presumably cross-validated variants -- confirm in clean_data

```
```

```
In [26]:
```def plot_scores(reg, n=1000):
sub = get_training_data(results_2016).sample(n=n)
df_in = get_features(sub)
preds = reg.predict(df_in)
sub['predicted'] = preds[:, 0]
sub['predicted_first'] = preds[:, 0].round().astype(int)
sub['predicted_second'] = preds[:, 1].round().astype(int)
source = ColumnDataSource(data=sub)
dot = figure(title="", tools="", toolbar_location=None, width=WIDTH, x_axis_label='Actual Score', y_axis_label='Predicted Score')
dot.circle(x='score_first', y='predicted', size=15, fill_alpha=0.3, source=source)
min_coord = max(sub.score_first.min(), sub.predicted.min())
max_coord = min(sub.score_first.max(), sub.predicted.max())
dot.segment(min_coord, min_coord, max_coord, max_coord, line_width=5, color='black', line_cap="round")
dot.xaxis.major_label_text_font_size = "20pt"
dot.yaxis.major_label_text_font_size = "20pt"
dot.xaxis.axis_label_text_font_size="20pt"
dot.yaxis.axis_label_text_font_size="20pt"
save(dot, 'scores.html')
show(dot)

```
In [28]:
```def plot_residuals(reg):
sub = get_training_data(results_2016)
df_in = get_features(sub)
preds = reg.predict(df_in)
sub['error'] = preds[:, 0] - sub['score_first']
hist, edges = np.histogram(sub.error, density=True, bins=50)
bars = figure(title="", tools="", toolbar_location=None, width=WIDTH, x_axis_label='Error', y_axis_label='')
bars.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
fill_color="#036564", line_color="#033649")
bars.xaxis.major_label_text_font_size = "20pt"
bars.xaxis.axis_label_text_font_size="20pt"
bars.yaxis.visible = False
save(bars, 'residuals.html')
show(bars)

```
In [40]:
```def plot_logit():
x = np.linspace(1e-3, 1-1e-3, 100)
y = logit(x)
p1 = figure(title="", tools="")
p1.line(x, y)
p1.segment(x, 0, x, y)
save(p1, 'logit.html')
show(p1)

```
In [39]:
```def plot_sigmoid():
x = np.linspace(-4, 4, 100)
y = expit(x)
p1 = figure(title="", tools="")
p1.line(x, y)
p1.segment(0, y, x, y)
save(p1, 'sigmoid.html')
show(p1)

```
In [31]:
```def plot_overfitting(line=False):
nr.seed(42)
x = nr.random(30)
y = 2 * x + 0.1 * nr.randn(x.shape[0])
dot = figure(title="", tools="", toolbar_location=None, width=WIDTH, x_axis_label='x', y_axis_label='y', x_range=(0, 1), y_range=(-0.5, 2.5))
dot.circle(x=x, y=y, size=15, fill_alpha=0.3)
if line:
pipe = make_pipeline(PolynomialFeatures(degree=25), LinearRegression(fit_intercept=False))
model = pipe.fit(np.atleast_2d(x).T, y)
new_x = np.linspace(0, 1, 1000)
preds = model.predict(np.atleast_2d(new_x).T)
dot.line(x=new_x, y=preds, color='red')
save(dot, 'overfitting.html')
show(dot)

```
In [32]:
```def plot_orings(line=False):
df = pd.read_csv('orings.csv')
df.Temperature += nr.randn(df.Temperature.size)
dot = figure(title="O-Ring Failures", tools="", toolbar_location=None, width=WIDTH,
x_axis_label='Temperature', y_axis_label='O-Ring Problems')
dot.circle(x='Temperature', y='Failure', size=15, fill_alpha=0.3, source=df)
save(dot, 'orings.html')
show(dot)

```
In [9]:
```get_df(2016).head()

```
Out[9]:
```

```
In [10]:
```predict(reg, clf, 'North Carolina', 'Connecticut')

```
```

- Two models:
  - **Classification**
  - **Regression**

```
In [11]:
```predict_winner(clf, 'North Carolina', 'Connecticut')

```
Out[11]:
```

```
In [12]:
```predict_scores(reg, 'North Carolina', 'Connecticut')

```
Out[12]:
```

- Transform raw data to features
- Train a model
- Measure how accurate you expect the model to be

```
In [13]:
```get_features(get_training_data(results_2016)).head()

```
Out[13]:
```

```
In [41]:
```plot_logit()

```
```

```
In [15]:
```explain_model(reg)

```
```

```
In [27]:
```plot_scores(reg, 1000)

```
```

```
In [34]:
```plot_residuals(reg)

```
```

```
In [35]:
```plot_overfitting()

```
```

```
In [36]:
```plot_overfitting(line=True)

```
```

```
In [20]:
```predict_winner(clf, 'North Carolina', 'Connecticut')

```
Out[20]:
```

Instead of $$ \mathbf{y} = X\mathbf{w}, $$

the classifier passes the linear output through the sigmoid function $\sigma$:

$$ \mathbf{y} = \sigma \left(X\mathbf{w}\right) $$```
In [42]:
```plot_sigmoid()

```
```

```
In [38]:
```plot_orings()

```
```

```
In [ ]:
```