In [1]:
# Configure Jupyter so figures appear in the notebook
%matplotlib inline
# Configure Jupyter to display the assigned value after an assignment
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'
import numpy as np
import pandas as pd
# import classes from thinkbayes2
from thinkbayes2 import Pmf, Cdf, Suite, Joint
import thinkplot
Here's a problem from Bayesian Methods for Hackers
On January 28, 1986, the twenty-fifth flight of the U.S. space shuttle program ended in disaster when one of the rocket boosters of the Shuttle Challenger exploded shortly after lift-off, killing all seven crew members. The presidential commission on the accident concluded that it was caused by the failure of an O-ring in a field joint on the rocket booster, and that this failure was due to a faulty design that made the O-ring unacceptably sensitive to a number of factors including outside temperature. Of the previous 24 flights, data were available on failures of O-rings on 23, (one was lost at sea), and these data were discussed on the evening preceding the Challenger launch, but unfortunately only the data corresponding to the 7 flights on which there was a damage incident were considered important and these were thought to show no obvious trend. The data are shown below (see 1):
In [2]:
# !wget https://raw.githubusercontent.com/CamDavidsonPilon/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers/master/Chapter2_MorePyMC/data/challenger_data.csv
In [3]:
columns = ['Date', 'Temperature', 'Incident']
df = pd.read_csv('challenger_data.csv', parse_dates=[0])
df.drop(labels=[3, 24], inplace=True)
df
Out[3]:
In [4]:
df['Incident'] = df['Damage Incident'].astype(float)
df
Out[4]:
In [5]:
import matplotlib.pyplot as plt
plt.scatter(df.Temperature, df.Incident, s=75, color="k",
alpha=0.5)
plt.yticks([0, 1])
plt.ylabel("Damage Incident?")
plt.xlabel("Outside temperature (Fahrenheit)")
plt.title("Defects of the Space Shuttle O-Rings vs temperature");
We can solve the problem first using a grid algorithm, with parameters b0
and b1
, and
$\mathrm{logit}(p) = b0 + b1 * T$
and each datum being a temperature T
and a boolean outcome fail
, which is true is there was damage and false otherwise.
Hint: the expit
function from scipy.special
computes the inverse of the logit
function.
In [6]:
from scipy.special import expit
class Logistic(Suite, Joint):
def Likelihood(self, data, hypo):
"""
data: T, fail
hypo: b0, b1
"""
return 1
In [7]:
# Solution
from scipy.special import expit
class Logistic(Suite, Joint):
def Likelihood(self, data, hypo):
"""
data: T, fail
hypo: b0, b1
"""
temp, fail = data
b0, b1 = hypo
log_odds = b0 + b1 * temp
p_fail = expit(log_odds)
if fail == 1:
return p_fail
elif fail == 0:
return 1-p_fail
else:
# NaN
return 1
In [8]:
b0 = np.linspace(0, 50, 101);
In [9]:
b1 = np.linspace(-1, 1, 101);
In [10]:
from itertools import product
hypos = product(b0, b1)
Out[10]:
In [11]:
suite = Logistic(hypos);
In [12]:
for data in zip(df.Temperature, df.Incident):
print(data)
suite.Update(data)
In [13]:
thinkplot.Pdf(suite.Marginal(0))
thinkplot.decorate(xlabel='Intercept',
ylabel='PMF',
title='Posterior marginal distribution')
In [14]:
thinkplot.Pdf(suite.Marginal(1))
thinkplot.decorate(xlabel='Log odds ratio',
ylabel='PMF',
title='Posterior marginal distribution')
According to the posterior distribution, what was the probability of damage when the shuttle launched at 31 degF?
In [15]:
# Solution
T = 31
total = 0
for hypo, p in suite.Items():
b0, b1 = hypo
log_odds = b0 + b1 * T
p_fail = expit(log_odds)
total += p * p_fail
total
Out[15]:
In [16]:
# Solution
pred = suite.Copy()
pred.Update((31, True))
Out[16]:
Implement this model using MCMC. As a starting place, you can use this example from the PyMC3 docs.
As a challege, try writing the model more explicitly, rather than using the GLM module.
In [17]:
from warnings import simplefilter
simplefilter('ignore', FutureWarning)
import pymc3 as pm
In [18]:
# Solution
with pm.Model() as model:
pm.glm.GLM.from_formula('Incident ~ Temperature', df,
family=pm.glm.families.Binomial())
start = pm.find_MAP()
trace = pm.sample(1000, start=start, tune=1000)
In [19]:
pm.traceplot(trace);
In [20]:
# Solution
with pm.Model() as model:
pm.glm.GLM.from_formula('Incident ~ Temperature', df,
family=pm.glm.families.Binomial())
trace = pm.sample(1000, tune=1000)
The posterior distributions for these parameters should be similar to what we got with the grid algorithm.
In [ ]: