In [1]:
import numpy as np
import pandas as pd
# render matplotlib (and hence pymc3) plots inline in Jupyter
%matplotlib inline
import pymc3 as pm
import matplotlib.pyplot as plt
plt.style.use('ggplot')
In [2]:
data_df = pd.read_csv('../datasets/TherapeuticTouchData.csv')
data_df.head(5)
Out[2]:
In [3]:
data = data_df.groupby('s')['y'].apply(list)
data
Out[3]:
In [4]:
with pm.Model() as model:
    # define priors on the top-level parameters
    # (no likelihood yet: this cell just samples the hyperpriors themselves)
    omega = pm.Beta('omega', 1, 1, transform=None)
    # shape/rate give the gamma prior mode 1 and sd 10 (derived at the end of this notebook)
    kappa = pm.Gamma('kappa', 1.105125, 0.1051249, transform=None)
    step = pm.Metropolis()
    trace = pm.sample(500, step=step, njobs=4, progressbar=False)  # burn-in
    trace = pm.sample(50000, step=step, start=trace[-1], njobs=4)
In [5]:
axs = pm.traceplot(trace, figsize=(10,4), varnames=['omega', 'kappa'])
In [6]:
axs = pm.plot_posterior(trace, figsize=(10,4), color='cornflowerblue', varnames=['omega', 'kappa'])
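For reference, the hierarchical model sampled in the next cell can be written out explicitly (this summary is read directly off the code below):

$$\omega \sim \mathrm{Beta}(1, 1), \qquad \kappa \sim \mathrm{Gamma}(1.105125,\ 0.1051249)$$
$$\theta_s \sim \mathrm{Beta}\big(\omega\kappa + 1,\ (1-\omega)\kappa + 1\big), \qquad z_s \sim \mathrm{Binomial}(N_s, \theta_s)$$

With this parameterization each subject's prior on $\theta_s$ has mode $\omega$, since $(\alpha-1)/(\alpha+\beta-2) = \omega\kappa/\kappa = \omega$, and $\kappa$ controls how tightly the subjects' $\theta_s$ cluster around that shared mode.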
In [7]:
z_observ = data.apply(sum).values  # number of correct guesses per subject
N_observ = data.apply(len).values  # number of trials per subject
with pm.Model() as model:
    # define priors on the top-level parameters
    omega = pm.Beta('omega', 1, 1, transform=None)
    kappa = pm.Gamma('kappa', 1.105125, 0.1051249, transform=None)
    # parameters of the Beta distribution (the prior on each subject's theta)
    alpha = omega*kappa + 1
    beta = (1-omega)*kappa + 1
    # define the prior on theta
    theta = pm.Beta('theta', alpha=alpha, beta=beta, shape=len(z_observ), transform=None)
    # define the likelihood
    z = pm.Binomial('z', n=N_observ, p=theta, observed=z_observ)
    # use Metropolis sampling: it is the fastest of the three options tried here
    step = pm.Metropolis(tune_interval=1000)
    # step = pm.Slice()
    # step = None
    trace = pm.sample(2000, step=step, njobs=4)  # burn-in
    trace = pm.sample(200000, step=step, start=trace[-1], njobs=4)
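Metropolis chains of this length can still mix poorly, so it is worth checking the standard diagnostics before trusting the plots below. A minimal check, assuming this PyMC3 version still exposes the classic gelman_rubin and effective_n helpers (later releases renamed them to rhat and ess):

# potential scale reduction: values close to 1 suggest the four chains agree
print(pm.gelman_rubin(trace))
# effective number of independent draws per parameter
print(pm.effective_n(trace))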
In [8]:
axs = pm.traceplot(trace, figsize=(10,4), varnames=['omega', 'kappa'])
In [9]:
axs = pm.plot_posterior(trace, figsize=(10,4), color='cornflowerblue', varnames=['omega', 'kappa'])
In [10]:
trace_df = pm.trace_to_dataframe(trace)
trace_df = trace_df[['theta__0', 'theta__13', 'theta__27']]
axs = pd.scatter_matrix(trace_df, figsize=(10,10), diagonal='hist',
                        hist_kwds={'bins':30, 'normed':True, 'color':'cornflowerblue'});
# mark each subject's observed proportion of correct guesses in red
ax = axs[0,0]
ax.vlines(np.array(data['S01']).mean(), ax.get_ylim()[0], ax.get_ylim()[1], color='r')
ax = axs[1,1]
ax.vlines(np.array(data['S14']).mean(), ax.get_ylim()[0], ax.get_ylim()[1], color='r')
ax = axs[2,2]
ax.vlines(np.array(data['S28']).mean(), ax.get_ylim()[0], ax.get_ylim()[1], color='r')
# identity lines on the off-diagonal scatter plots
axs[1,0].plot([0,1], [0,1], color='red', linestyle='--')
axs[2,0].plot([0,1], [0,1], color='red', linestyle='--')
axs[2,1].plot([0,1], [0,1], color='red', linestyle='--')
plt.show()
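To put a number on the dependence (or lack of it) visible in the off-diagonal panels, the correlation matrix of the same three columns can be printed directly (a small addition on top of the existing trace_df):

# pairwise posterior correlations between the three selected thetas
print(trace_df.corr().round(3))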
In [11]:
f, axs = plt.subplots(1, 3, figsize=(15,5))
ax = axs[0]
pm.plot_posterior(trace_df['theta__0'].values - trace_df['theta__13'].values,
                  ax=ax, color='cornflowerblue')
ax.vlines(0, ax.get_ylim()[0], ax.get_ylim()[1], color='r')
ax.set_title('theta[0]-theta[13]')
ax = axs[1]
pm.plot_posterior(trace_df['theta__0'].values - trace_df['theta__27'].values,
                  ax=ax, color='cornflowerblue')
ax.vlines(0, ax.get_ylim()[0], ax.get_ylim()[1], color='r')
ax.set_title('theta[0]-theta[27]')
ax = axs[2]
pm.plot_posterior(trace_df['theta__13'].values - trace_df['theta__27'].values,
                  ax=ax, color='cornflowerblue')
ax.vlines(0, ax.get_ylim()[0], ax.get_ylim()[1], color='r')
ax.set_title('theta[13]-theta[27]')
plt.show()
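The same comparisons can also be summarized as posterior probabilities that one subject's theta exceeds another's, computed straight from the posterior draws (a small addition; the column names are the same as in trace_df above):

# fraction of draws in which the first theta exceeds the second
print((trace_df['theta__0'] > trace_df['theta__13']).mean())
print((trace_df['theta__0'] > trace_df['theta__27']).mean())
print((trace_df['theta__13'] > trace_df['theta__27']).mean())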
In [12]:
data_df = pd.read_csv('../datasets/BattingAverage.csv')
data_df.head(5)
Out[12]:
In [13]:
print('number of players: %d' % data_df.shape[0])
In [14]:
# pymc3 can't handle the full dataset, so reduce it to a handful of players
data_df = data_df.head(5)
# convert categorical positions to integer codes, e.g.
# ['Catcher', 'Pitcher', 'Catcher'] -> positions=['Catcher', 'Pitcher'], encoding=[0, 1, 0]
positions, pos_encoding = np.unique(data_df['PriPos'], return_inverse=True)
data_df['PriPos'] = pos_encoding
with pm.Model() as model:
    # define priors on the top-level parameters
    omegaO = pm.Beta('omegaO', 1, 1, transform=None)
    kappaO = pm.Gamma('kappaO', 0.01, 0.01, transform=None)
    # define the categories' variables
    alphaO = omegaO*kappaO + 1
    betaO = (1-omegaO)*kappaO + 1
    n_cat = len(data_df['PriPos'].unique())
    omega = pm.Beta('omega', alphaO, betaO, shape=n_cat, transform=None)
    kappa = pm.Gamma('kappa', 0.01, 0.01, shape=n_cat, transform=None)
    # define the subjects' variables, one per player
    for i, row in data_df[['PriPos', 'Hits', 'AtBats']].iterrows():
        s = row['PriPos']
        N_observ = row['AtBats']
        z_observ = row['Hits']
        alpha = omega[s]*kappa[s] + 1
        beta = (1-omega[s])*kappa[s] + 1
        theta_name = 'theta_%d' % (i+1)
        theta = pm.Beta(theta_name, alpha, beta, transform=None)
        z_name = 'z_%d' % (i+1)
        z = pm.Binomial(z_name, n=N_observ, p=theta, observed=z_observ)
    # use Metropolis sampling: it is the fastest of the three options tried here
    step = pm.Metropolis(tune_interval=1000)
    # step = pm.Slice()
    # step = None
    trace = pm.sample(2000, step=step, njobs=4)  # burn-in
    trace = pm.sample(50000, step=step, start=trace[-1], njobs=4)
In [15]:
axs = pm.traceplot(trace, figsize=(10,4), varnames=['omegaO', 'kappaO'])
^^ It does not look like PyMC3 managed to converge the chains.
Note: unfortunately, PyMC3 could not handle even this drastically reduced dataset of baseball players. The sections below implement the same model in PyStan.
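Part of the problem is that the loop above registers one named variable per player, and PyMC3 copes badly with thousands of individual random variables. A vectorized formulation, sketched below, would normally scale much better; this is my rewrite and was not run here, and the PyStan model that follows is what the notebook actually uses:

with pm.Model() as vec_model:
    # assumes data_df holds the full dataset with PriPos already integer-encoded
    n_cat = len(data_df['PriPos'].unique())
    omegaO = pm.Beta('omegaO', 1, 1)
    kappaO = pm.Gamma('kappaO', 0.01, 0.01)
    # one shaped variable per level of the hierarchy instead of a Python loop
    omega = pm.Beta('omega', omegaO*kappaO + 1, (1-omegaO)*kappaO + 1, shape=n_cat)
    kappa = pm.Gamma('kappa', 0.01, 0.01, shape=n_cat)
    cat = data_df['PriPos'].values  # integer position code per player
    theta = pm.Beta('theta', omega[cat]*kappa[cat] + 1,
                    (1-omega[cat])*kappa[cat] + 1, shape=data_df.shape[0])
    z = pm.Binomial('z', n=data_df['AtBats'].values, p=theta,
                    observed=data_df['Hits'].values)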
In [16]:
import pystan as ps
import seaborn as sns
from dbda2e_utils import plotPost
In [17]:
# reload data
data_df = pd.read_csv('../datasets/BattingAverage.csv')
# convert categorical positions to numerical values
positions, pos_encoding = np.unique(data_df['PriPos'], return_inverse=True)
data_df['PriPos'] = pos_encoding
In [18]:
model_code = """
data {
    int<lower=0> n_cat;
    int<lower=0> n_subj;
    int<lower=0> categories[n_subj];
    int<lower=0> observations[n_subj];
    int<lower=0> successes[n_subj];
}
parameters {
    real<lower=0,upper=1> omegaO;
    real<lower=0.01> kappaO;
    real<lower=0,upper=1> omegas[n_cat];
    real<lower=0.01> kappas[n_cat];
    real<lower=0,upper=1> thetas[n_subj];
}
transformed parameters {
    real alphaO;
    real betaO;
    real alphas[n_cat];
    real betas[n_cat];
    alphaO = omegaO*kappaO + 1;
    betaO = (1-omegaO)*kappaO + 1;
    for (c in 1:n_cat) {
        alphas[c] = omegas[c]*kappas[c] + 1;
        betas[c] = (1-omegas[c])*kappas[c] + 1;
    }
}
model {
    int cat;
    omegaO ~ beta(1, 1);
    kappaO ~ gamma(0.01, 0.01);
    for (c in 1:n_cat) {
        omegas[c] ~ beta(alphaO, betaO);
        kappas[c] ~ gamma(0.01, 0.01);
    }
    for (s in 1:n_subj) {
        cat = categories[s] + 1;  // Stan indexing starts at 1
        thetas[s] ~ beta(alphas[cat], betas[cat]);
        successes[s] ~ binomial(observations[s], thetas[s]);
    }
}
"""
data = {
    'n_cat': len(data_df['PriPos'].unique()),
    'n_subj': data_df.shape[0],
    'categories': data_df['PriPos'].values,
    'observations': data_df['AtBats'].values,
    'successes': data_df['Hits'].values
}
fit = ps.stan(model_code=model_code, data=data, iter=11000, warmup=1000, chains=4)
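Before reading anything off these draws it is worth looking at Stan's own convergence summary. A minimal check, assuming the usual PyStan 2 interface (fit.summary with a pars argument; printing fit directly also works but lists every theta):

# restrict the summary to the top-level parameters and show R-hat / n_eff
summary = fit.summary(pars=['omegaO', 'kappaO'])
summary_df = pd.DataFrame(summary['summary'],
                          index=summary['summary_rownames'],
                          columns=summary['summary_colnames'])
print(summary_df[['mean', 'n_eff', 'Rhat']])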
In [19]:
samples = fit.extract(permuted=False, inc_warmup=False)
samples.shape  # (iterations, chains, flat parameters + lp__)
Out[19]:
In [20]:
# check MCMC convergence by overlaying the per-chain marginal densities
f, ax = plt.subplots(1, 2, figsize=(10,5))
for chain_id in range(4):
    smpl = samples[:, chain_id, fit.flatnames.index('omegaO')]
    label = 'chain %d' % chain_id
    sns.kdeplot(smpl, label=label, ax=ax[0])
ax[0].set_title('omegaO chains')
for chain_id in range(4):
    smpl = samples[:, chain_id, fit.flatnames.index('kappaO')]
    label = 'chain %d' % chain_id
    sns.kdeplot(smpl, label=label, ax=ax[1])
ax[1].set_title('kappaO chains')
plt.legend()
plt.show()
In [21]:
# convert to a DataFrame
param_samples = samples[:, :, :-1]  # drop the log-probability (last entry on the parameter dimension)
n_samples, n_chains, n_params = param_samples.shape
param_samples = param_samples.reshape((n_samples*n_chains, n_params))  # join all chains
samples_df = pd.DataFrame(param_samples, columns=fit.flatnames)
In [22]:
def plot_mcmc_posterior(data, titles, xlabels):
    points1, points2 = data
    title1, title2 = titles
    xlabel1, xlabel2 = xlabels
    f, axs = plt.subplots(2, 2, figsize=(10,10))
    # marginal posterior for param1
    plotPost(points1, axs[0,0], title=title1, xlabel=xlabel1)
    # marginal posterior for param1 - param2
    ax = axs[0,1]
    title = '%s - %s' % (title1, title2)
    xlabel = '%s - %s' % (xlabel1, xlabel2)
    plotPost(points1-points2, ax, title=title, xlabel=xlabel)
    ax.vlines(0, ax.get_ylim()[0], ax.get_ylim()[1], color='r', linestyle='--')
    # marginal posterior for param2
    plotPost(points2, axs[1,1], title=title2, xlabel=xlabel2)
    # joint scatter of param1 vs param2 with the identity line for reference
    ax = axs[1,0]
    ax.scatter(points1, points2)
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()
    ax.plot([0,1], [0,1], color='red', linestyle='--')
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    ax.set_xlabel(title1)
    ax.set_ylabel(title2)
    return
In [23]:
def analyze_pairwise(param1, param2, param_type):
    if param_type == 'positions':
        param_name1 = '%s[%d]' % ('omegas', positions.tolist().index(param1))
        param_name2 = '%s[%d]' % ('omegas', positions.tolist().index(param2))
    else:
        # players
        param_name1 = '%s[%d]' % ('thetas', data_df['Player'].tolist().index(param1))
        param_name2 = '%s[%d]' % ('thetas', data_df['Player'].tolist().index(param2))
    plot_mcmc_posterior(data=[samples_df[param_name1], samples_df[param_name2]],
                        titles=[param1, param2],
                        xlabels=[param_name1, param_name2])
In [24]:
analyze_pairwise('Pitcher', 'Catcher', 'positions')
In [25]:
analyze_pairwise('Catcher', '1st Base', 'positions')
In [26]:
analyze_pairwise('Kyle Blanks', 'Bruce Chen', 'players')
In [27]:
analyze_pairwise('ShinSoo Choo', 'Ichiro Suzuki', 'players')
In [28]:
analyze_pairwise('Mike Leake', 'Wandy Rodriguez', 'players')
In [29]:
analyze_pairwise('Andrew McCutchen', 'Brett Jackson', 'players')
In [30]:
mu = 1    # target mean
omega = 1 # target mode
std = 10  # target standard deviation
In [31]:
# gamma shape and rate from mean and sd: shape = mean^2/sd^2, rate = mean/sd^2
shape1 = mu * mu / (std * std)
rate1 = mu / (std * std)
shape1, rate1
Out[31]:
In [32]:
# gamma shape and rate from mode and sd
rate2 = (omega + np.sqrt(omega * omega + 4 * std * std)) / (2 * std * std)
shape2 = 1 + omega * rate2
shape2, rate2
Out[32]:
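As a quick sanity check on both parameterizations, scipy can confirm the intended summaries: Gamma(shape1, rate1) should have mean 1 and sd 10, while Gamma(shape2, rate2) should have mode 1 and sd 10 (the mode of a gamma is (shape-1)/rate for shape > 1):

from scipy.stats import gamma as gammad
g1 = gammad(shape1, scale=1.0/rate1)  # scipy parameterizes by shape and scale = 1/rate
print(g1.mean(), g1.std())            # expect approximately (1.0, 10.0)
g2 = gammad(shape2, scale=1.0/rate2)
print((shape2 - 1) / rate2, g2.std()) # expect approximately (1.0, 10.0)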
In [33]:
from scipy.stats import gamma as gammad

def gamma_(x, shape, rate):
    # scipy parameterizes the gamma by shape and scale, with scale = 1/rate
    rv = gammad(shape, scale=1.0/rate)
    return rv.pdf(x)
In [34]:
f, axs = plt.subplots(2, 1, figsize=(5,5))
x = np.linspace(0, 90, num=100)
# top panel: the two gamma priors at full scale
ax = axs[0]
ax.plot(x, gamma_(x, shape1, rate1))
ax.plot(x, gamma_(x, shape2, rate2))
# bottom panel: the same curves with the y-axis clipped to show the tails
ax = axs[1]
ax.plot(x, gamma_(x, shape1, rate1))
ax.plot(x, gamma_(x, shape2, rate2))
ax.set_ylim([0,0.001])
plt.show()
In [35]:
data_df = pd.read_csv('../datasets/TherapeuticTouchData.csv')
data = data_df.groupby('s')['y'].apply(list)
z_observ = data.apply(sum).values
N_observ = data.apply(len).values
with pm.Model() as model:
    # define priors on the top-level parameters
    omega = pm.Beta('omega', 1, 1, transform=None)
    # gamma prior with mean 1 and sd 10
    kappa = pm.Gamma('kappa', shape1, rate1, transform=None)
    # parameters of the Beta distribution (the prior on each subject's theta)
    alpha = omega*kappa + 1
    beta = (1-omega)*kappa + 1
    # define the prior on theta
    theta = pm.Beta('theta', alpha=alpha, beta=beta, shape=len(z_observ), transform=None)
    # define the likelihood
    z = pm.Binomial('z', n=N_observ, p=theta, observed=z_observ)
    step = None  # let pymc3 choose the sampler
    trace = pm.sample(2000, step=step, njobs=4)  # burn-in
    trace = pm.sample(20000, step=step, start=trace[-1], njobs=4)
In [36]:
axs = pm.plot_posterior(trace, figsize=(10,4), color='cornflowerblue', varnames=['omega', 'kappa'])
axs[1].set_xlim([0,250])
Out[36]:
In [37]:
data_df = pd.read_csv('../datasets/TherapeuticTouchData.csv')
data = data_df.groupby('s')['y'].apply(list)
z_observ = data.apply(sum).values
N_observ = data.apply(len).values
with pm.Model() as model:
    # define priors on the top-level parameters
    omega = pm.Beta('omega', 1, 1, transform=None)
    # gamma prior with mode 1 and sd 10
    kappa = pm.Gamma('kappa', shape2, rate2, transform=None)
    # parameters of the Beta distribution (the prior on each subject's theta)
    alpha = omega*kappa + 1
    beta = (1-omega)*kappa + 1
    # define the prior on theta
    theta = pm.Beta('theta', alpha=alpha, beta=beta, shape=len(z_observ), transform=None)
    # define the likelihood
    z = pm.Binomial('z', n=N_observ, p=theta, observed=z_observ)
    step = None  # let pymc3 choose the sampler
    trace = pm.sample(2000, step=step, njobs=4)  # burn-in
    trace = pm.sample(20000, step=step, start=trace[-1], njobs=4)
In [38]:
axs = pm.plot_posterior(trace, figsize=(10,4), color='cornflowerblue', varnames=['omega', 'kappa'])
axs[1].set_xlim([0,100])
Out[38]:
In [39]:
data_df = pd.read_csv('../datasets/TherapeuticTouchData.csv')
data = data_df.groupby('s')['y'].apply(list)
z_observ = data.apply(sum).values
N_observ = data.apply(len).values
with pm.Model() as model:
    # define priors on the top-level parameters
    omega = pm.Beta('omega', 1, 1, transform=None)
    # gamma prior with mean 1 and sd 10
    kappa = pm.Gamma('kappa', shape1, rate1, transform=None)
    # parameters of the Beta distribution (the prior on each subject's theta)
    alpha = omega*kappa + 1
    beta = (1-omega)*kappa + 1
    # define the prior on theta
    theta = pm.Beta('theta', alpha=alpha, beta=beta, shape=len(z_observ), transform=None)
    # likelihood disabled: this cell samples theta from its prior
    # z = pm.Binomial('z', n=N_observ, p=theta, observed=z_observ)
    step = None
    trace = pm.sample(2000, step=step)  # burn-in
    trace = pm.sample(20000, step=step, start=trace[-1])
In [40]:
trace_df = pm.trace_to_dataframe(trace)
trace_df = trace_df[['theta__0', 'theta__13', 'theta__27']]
axs = pd.scatter_matrix(trace_df, figsize=(10,10), diagonal='hist',
                        hist_kwds={'bins':30, 'normed':True, 'color':'cornflowerblue'});
ax = axs[0,0]
ax.vlines(np.array(data['S01']).mean(), ax.get_ylim()[0], ax.get_ylim()[1], color='r')
ax = axs[1,1]
ax.vlines(np.array(data['S14']).mean(), ax.get_ylim()[0], ax.get_ylim()[1], color='r')
ax = axs[2,2]
ax.vlines(np.array(data['S28']).mean(), ax.get_ylim()[0], ax.get_ylim()[1], color='r')
axs[1,0].plot([0,1], [0,1], color='red', linestyle='--')
axs[2,0].plot([0,1], [0,1], color='red', linestyle='--')
axs[2,1].plot([0,1], [0,1], color='red', linestyle='--')
plt.show()
In [41]:
data_df = pd.read_csv('../datasets/TherapeuticTouchData.csv')
data = data_df.groupby('s')['y'].apply(list)
z_observ = data.apply(sum).values
N_observ = data.apply(len).values
with pm.Model() as model:
    # define priors on the top-level parameters
    omega = pm.Beta('omega', 1, 1, transform=None)
    # gamma prior with mode 1 and sd 10
    kappa = pm.Gamma('kappa', shape2, rate2, transform=None)
    # parameters of the Beta distribution (the prior on each subject's theta)
    alpha = omega*kappa + 1
    beta = (1-omega)*kappa + 1
    # define the prior on theta
    theta = pm.Beta('theta', alpha=alpha, beta=beta, shape=len(z_observ), transform=None)
    # likelihood disabled: this cell samples theta from its prior
    # z = pm.Binomial('z', n=N_observ, p=theta, observed=z_observ)
    step = None
    trace = pm.sample(2000, step=step)  # burn-in
    trace = pm.sample(20000, step=step, start=trace[-1])
In [42]:
trace_df = pm.trace_to_dataframe(trace)
trace_df = trace_df[['theta__0', 'theta__13', 'theta__27']]
axs = pd.scatter_matrix(trace_df, figsize=(10,10), diagonal='hist',
                        hist_kwds={'bins':30, 'normed':True, 'color':'cornflowerblue'});
ax = axs[0,0]
ax.vlines(np.array(data['S01']).mean(), ax.get_ylim()[0], ax.get_ylim()[1], color='r')
ax = axs[1,1]
ax.vlines(np.array(data['S14']).mean(), ax.get_ylim()[0], ax.get_ylim()[1], color='r')
ax = axs[2,2]
ax.vlines(np.array(data['S28']).mean(), ax.get_ylim()[0], ax.get_ylim()[1], color='r')
axs[1,0].plot([0,1], [0,1], color='red', linestyle='--')
axs[2,0].plot([0,1], [0,1], color='red', linestyle='--')
axs[2,1].plot([0,1], [0,1], color='red', linestyle='--')
plt.show()
In [ ]: