In [31]:
from __future__ import division
import os
import sys
import glob
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline
%precision 4
plt.style.use('ggplot')
In [32]:
np.random.seed(1234)
import pystan
import scipy.stats as stats
In [33]:
import warnings
warnings.simplefilter('ignore')
Install `PyStan` with `pip install pystan`.
The nice thing about `PyMC` is that everything is in Python. With `PyStan`, you instead need to use a domain-specific language based on C++ syntax to specify the model and the data, which is less flexible and more work. In exchange, however, you get an extremely powerful HMC package (only does HMC) that can be used in R and Python.
In [34]:
# Stan program for the coin example: a Beta(2, 2) prior on the heads
# probability p and a binomial likelihood for y heads out of n tosses.
coin_code = """
data {
int<lower=0> n; // number of tosses
int<lower=0> y; // number of heads
}
transformed data {}
parameters {
real<lower=0, upper=1> p;
}
transformed parameters {}
model {
p ~ beta(2, 2);
y ~ binomial(n, p);
}
generated quantities {}
"""

# Observed data, keyed by the names declared in the Stan `data` block.
coin_dat = dict(n=100, y=61)
In [35]:
# Build a reusable StanModel from the coin program; the cells below call
# optimizing() and sampling() on this one object.
sm = pystan.StanModel(model_code=coin_code)
In [36]:
# Point estimate from the model's optimizing() routine on the coin data
# (presumably the posterior mode -- confirm against the PyStan docs).
# The bare `op` on the last line displays the result.
op = sm.optimizing(data=coin_dat)
op
Out[36]:
In [37]:
# Sample from the coin model; no sampler options are passed, so PyStan's
# defaults for chains and iterations apply.
fit = sm.sampling(data=coin_dat)
In [38]:
print(fit)
In [39]:
# Pull the draws out of the fit as a dict-like object and list its keys.
coin_dict = fit.extract()
coin_dict.keys()
# lp__ (two trailing underscores) is the log posterior
Out[39]:
In [40]:
# Tabulate the extracted draws and show the first three rows.
# NOTE: the name `df` is rebound later to the HtWt.csv data.
df = pd.DataFrame(coin_dict)
df.head(3)
Out[40]:
In [41]:
# Plot the fit's draws for p; the trailing `;` suppresses the cell's repr
# output, and tight_layout() tidies the spacing of the figure.
fit.plot('p');
plt.tight_layout()
In [42]:
# Stan program for i.i.d. normal observations. mu and sigma carry no explicit
# prior statements; they are only restricted by their declared bounds.
norm_code = """
data {
int<lower=0> n;
real y[n];
}
transformed data {}
parameters {
real<lower=0, upper=100> mu;
real<lower=0, upper=10> sigma;
}
transformed parameters {}
model {
y ~ normal(mu, sigma);
}
generated quantities {}
"""

# Simulated data: 100 draws from a normal with mean 10 and sd 2.
norm_dat = dict(n=100, y=np.random.normal(10, 2, 100))
# One-call interface: model code and data go in together. Runs a single chain
# of 1000 iterations. Rebinds `fit`, which previously held the coin-model fit.
fit = pystan.stan(model_code=norm_code, data=norm_dat, iter=1000, chains=1)
In [43]:
fit
Out[43]:
In [44]:
# Draws keyed by parameter name; the next cell reads trace['mu'] and trace['sigma'].
trace = fit.extract()
In [45]:
# Side-by-side step histograms (25 bins each) of the draws for mu and sigma.
plt.figure(figsize=(10,4))
for panel, param in enumerate(['mu', 'sigma'], 1):
    plt.subplot(1, 2, panel)
    plt.hist(trace[param][:], 25, histtype='step')
In [46]:
# Build a StanModel for the normal program (rebinding `sm`, which previously
# held the coin model) and compute a point estimate with optimizing() on the
# simulated data. The bare `op` displays the result.
sm = pystan.StanModel(model_code=norm_code)
op = sm.optimizing(data=norm_dat)
op
Out[46]:
In [47]:
# A second simulated data set with the same layout as norm_dat:
# 100 fresh draws from a normal with mean 10 and sd 2.
new_dat = dict(n=100, y=np.random.normal(10, 2, 100))
In [48]:
# No model_code is given here: passing the earlier `fit` lets PyStan reuse the
# model behind that fit for the new data (see the `fit` argument of
# pystan.stan), which avoids recompiling.
fit2 = pystan.stan(fit=fit, data=new_dat, chains=1)
In [49]:
fit2
Out[49]:
In [50]:
def save(obj, filename):
    """Pickle ``obj`` into the file ``filename`` (opened in binary write mode).

    The highest pickle protocol available is used. In this notebook the
    function stores compiled models so they can be reused.
    """
    import pickle
    with open(filename, 'wb') as sink:
        pickle.Pickler(sink, pickle.HIGHEST_PROTOCOL).dump(obj)
def load(filename):
    """Reload compiled models for reuse.

    Parameters
    ----------
    filename : str
        Path of a pickle file, e.g. one written by ``save``.

    Returns
    -------
    The unpickled object.

    Notes
    -----
    Unpickling can execute arbitrary code, so only load files you created
    yourself or otherwise trust.
    """
    import pickle
    # Use a context manager so the handle is closed even if unpickling fails;
    # the bare open() used previously left the file open.
    with open(filename, 'rb') as f:
        return pickle.load(f)
In [51]:
# Build the normal model once and pickle it to 'norm_model.pic' with save(),
# so it can be reloaded later instead of being rebuilt.
model = pystan.StanModel(model_code=norm_code)
save(model, 'norm_model.pic')
In [52]:
# Reload the pickled model and sample from it with the second data set,
# using a single chain. The bare `fit4` displays the resulting fit.
new_model = load('norm_model.pic')
fit4 = new_model.sampling(new_dat, chains=1)
fit4
Out[52]:
We will show how to estimate regression parameters using a simple linear model

$$ y \sim ax + b $$

We can restate the linear model $y = ax + b + \epsilon$ as sampling from a probability distribution

$$ y \sim \mathcal{N}(ax + b, \sigma^2) $$

We will assume the following priors

$$ \begin{aligned} a &\sim \mathcal{N}(0, 100) \\ b &\sim \mathcal{N}(0, 100) \\ \sigma &\sim \mathcal{U}(0, 20) \end{aligned} $$
In [53]:
# Stan program for the linear regression y ~ N(a*x + b, sigma).
#
# Two fixes relative to the earlier version of this cell:
#   * sigma is declared with bounds matching its uniform(0, 20) prior. Stan
#     expects a parameter's declared support to match the support of its
#     prior; without the bounds the sampler can propose values outside
#     (0, 20), including negative scales.
#   * a and b get explicit N(0, 100) priors. The likelihood is written with a
#     variance (sigma^2), so 100 is read as a variance; Stan's normal() takes
#     a standard deviation, hence normal(0, 10).
lin_reg_code = """
data {
int<lower=0> n;
real x[n];
real y[n];
}
transformed data {}
parameters {
real a;
real b;
real<lower=0, upper=20> sigma; // bounds match the uniform(0, 20) prior
}
transformed parameters {
real mu[n];
for (i in 1:n) {
mu[i] <- a*x[i] + b;
}
}
model {
a ~ normal(0, 10); // variance 100
b ~ normal(0, 10); // variance 100
sigma ~ uniform(0, 20);
y ~ normal(mu, sigma);
}
generated quantities {}
"""

# Simulate n points on [0, 1] from the line y = _a*x + _b with unit-variance
# Gaussian noise; _a and _b are the true slope and intercept to be recovered.
n = 11
_a = 6
_b = 2
x = np.linspace(0, 1, n)
y = _a*x + _b + np.random.randn(n)

# Data dict keyed by the names declared in the Stan `data` block.
lin_reg_dat = {
    'n': n,
    'x': x,
    'y': y
}
# Fit the regression model in one call: a single chain of 1000 iterations.
# Rebinds `fit`, which previously held the normal-model fit.
fit = pystan.stan(model_code=lin_reg_code, data=lin_reg_dat, iter=1000, chains=1)
In [54]:
fit
Out[54]:
In [55]:
# Plot only the slope `a` and intercept `b`; the per-observation mu[i]
# values declared in the model are left out of the figure.
fit.plot(['a', 'b']);
plt.tight_layout()
In [56]:
# observed data
df = pd.read_csv('HtWt.csv')
df.head()
Out[56]:
In [57]:
# Stan program for a logistic regression of `male` on weight and height.
#
# The likelihood uses bernoulli_logit(eta) instead of
# bernoulli(inv_logit(eta)). Both define the same model, but the logit form
# works on the log-odds scale and avoids inv_logit saturating to exactly 0 or
# 1 when the linear predictor is large, which it easily is with raw
# weight/height values and N(0, 10) coefficients.
# `male` is also declared as a 0/1 indicator so bad input is rejected when the
# data is read rather than inside the likelihood.
log_reg_code = """
data {
int<lower=0> n;
int<lower=0, upper=1> male[n];
real weight[n];
real height[n];
}
transformed data {}
parameters {
real a;
real b;
real c;
}
transformed parameters {}
model {
a ~ normal(0, 10);
b ~ normal(0, 10);
c ~ normal(0, 10);
for(i in 1:n) {
male[i] ~ bernoulli_logit(a*weight[i] + b*height[i] + c);
}
}
generated quantities {}
"""

# Data dict keyed by the names declared in the Stan `data` block; the columns
# come from the HtWt data frame loaded in the previous cell.
log_reg_dat = {
    'n': len(df),
    'male': df.male,
    'height': df.height,
    'weight': df.weight
}

# Four chains of 1000 iterations each.
fit = pystan.stan(model_code=log_reg_code, data=log_reg_dat, iter=1000, chains=4)
In [58]:
fit
Out[58]:
In [59]:
# The top-level pd.scatter_matrix alias only exists in old pandas releases;
# newer ones expose the function as pandas.plotting.scatter_matrix. Resolve
# whichever is available so the cell runs on both.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    scatter_matrix = pd.scatter_matrix

# Pairwise scatter plots of the draws for c, b and a, with a kernel density
# estimate of each parameter on the diagonal.
df_trace = pd.DataFrame(fit.extract(['c', 'b', 'a']))
scatter_matrix(df_trace, diagonal='kde');