Linear regression, from Chapter 4


In [3]:
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import seaborn as sns
import pymc3 as pm
import pandas as pd

%matplotlib inline
sns.set(font_scale=1.5)

In [4]:
np.random.seed(314)
N = 100
alfa_real = 2.5
beta_real = 0.9
eps_real = np.random.normal(0, 0.5, size=N)
x = np.random.normal(10, 1, N)
y_real = alfa_real + beta_real * x
y = y_real + eps_real
plt.figure(figsize=(10,5))
plt.subplot(1,2,1)
plt.plot(x, y, 'b.')
plt.xlabel('$x$', )
plt.ylabel('$y$', rotation=0)
plt.plot(x, y_real, 'k')
plt.subplot(1,2,2)
sns.distplot(y)
plt.xlabel('$y$')


Out[4]:
Text(0.5, 0, '$y$')

In [5]:
with pm.Model() as model:
    alpha = pm.Normal('alpha', mu=0, sd=10)
    beta = pm.Normal('beta', mu=0, sd=1)
    epsilon = pm.HalfCauchy('epsilon', 5)
    #mu = pm.Deterministic('mu', alpha + beta * x)
    #y_pred = pm.Normal('y_pred', mu=mu, sd=epsilon, observed=y)
    y_pred = pm.Normal('y_pred', mu= alpha + beta * x, sd=epsilon, observed=y)
    step = pm.Metropolis()
    trace = pm.sample(10000, step=step, chains=2)


Multiprocess sampling (2 chains in 4 jobs)
CompoundStep
>Metropolis: [epsilon]
>Metropolis: [beta]
>Metropolis: [alpha]
Sampling 2 chains, 0 divergences: 100%|██████████| 21000/21000 [00:05<00:00, 3780.18draws/s]
The estimated number of effective samples is smaller than 200 for some parameters.

In [6]:
pm.traceplot(trace, compact=True);


The slow wandering in the alpha and beta traces (strong autocorrelation, poor mixing) shows that the Metropolis sampler is struggling here.
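
To put numbers on the mixing problem, check the effective sample size and R-hat directly. A minimal sketch, assuming pm.ess and pm.rhat are available (they wrap the ArviZ diagnostics in recent PyMC3 releases; otherwise call arviz.ess / arviz.rhat on the trace):


In [ ]:
# Effective sample size and R-hat for the intercept and slope.
print(pm.ess(trace, var_names=['alpha', 'beta']))
print(pm.rhat(trace, var_names=['alpha', 'beta']))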


In [8]:
pm.summary(trace, var_names=['alpha', 'beta', 'epsilon'])


Out[8]:
            mean     sd  hpd_3%  hpd_97%  mcse_mean  mcse_sd  ess_mean  ess_sd  ess_bulk  ess_tail  r_hat
alpha      1.669  0.504   0.641    2.435      0.119    0.086      18.0    18.0      19.0      18.0   1.02
beta       0.981  0.049   0.908    1.084      0.012    0.008      18.0    18.0      19.0      19.0   1.02
epsilon    0.474  0.034   0.411    0.536      0.001    0.001    1336.0  1328.0    1356.0    1851.0   1.00

In [9]:
pm.plot_posterior(trace, var_names=['alpha', 'beta', 'epsilon']);



In [10]:
pm.autocorrplot(trace, var_names=['alpha', 'beta', 'epsilon']);



In [11]:
plt.figure(figsize=(10,6))
sns.kdeplot(trace['alpha'], trace['beta'])
plt.xlabel(r'$\alpha$', fontsize=16)
plt.ylabel(r'$\beta$', fontsize=16, rotation=0)


Out[11]:
Text(0, 0.5, '$\\beta$')

Does NUTS do better? (Note that the cell below still sets step = pm.Metropolis(), so it actually repeats the Metropolis run rather than switching to NUTS.)


In [12]:
with pm.Model() as model:
    alpha = pm.Normal('alpha', mu=0, sd=10)
    beta = pm.Normal('beta', mu=0, sd=1)
    epsilon = pm.HalfCauchy('epsilon', 5)
    #mu = pm.Deterministic('mu', alpha + beta * x)
    #y_pred = pm.Normal('y_pred', mu=mu, sd=epsilon, observed=y)
    y_pred = pm.Normal('y_pred', mu= alpha + beta * x, sd=epsilon, observed=y)
    step = pm.Metropolis()  # still Metropolis, not NUTS (see note above)
    trace = pm.sample(10000, step=step, chains=2)


Multiprocess sampling (2 chains in 4 jobs)
CompoundStep
>Metropolis: [epsilon]
>Metropolis: [beta]
>Metropolis: [alpha]
Sampling 2 chains, 0 divergences: 100%|██████████| 21000/21000 [00:05<00:00, 3873.19draws/s]
The rhat statistic is larger than 1.4 for some parameters. The sampler did not converge.
The estimated number of effective samples is smaller than 200 for some parameters.

No improvement: this run also used Metropolis rather than NUTS, and the r_hat warning shows it did not converge.
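
To actually try NUTS, drop the explicit Metropolis step and let PyMC3 auto-assign the sampler. A minimal sketch, re-using the model defined above (not run here):


In [ ]:
# Sketch: let PyMC3 auto-assign NUTS for the continuous parameters
# instead of forcing Metropolis.
with model:
    trace_nuts = pm.sample(2000, chains=2)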


In [13]:
pm.traceplot(trace, compact=True);



In [14]:
pm.autocorrplot(trace, var_names=['alpha', 'beta', 'epsilon']);



In [15]:
plt.figure(figsize=(10,6))
sns.kdeplot(trace['alpha'], trace['beta'])
plt.xlabel(r'$\alpha$', fontsize=16)
plt.ylabel(r'$\beta$', fontsize=16, rotation=0)


Out[15]:
Text(0, 0.5, '$\\beta$')

Try centering the data by subtracting the mean of x; this reduces the posterior correlation between alpha and beta, which should help the sampler mix.


In [16]:
xmean = np.mean(x)
with pm.Model() as model:
    alpha = pm.Normal('alpha', mu=0, sd=10)
    beta = pm.Normal('beta', mu=0, sd=1)
    epsilon = pm.HalfCauchy('epsilon', 5)
    #mu = pm.Deterministic('mu', alpha + beta * x)
    #y_pred = pm.Normal('y_pred', mu=mu, sd=epsilon, observed=y)
    y_pred = pm.Normal('y_pred', mu= alpha + beta * (x-xmean), sd=epsilon, observed=y)
    step = pm.Metropolis()
    trace = pm.sample(10000, step=step, chains=2)


Multiprocess sampling (2 chains in 4 jobs)
CompoundStep
>Metropolis: [epsilon]
>Metropolis: [beta]
>Metropolis: [alpha]
Sampling 2 chains, 0 divergences: 100%|██████████| 21000/21000 [00:05<00:00, 3918.58draws/s]
The number of effective samples is smaller than 10% for some parameters.

In [17]:
pm.traceplot(trace, compact=True);



In [18]:
pm.autocorrplot(trace, var_names=['alpha', 'beta', 'epsilon']);



In [19]:
plt.figure(figsize=(10,6))
sns.kdeplot(trace['alpha'], trace['beta'])
plt.xlabel(r'$\alpha$', fontsize=16)
plt.ylabel(r'$\beta$', fontsize=16, rotation=0)


Out[19]:
Text(0, 0.5, '$\\beta$')

In [20]:
# Plot the posterior-mean regression line on the original x scale.

plt.plot(x, y, 'b.')
alpha_m = trace['alpha'].mean()
beta_m = trace['beta'].mean()
plt.plot(x, alpha_m + beta_m * (x - xmean), c='k',
         label='y = {:.2f} + {:.2f} * (x - xmean)'.format(alpha_m, beta_m))
plt.xlabel('$x$', fontsize=16)
plt.ylabel('$y$', fontsize=16, rotation=0)
plt.legend(loc=2, fontsize=14)


Out[20]:
<matplotlib.legend.Legend at 0x7fd8925dd550>
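
A cleaner way to show the fit is the posterior-mean line plus an uncertainty band for mu. A minimal sketch, assuming pm.plot_hpd (available in recent PyMC3 releases; the credible_interval argument is renamed in newer ArviZ versions):


In [ ]:
# Posterior samples of mu = alpha + beta * (x - xmean), one row per draw.
mu_pp = trace['alpha'][:, None] + trace['beta'][:, None] * (x - xmean)
plt.plot(x, y, 'b.')
pm.plot_hpd(x, mu_pp, credible_interval=0.98, color='gray')
plt.plot(x, mu_pp.mean(axis=0), c='k')
plt.xlabel('$x$', fontsize=16)
plt.ylabel('$y$', fontsize=16, rotation=0)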

Now look at problematic data sets (outliers) and robust regression.

page 114+


In [21]:
ans = sns.load_dataset('anscombe')
x_3 = ans[ans.dataset == 'III']['x'].values
y_3 = ans[ans.dataset == 'III']['y'].values

In [22]:
X = np.arange(10)+np.random.normal(0, .1, size=10)
Y = 5*X + 10 + np.random.normal(0, 3, size=10)
Y[7] += 50

In [23]:
plt.scatter(X,Y)


Out[23]:
<matplotlib.collections.PathCollection at 0x7fd891fb97d0>

In [24]:
data = pd.DataFrame({'x':X, 'y':Y})
with pm.Model() as model:
    glm_n = pm.glm.GLM.from_formula('y ~ x', data)
    trace_n = pm.sample(2000) # draw 2000 posterior samples using NUTS sampling


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [sd, x, Intercept]
Sampling 4 chains, 0 divergences: 100%|██████████| 10000/10000 [00:04<00:00, 2033.19draws/s]

In [25]:
glm_n


Out[25]:
$$ \begin{array}{rcl} \text{Intercept} &\sim & \text{Flat}()\\\text{x} &\sim & \text{Normal}(\mathit{mu}=0.0,~\mathit{sigma}=1000.0)\\\text{sd} &\sim & \text{HalfCauchy}(\mathit{beta}=10.0)\\\text{y} &\sim & \text{Normal}(\mathit{mu}=f(f(array,~f(\text{Intercept},~\text{x})),~f()),~\mathit{sigma}=f(\text{sd})) \end{array} $$

In [26]:
pm.traceplot(trace_n);



In [27]:
ppc_n = pm.sample_posterior_predictive(trace_n, samples=500, model=model, )


/Users/balarsen/miniconda3/envs/python3/lib/python3.7/site-packages/pymc3/sampling.py:1247: UserWarning: samples parameter is smaller than nchains times ndraws, some draws and/or chains may not be represented in the returned posterior predictive sample
  "samples parameter is smaller than nchains times ndraws, some draws "
100%|██████████| 500/500 [00:01<00:00, 370.71it/s]

In [28]:
plt.scatter(X,Y)
perc_n = np.percentile(ppc_n['y'], [2.5, 97.5, 50], axis=0)
plt.fill_between(X, perc_n[0], perc_n[1], color='blue', alpha=0.2)
plt.plot(X, perc_n[2], lw=3)


Out[28]:
[<matplotlib.lines.Line2D at 0x7fd8b1e6b090>]

In [29]:
data = pd.DataFrame({'x':X, 'y':Y})
with pm.Model() as model:
    glm_t = pm.glm.GLM.from_formula('y ~ x', data, family=pm.families.StudentT())
    step = pm.Metropolis()
#     step = pm.NUTS()
    trace_t = pm.sample(10000, step=step, tune=5000)  # draw 10000 posterior samples per chain with Metropolis


Multiprocess sampling (4 chains in 4 jobs)
CompoundStep
>Metropolis: [lam]
>Metropolis: [x]
>Metropolis: [Intercept]
Sampling 4 chains, 0 divergences: 100%|██████████| 60000/60000 [00:13<00:00, 4391.11draws/s]
The number of effective samples is smaller than 10% for some parameters.

In [30]:
pm.traceplot(trace_t);



In [31]:
pm.summary(trace_t)


Out[31]:
             mean     sd  hpd_3%  hpd_97%  mcse_mean  mcse_sd  ess_mean  ess_sd  ess_bulk  ess_tail  r_hat
Intercept  11.429  1.181   9.134   13.483      0.027    0.019    1986.0  1986.0    2312.0    3199.0    1.0
x           4.756  0.377   4.114    5.391      0.008    0.006    2264.0  2187.0    2325.0    3270.0    1.0
lam         1.305  1.688   0.022    3.925      0.021    0.015    6544.0  6544.0    5239.0    6054.0    1.0

In [32]:
glm_t


Out[32]:
$$ \begin{array}{rcl} \text{Intercept} &\sim & \text{Flat}()\\\text{x} &\sim & \text{Normal}(\mathit{mu}=0.0,~\mathit{sigma}=1000.0)\\\text{lam} &\sim & \text{HalfCauchy}(\mathit{beta}=10.0)\\\text{y} &\sim & \text{StudentT}(\mathit{nu}=1.0,~\mathit{mu}=f(f(array,~f(\text{Intercept},~\text{x})),~f()),~\mathit{lam}=f(\text{lam})) \end{array} $$

In [33]:
fig, ax = plt.subplots(figsize=(10,5))

ppc_t = pm.sample_posterior_predictive(trace_t, samples=1000, model=model, )
perc_t = np.percentile(ppc_t['y'], [2.5, 97.5, 50], axis=0)
plt.fill_between(X, perc_t[0], perc_t[1], color='blue', alpha=0.2)
plt.fill_between(X, perc_n[0], perc_n[1], color='orange', alpha=0.2)

plt.plot(X, perc_t[2], lw=3, label='StudentT')
plt.plot(X, perc_n[2], lw=3, label='Normal', color='orange')
plt.scatter(X,Y, s=70)
plt.legend(loc='upper left')


/Users/balarsen/miniconda3/envs/python3/lib/python3.7/site-packages/pymc3/sampling.py:1247: UserWarning: samples parameter is smaller than nchains times ndraws, some draws and/or chains may not be represented in the returned posterior predictive sample
  "samples parameter is smaller than nchains times ndraws, some draws "
100%|██████████| 1000/1000 [00:01<00:00, 853.57it/s]
Out[33]:
<matplotlib.legend.Legend at 0x7fd8615e0990>

In [34]:
pm.plot_posterior_predictive_glm(trace_t, eval=X, samples=100)
pm.plot_posterior_predictive_glm(trace_n, eval=X, samples=100, color='b')

plt.scatter(X,Y, s=70)


Out[34]:
<matplotlib.collections.PathCollection at 0x7fd8615e0450>

Hierarchical linear regression

page 117


In [35]:
N = 20
M = 8
idx = np.repeat(range(M-1), N)
idx = np.append(idx, 7)
alpha_real = np.random.normal(2.5, 0.5, size=M)
beta_real = np.random.beta(60, 10, size=M)
eps_real = np.random.normal(0, 0.5, size=len(idx))
y_m = np.zeros(len(idx))
x_m = np.random.normal(10, 1, len(idx))
y_m = alpha_real[idx] + beta_real[idx] * x_m + eps_real

In [46]:
fig, ax = plt.subplots(2, 4, figsize=(12,5), sharex=True, sharey=True)
ax = ax.flatten()
j, k = 0, N
for i in range(M):
    ax[i].scatter(x_m[j:k], y_m[j:k])
#     ax[i].set_xlim(6, 15)
#     ax[i].set_ylim(7, 17)
    j += N
    k += N
ax[0].set_xlim((6,15))
ax[0].set_ylim((6,15))

plt.tight_layout()



In [47]:
# center the data
x_centered = x_m - x_m.mean()

First we are going to fit a non-hierarchical model, just as we have already seen. The only difference is that now we are going to include code to re-scale alpha to the original scale.
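
Since y = alpha_tmp + beta * (x - x_mean) = (alpha_tmp - beta * x_mean) + beta * x, the intercept on the original scale is alpha = alpha_tmp - beta * x_mean. A quick sanity check of this identity on the earlier centered simple regression (a sketch using the trace and xmean defined above):


In [ ]:
# Re-scale the centered-model intercept back to the original x scale;
# it should land near alfa_real = 2.5.
print(trace['alpha'].mean() - trace['beta'].mean() * xmean)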


In [49]:
with pm.Model() as unpooled_model:
    alpha_tmp = pm.Normal('alpha_tmp', mu=0, sd=10, shape=M)
    beta = pm.Normal('beta', mu=0, sd=10, shape=M)
    epsilon = pm.HalfCauchy('epsilon', 5)
    nu = pm.Exponential('nu', 1/30)
    y_pred = pm.StudentT('y_pred', mu=alpha_tmp[idx] + beta[idx] * x_centered, sd=epsilon, nu=nu, observed=y_m)
    alpha = pm.Deterministic('alpha', alpha_tmp - beta * x_m.mean())
    # 'start' is never defined in this notebook (the book seeds NUTS with a
    # MAP/ADVI estimate); fall back to PyMC3's default NUTS initialization.
    trace_up = pm.sample(2000)


Multiprocess sampling (4 chains in 4 jobs)
NUTS: [nu, epsilon, beta, alpha_tmp]
Sampling 4 chains, 0 divergences: 100%|██████████| 10000/10000 [00:08<00:00, 1147.40draws/s]
The number of effective samples is smaller than 10% for some parameters.

In [50]:
var_names = ['alpha', 'beta', 'epsilon', 'nu']
pm.traceplot(trace_up, var_names=var_names);



In [53]:
with pm.Model() as hierarchical_model:
    alpha_tmp_mu = pm.Normal('alpha_tmp_mu', mu=0, sd=10)
    alpha_tmp_sd = pm.HalfNormal('alpha_tmp_sd', 10)
    beta_mu = pm.Normal('beta_mu', mu=0, sd=10)
    beta_sd = pm.HalfNormal('beta_sd', sd=10)
    alpha_tmp = pm.Normal('alpha_tmp', mu=alpha_tmp_mu, sd=alpha_tmp_sd, shape=M)
    beta = pm.Normal('beta', mu=beta_mu, sd=beta_sd, shape=M)
    epsilon = pm.HalfCauchy('epsilon', 5)
    nu = pm.Exponential('nu', 1/30)
    y_pred = pm.StudentT('y_pred', mu=alpha_tmp[idx] + beta[idx] * x_centered,
                         sd=epsilon, nu=nu, observed=y_m)
    alpha = pm.Deterministic('alpha', alpha_tmp - beta * x_m.mean())
    alpha_mu = pm.Deterministic('alpha_mu', alpha_tmp_mu - beta_mu * x_m.mean())
    alpha_sd = pm.Deterministic('alpha_sd', alpha_tmp_sd - beta_mu * x_m.mean())
    # The old `mu, sds, elbo = pm.variational.advi(...)` API no longer exists
    # (pm.variational.ADVI is now a class), so initialize NUTS from ADVI
    # through pm.sample's init argument instead.
    trace_hm = pm.sample(1000, init='advi+adapt_diag', n_init=100000)


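Once the hierarchical model has been sampled, its trace can be inspected the same way as the unpooled one. A minimal sketch, assuming trace_hm from the cell above:


In [ ]:
pm.traceplot(trace_hm, var_names=['alpha', 'beta', 'epsilon', 'nu']);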

In [ ]: