In [1]:
import copy
import numpy as np 
import matplotlib.pyplot as plt
import functions as f
import scipy.stats as stats
import statsmodels.api as sm
from pandas import DataFrame
%matplotlib inline

Bayesian Binary Regression

This notebook attempts to implement the pseudo-code from the paper "Bayesian Auxiliary Variable Models for Binary and Multinomial Regression" by Chris C. Holmes and Leonhard Held from Bayesian Analysis, 2006.

http://ba.stat.cmu.edu/journal/2006/vol01/issue01/held.pdf

Corrections to paper

http://ba.stat.cmu.edu/journal/2011/vol06/issue02/vanderlans.pdf

Binary Regression Model

Consider the Bayesian binary regression model,

\begin{eqnarray} y_i &\sim& Bernoulli(g^{-1}(\eta_i)) \\ \eta_i &=& x_i \beta \\ \beta &\sim& \pi(\beta) \end{eqnarray}

where $y_i \in \{0,1\}, i=1,\dots,n$ is a binary response variable for a collection of $n$ objects, with $p$ covariate measurements $x_i = (x_{i1}, \dots, x_{ip})$. $g(u)$ is a link function, $\eta_i$ denotes the linear predictor and $\beta$ represents a $(p\times1)$ column vector of regression coefficients which a priori are from some distribution $\pi(\cdot)$

Probit Regression using Auxiliary variables

see also: http://www.stat.cmu.edu/~brian/905-2009/all-papers/albert-chib-1993.pdf

For the probit link - $g^{-1}(u) = \Phi(u)$, where $\Phi(u)$ denotes the cumulative distribution function of the standard normal random variable, the general representation of the binary regression problem is as follows:

\begin{eqnarray} y_i &\sim& \left\{ \begin{array}{ll} 1 & z_i > 0 \\ 0 & o/w \\ \end{array} \right. \\ z_i &=& x_i \beta + \epsilon_i\\ \epsilon_i &\sim& N(0,1) \\ \beta &\sim& \pi(\beta) \end{eqnarray}

Now $y_i$ is determined by the sign of the auxiliary variable $z_i$.

Implementing A1: Procedure for joint sampling in Bayesian probit

For testing purposes we will use the data outlined in the Albery & Chib paper for the probit model. Specifically the Finney data from 1947 where the probit model of interest is

\begin{eqnarray*} \Phi^{-1}(p_i) = \beta_0 + \beta_1 x_{1i} + \beta_2 x_{2i}, \mbox{ where,} i=1,\dots,39 \end{eqnarray*}

where $x_{1i}$ is the volume of air inspired, $x_{2i}$ is the rate of air inspired, and the binary outcome observed is the occurence or nonoccurrence on a transient vasorestriction on the skin of the digits.


In [2]:
finney47 = np.genfromtxt('data/finney1947.csv', dtype=None, delimiter=',', names=True)
DataFrame(finney47).tail(n=5)


Out[2]:
Volume Rate Y
34 1.20 2.000 1
35 0.80 3.330 1
36 0.95 1.900 0
37 0.75 1.900 0
38 1.30 1.625 1

This section also assumes that the prior on $\beta$ is $\pi(\beta) = N(0, v)$ and that the design matrix X is of dimension $(n\times p)$. Taking a hint from the Holmes and Held paper, we are going to set up the prior as $\pi(\beta) = N(0,100I_p)$


In [3]:
# Setting up the design Matrix, X and the priors for beta
yi = finney47['Y']

X = np.vstack((np.ones(len(finney47)), finney47['Volume'], finney47['Rate']))
b = np.zeros(3)
v = 100 * np.identity(3)

X = X.transpose()
yi = yi.transpose()

n_para = X.shape[1]
n_obs = len(yi)

In [4]:
#First record constants unaltered within MCMC loop
# print np.dot(X, X.transpose())
V = np.linalg.pinv(np.dot(X.transpose(), X) + np.linalg.pinv(v))
L = np.linalg.cholesky(V)  
# L stores the lower triangular Cholesky factorization of V

S = np.dot(V, X.transpose())

In [6]:
#For j=1 to number of observations
H = np.empty(n_obs)
for i in np.arange(n_obs):
    H[i] = np.dot(X[i,:], S[:,i])
# H = (X * S).diagonal().transpose()
W = H / (1 - H)
Q = W + 1
H


Out[6]:
array([ 0.24370956,  0.21235215,  0.050006  ,  0.04919015,  0.10316222,
        0.13701691,  0.11601647,  0.02876299,  0.08799574,  0.11996627,
        0.11534102,  0.07145585,  0.0870003 ,  0.04361995,  0.17148908,
        0.06643554,  0.1823102 ,  0.04634394,  0.03944307,  0.03727534,
        0.06267793,  0.04311477,  0.03045785,  0.02928646,  0.02957052,
        0.06009279,  0.03329677,  0.03230623,  0.04717733,  0.08508416,
        0.10114664,  0.12456036,  0.02832969,  0.03470227,  0.02900802,
        0.11731141,  0.03230623,  0.04050889,  0.02610946])

In [8]:
# Initialise latent variable Z, from truncated normal

Z = np.empty(n_obs).transpose()
for i, y in enumerate(yi):
    if y:
        Z[i] = stats.truncnorm.rvs(0., float('inf'), 0, 1)
    else:
        Z[i] = stats.truncnorm.rvs(float('-inf'), 0., 0, 1)

### Holmes and Held says to initialize Z ~ N(0, I_n)Ind(Y,Z).
### Instead of sampling from a multivariate truncated normal,
### the above is used since each Zi, Zj is independent by the 
### specification of the identity matrix as the variance.
### I really hope this assumption holds............

B = np.dot(S,Z)
B.shape
# B denotes the conditional mean of \beta


Out[8]:
(3,)

In [7]:
n_iter = 10000
low, mid, high = float('-inf'), 0., float('inf')
betas = np.empty(n_para)
for i in np.arange(n_iter):
    if (i+1) % 1000. == 0.:
        f.progress(i+1, n_iter)
    z_old = copy.copy(Z)
    for j in np.arange(n_obs):
        m = np.dot(X[j,:],B)
        m = m - np.dot(W[j],(Z[j] - m))
        if yi[j]:
            Z[j] = stats.truncnorm.rvs((mid - m) / Q[j], (high - m) / Q[j], loc=m, scale=Q[j])
        else:
            Z[j] = stats.truncnorm.rvs((low - m) / Q[j], (mid - m) / Q[j], loc=m, scale=Q[j])

        B = B + np.float((Z[j] - z_old[j]))*S[:,j]
    T = stats.multivariate_normal.rvs(np.zeros(n_para), np.identity(n_para), 1).transpose()
    beta_i = (B + np.dot(L,T)).transpose()
    betas = np.vstack((betas, beta_i))
print "\n{0} Simulations complete".format(n_iter)
betas = betas[1:,:]
print betas[0:10,:]


100.0% of iterations complete
10000 Simulations complete
[[-2.98611117  1.27469026  0.72308704]
 [-3.69262931  1.47711142  1.06954999]
 [-4.7587381   1.91565355  1.34889084]
 [-6.28545098  2.15679209  1.74783257]
 [-6.45952633  2.64684508  1.75803751]
 [-6.66293192  2.5278541   1.94170481]
 [-7.11032209  2.74048436  2.1452832 ]
 [-6.55671787  2.7666335   1.89657263]
 [-4.96373947  2.30292586  1.32037071]
 [-4.76995005  1.99356349  1.6611888 ]]

In [8]:
F = plt.figure()
plt.hist(betas[1000:,0], 50, normed=True)
F.set_size_inches(10,5)
plt.show()



In [9]:
np.mean(betas[2000:,0])


Out[9]:
-5.8485239553723938

In [10]:
F = plt.figure()
plt.hist(betas[1000:,1], 50, normed=True)
F.set_size_inches(10,5)
plt.show()



In [11]:
np.mean(betas[1000:,1])


Out[11]:
2.3683682398311618

In [12]:
F = plt.figure()
plt.hist(betas[1000:,2], 50, normed=True)
F.set_size_inches(10,5)
plt.show()



In [13]:
np.mean(betas[1000:,2])


Out[13]:
1.678874250272572

In [15]: