In [40]:
library(R2jags)
In [41]:
# Exam Scores
s <- "
model{
# Each Person Belongs To One Of Two Latent Groups
for (i in 1:p){
z[i] ~ dbern(0.5)
}
# First Group Guesses
psi <- 0.5
# Second Group Has Some Unknown Greater Rate Of Success
phi ~ dbeta(1,1)T(0.5,1)
# Data Follow Binomial With Rate Given By Each Person's Group Assignment
for (i in 1:p){
theta[i] <- equals(z[i],0)*psi+equals(z[i],1)*phi
k[i] ~ dbin(theta[i],n)
}
}
"
In [43]:
k <- c(21,17,21,18,22,31,31,34,34,35,35,36,39,36,35)
p <- length(k) #number of people
n <- 40 # number of questions
In [44]:
data <- list("p", "k", "n") # to be passed on to JAGS
myinits <- list(
list(phi = 0.75, z = round(runif(p)))) # Initial group assignment
In [45]:
# parameters to be monitored:
parameters <- c("phi","z")
In [46]:
# The following command calls JAGS with specific options.
# For a detailed description see the R2jags documentation.
samples <- jags(data, inits=myinits, parameters,
textConnection(s), n.chains=1, n.iter=1000,
n.burnin=1, n.thin=1, DIC=T)
# Now the values for the monitored parameters are in the "samples" object,
# ready for inspection.
In [47]:
samples
In [48]:
plot(samples)
In [50]:
library(rstan)
In [51]:
model <- "
// Exam Scores
data {
int<lower=1> p;
int<lower=0> k[p];
int<lower=1> n;
}
transformed data {
real psi;
// First Group Guesses
psi <- .5;
}
parameters {
// Second Group Has Some Unknown Greater Rate Of Success
real<lower=.5,upper=1> phi;
}
transformed parameters {
vector[2] lp_parts[p];
// Data Follow Binomial With Rate Given By Each Person's Group Assignment
for (i in 1:p) {
lp_parts[i,1] <- log(.5) + binomial_log(k[i], n, phi);
lp_parts[i,2] <- log(.5) + binomial_log(k[i], n, psi);
}
}
model {
for (i in 1:p)
increment_log_prob(log_sum_exp(lp_parts[i]));
}
generated quantities {
int<lower=0,upper=1> z[p];
for (i in 1:p) {
vector[2] prob;
prob <- softmax(lp_parts[i]);
z[i] <- bernoulli_rng(prob[1]);
}
}"
In [52]:
k <- c(21, 17, 21, 18, 22, 31, 31, 34, 34, 35, 35, 36, 39, 36, 35)
p <- length(k) # number of people
n <- 40 # number of questions
In [53]:
data <- list(p=p, k=k, n=n) # to be passed on to Stan
In [54]:
myinits <- list(
list(phi=.75))
parameters <- c("phi", "z") # parameters to be monitored
In [56]:
# The following command calls Stan with specific options.
# For a detailed description type "?rstan".
samples <- stan(model_code=model,
data=data,
init=myinits, # If not specified, gives random inits
pars=parameters,
iter=1000,
chains=1,
thin=1,
# warmup = 100, # Stands for burn-in; Default = iter/2
# seed = 123 # Setting seed; Default is random seed
)
# Now the values for the monitored parameters are in the "samples" object,
# ready for inspection.
In [57]:
print(samples, digits=3)
In [58]:
plot(samples)
The previous example shows how sampling can model data as coming from a mixture of sources, and infer properties of these latent groups. But the specific model has at least one big weakness, which is that it assumes all the people in the knowledge group have exactly the same rate of success on the questions.
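Before addressing that weakness, it helps to check what the model above actually infers about the group assignments. A minimal sketch, assuming the Stan fit stored in "samples" from the previous cells: the posterior mean of each z[i] approximates the probability that person i belongs to the knowledge group.
# Posterior probability that each person is in the knowledge group (z[i] = 1)
z_post <- colMeans(extract(samples)$z)
round(z_post, 2)
# The first five people (roughly 20/40 correct) should land near 0,
# the remaining ten (roughly 35/40 correct) near 1.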
In [88]:
# Exam Scores With Individual Differences
s <- "
model{
# Data Follow Binomial With Rate Given By Each Person's Group Assignment
for (i in 1:p){
theta[i] <- equals(z[i],0)*psi+equals(z[i],1)*phi[i]
k[i] ~ dbin(theta[i],n)
}
# Each Person Belongs To One Of Two Latent Groups
for (i in 1:p){
z[i] ~ dbern(0.5)
}
# The Second Group Allows Individual Differences
for (i in 1:p){
# Second Group Drawn From A Censored Gaussian Distribution
phi[i] ~ dnorm(mu,lambda)T(0,1)
}
# First Group Guesses
psi <- 0.5
# Second Group Mean, Precision (And Standard Deviation)
mu ~ dbeta(1,1)T(.5,1) # >0.5 Average Success Rate
lambda ~ dgamma(.001,.001)
sigma <- 1/sqrt(lambda)
# Posterior Predictive For Second Group
predphi ~ dnorm(mu,lambda)T(0,1)
}
"
In [89]:
k <- c(21,17,21,18,22,31,31,34,34,35,35,36,39,36,35)
p <- length(k) #number of people
n <- 40 # number of questions
In [90]:
data <- list("p", "k", "n") # to be passed on to JAGS
In [91]:
myinits <- list(
list(mu = 0.75, lambda = 1, z = round(runif(p)))) # Initial group assignment
In [92]:
# parameters to be monitored:
parameters <- c("predphi","theta","z","mu","sigma")
In [93]:
# The following command calls JAGS with specific options.
# For a detailed description see the R2jags documentation.
samples <- jags(data, inits=myinits, parameters,
textConnection(s), n.chains=1, n.iter=2000,
n.burnin=1000, n.thin=1, DIC=T)
# Now the values for the monitored parameters are in the "samples" object,
# ready for inspection.
In [94]:
samples
In [95]:
plot(samples)
In [96]:
model <- "
// Exam Scores With Individual Differences
data {
int<lower=1> p;
int<lower=0> k[p];
int<lower=1> n;
}
transformed data {
real psi;
// First Group Guesses
psi <- .5;
}
parameters {
vector<lower=0,upper=1>[p] phi;
real<lower=.5,upper=1> mu; // Second Group Mean
real<lower=0> lambda;
real<lower=0,upper=1> predphi;
}
transformed parameters {
vector[2] lp_parts[p];
real<lower=0> sigma;
// Second Group Standard Deviation
sigma <- inv_sqrt(lambda);
// Data Follow Binomial With Rate Given By Each Person's Group Assignment
// Each Person Belongs To One Of Two Latent Groups
for (i in 1:p) {
lp_parts[i, 1] <- log(.5) + binomial_log(k[i], n, phi[i]);
lp_parts[i, 2] <- log(.5) + binomial_log(k[i], n, psi);
}
}
model {
// Second Group Precision
lambda ~ gamma(.001, .001);
// Posterior Predictive For Second Group
predphi ~ normal(mu, sigma)T[0,1];
// Second Group Drawn From A Censored Gaussian Distribution
for (i in 1:p)
phi[i] ~ normal(mu, sigma)T[0,1];
for (i in 1:p)
increment_log_prob(log_sum_exp(lp_parts[i]));
}
generated quantities {
int<lower=0,upper=1> z[p];
vector<lower=0,upper=1>[p] theta;
for (i in 1:p) {
vector[2] prob;
prob <- softmax(lp_parts[i]);
z[i] <- bernoulli_rng(prob[1]);
theta[i] <- (z[i] == 0) * psi + (z[i] == 1) * phi[i];
}
}"
In [97]:
k <- c(21, 17, 21, 18, 22, 31, 31, 34, 34, 35, 35, 36, 39, 36, 35)
p <- length(k) # number of people
n <- 40 # number of questions
In [98]:
data <- list(p=p, k=k, n=n) # to be passed on to Stan
In [99]:
myinits <- list(
list(phi=runif(p, .5, 1), mu=.75, lambda=1, predphi=.75))
In [100]:
# parameters to be monitored:
parameters <- c("predphi", "theta", "z", "mu", "sigma")
In [101]:
# The following command calls Stan with specific options.
# For a detailed description type "?rstan".
samples <- stan(model_code=model,
data=data,
init=myinits, # If not specified, gives random inits
pars=parameters,
iter=20000,
chains=1,
thin=1,
# warmup = 100, # Stands for burn-in; Default = iter/2
# seed = 123 # Setting seed; Default is random seed
)
# Now the values for the monitored parameters are in the "samples" object,
# ready for inspection.
In [102]:
print(samples, digits=3)
In [103]:
plot(samples)
In [59]:
# Twenty Questions
s <- "
model{
# Correctness Of Each Answer Is Bernoulli Trial
for (i in 1:np){
for (j in 1:nq){
k[i,j] ~ dbern(theta[i,j])
}
}
# Probability Correct Is Product Of Question By Person Rates
for (i in 1:np){
for (j in 1:nq){
theta[i,j] <- p[i]*q[j]
}
}
# Priors For People and Questions
for (i in 1:np){
p[i] ~ dbeta(1,1)
}
for (j in 1:nq){
q[j] ~ dbeta(1,1)
}
}
"
In [65]:
# Twenty Questions (NA)
s_NA <- "
model{
# Correctness Of Each Answer Is Bernoulli Trial
for (i in 1:np){
for (j in 1:nq){
k[i,j] ~ dbern(theta[i,j])
}
}
# Probability Correct Is Product Of Question By Person Rates
for (i in 1:np){
for (j in 1:nq){
theta[i,j] <- p[i]*q[j]
}
}
# Priors For People and Questions
for (i in 1:np){
p[i] ~ dbeta(1,1)
}
for (j in 1:nq){
q[j] ~ dbeta(1,1)
}
NA.array[1] <- k[1,13]
NA.array[2] <- k[8,5]
NA.array[3] <- k[10,18]
}
"
In [60]:
dset <- 1
if (dset==1)
{
k <- c(1,1,1,1,0,0,1,1,0,1,0,0,1,0,0,1,0,1,0,0,
0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,
0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,
1,0,1,1,0,1,1,1,0,1,0,0,1,0,0,0,0,1,0,0,
1,1,0,1,0,0,0,1,0,1,0,1,1,0,0,1,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,1,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,1,
1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0)
}
if (dset==2)
{
k <- c(1,1,1,1,0,0,1,1,0,1,0,0,NA,0,0,1,0,1,0,0,
0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,
0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,
1,0,1,1,0,1,1,1,0,1,0,0,1,0,0,0,0,1,0,0,
1,1,0,1,0,0,0,1,0,1,0,1,1,0,0,1,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
0,0,0,0,NA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,1,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,1,
1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,NA,0,0)
}
In [61]:
k <- matrix(k, nrow=10, byrow=T)
In [62]:
np <- nrow(k)
nq <- ncol(k)
In [63]:
data <- list("np","nq","k") # to be passed on to JAGS
In [64]:
myinits <- list(
list(q = runif(nq), p = runif(np)))
In [67]:
if (dset==1)
{
# parameters to be monitored:
parameters <- c("p", "q")
# The following command calls JAGS with specific options.
# For a detailed description see the R2jags documentation.
samples <- jags(data, inits=myinits, parameters,
textConnection(s), n.chains=1, n.iter=10000,
n.burnin=1, n.thin=1, DIC=T)
# Now the values for the monitored parameters are in the "samples" object,
# ready for inspection.
}
if (dset==2)
{
# parameters to be monitored:
parameters <- c("p", "q", "NA.array")
# The following command calls JAGS with specific options.
# For a detailed description see the R2jags documentation.
samples <- jags(data, inits=myinits, parameters,
textConnection(s_NA), n.chains=1, n.iter=10000,
n.burnin=1, n.thin=1, DIC=T)
# Now the values for the monitored parameters are in the "samples" object,
# ready for inspection.
}
In [68]:
samples
In [69]:
plot(samples)
# When you use WinBUGS and you want to sample the missing data, you cannot
# just return "k" to R, as you can with Matlab. Most likely, the reason is
# that "k" is a matrix, and, say, k[1,13] is a single number. WinBUGS will
# sample k[1,13] when it contains "NA", but R will not accept a sequence of
# samples for the cell k[1,13]. This means the following:
# 1. Using R2WinBUGS, you can tell WinBUGS to monitor and return "k".
# When you set "debug=T", you can inspect the results, but, when you return
# to R, WinBUGS is unable to pass on its results.
# 2. One clumsy way around this problem is illustrated in the code
# TwentyQuestions_NA.txt.
# Note that JAGS does not seem to have this problem and you can just
# examine "k".
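As a minimal sketch of that last point (hypothetical, assuming dset == 2 and the matching data and inits from above): JAGS can monitor k itself, and the posterior means of the three missing cells then estimate the probability that each unrecorded answer was correct.
# Alternative for dset == 2: monitor k directly instead of NA.array
parameters <- c("p", "q", "k")
samples <- jags(data, inits=myinits, parameters,
                textConnection(s), n.chains=1, n.iter=10000,
                n.burnin=1, n.thin=1, DIC=T)
samples$BUGSoutput$summary[c("k[1,13]", "k[8,5]", "k[10,18]"), "mean"]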
In [71]:
#### Notes to Stan model #######################################################
## 1) There are two models in this script. The first one cannot incorporate
## missing values, as required for exercise 6.3.2; the second one can. In fact,
## you can use the second model on both datasets - with or without NAs.
## 2) Models with missing values can be tricky. It's important to know that
## Stan treats every variable as either known (data) or unknown (parameter),
## so you have to separate the observed and missing parts first and handle the
## missing entries explicitly. The chapter on missing data in the Stan manual
## is useful.
################################################################################
In [72]:
model1 <- "
// Twenty Questions
data {
int<lower=0> np; // rows
int<lower=0> nq; // columns
int k[np,nq];
}
parameters {
real<lower=0,upper=1> p[np];
real<lower=0,upper=1> q[nq];
}
transformed parameters {
real<lower=0,upper=1> theta[np, nq];
// Probability Correct Is Product Of Question By Person Rates
for (i in 1:np)
for (j in 1:nq)
theta[i,j] <- p[i] * q[j];
}
model {
// Priors For People and Questions
p ~ beta(1, 1);
q ~ beta(1, 1);
// Correctness Of Each Answer Is Bernoulli Trial
for (i in 1:np)
for (j in 1:nq)
k[i, j] ~ bernoulli(theta[i,j]);
}"
In [73]:
model2 <- "
// Twenty Questions
data {
int<lower=0> np; // rows
int<lower=0> nq; // columns
int k[np,nq];
int k_na[np,nq]; // locating NAs in k
int n_na; // number of NAs
}
parameters {
real<lower=0,upper=1> p[np];
real<lower=0,upper=1> q[nq];
}
transformed parameters {
real<lower=0,upper=1> theta[np,nq];
// Probability Correct Is Product Of Question By Person Rates
for (i in 1:np)
for (j in 1:nq)
theta[i,j] <- p[i] * q[j];
}
model {
// Priors For People and Questions
p ~ beta(1, 1);
q ~ beta(1, 1);
// Correctness Of Each Answer Is Bernoulli Trial
for (i in 1:np)
for (j in 1:nq)
if (k_na[i,j] == 0) // If k[i,j] is not missing
k[i,j] ~ bernoulli(theta[i,j]);
}
generated quantities {
int na_array[n_na];
int index;
index <- 1;
for (i in 1:np) {
for (j in 1:nq) {
if (k_na[i,j] == 1) { // If k[i,j] is missing
na_array[index] <- bernoulli_rng(theta[i,j]);
index <- index + 1;
}
}
}
}"
In [75]:
dset <- 1 # Choose dataset/model
if (dset == 1) {
k <- c(1,1,1,1,0,0,1,1,0,1,0,0,1,0,0,1,0,1,0,0,
0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,
0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,
1,0,1,1,0,1,1,1,0,1,0,0,1,0,0,0,0,1,0,0,
1,1,0,1,0,0,0,1,0,1,0,1,1,0,0,1,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,1,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,1,
1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0)
}
if (dset == 2) {
k <- c(1,1,1,1,0,0,1,1,0,1,0,0,NA,0,0,1,0,1,0,0,
0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,
0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,
1,0,1,1,0,1,1,1,0,1,0,0,1,0,0,0,0,1,0,0,
1,1,0,1,0,0,0,1,0,1,0,1,1,0,0,1,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
0,0,0,0,NA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,1,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,1,
1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,NA,0,0)
}
In [76]:
k <- matrix(k, nrow=10, byrow=T)
np <- nrow(k)
nq <- ncol(k)
In [77]:
if (dset == 1) {
model <- model1
data <- list(k=k, np=np, nq=nq) # to be passed on to Stan
parameters <- c("p", "q") # parameters to be monitored
} else if (dset == 2) {
model <- model2
k_na <- is.na(k) + 0 # Matrix locating missing values: 1 = missing
n_na <- sum(k_na) # number of missing values
k[is.na(k)] <- 99 # some numeric value, since Stan doesn't eat NAs
# to be passed on to Stan:
data <- list(k=k, np=np, nq=nq, k_na=k_na, n_na=n_na)
parameters <- c("p", "q", "na_array") # parameters to be monitored
}
In [78]:
myinits <- list(
list(q=runif(nq), p=runif(np)))
# The following command calls Stan with specific options.
# For a detailed description type "?rstan".
samples <- stan(model_code=model,
data=data,
init=myinits, # If not specified, gives random inits
pars=parameters,
iter=10000,
chains=1,
thin=1,
# warmup = 100, # Stands for burn-in; Default = iter/2
# seed = 123 # Setting seed; Default is random seed
)
# Now the values for the monitored parameters are in the "samples" object,
# ready for inspection.
In [79]:
print(samples, digits=3)
In [80]:
plot(samples)
Now suppose that three of the answers were not recorded, for whatever reason. Our new data set, with missing data, now takes the form shown in Table 6.2. Bayesian inference will automatically make predictions about these missing values (i.e., “fill in the blanks”) by using the same probabilistic model that generated the observed data. Missing data are entered as nan (“not a number”) in Matlab, and NA (“not available”) in R or WinBUGS. Including the variable k as one to monitor when sampling will then provide posterior values for the missing values. That is, it provides information about the relative likelihood of the missing values being each of the possible alternatives, using the statistical model and the available data. Look through the Matlab or R code to see how all of this is implemented in the second data set. Run the code, and interpret the posterior distributions for the three missing values. Are they reasonable inferences?
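For the Stan implementation above, the same inspection goes through the monitored na_array (a sketch, assuming dset == 2 and model2 were used in the Stan run): its posterior means give the probability that each missing answer would have been correct, in row-wise order k[1,13], k[8,5], k[10,18].
# Posterior predictive for the three missing cells
print(samples, pars="na_array", digits=3)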
In [112]:
# The Two Country Quiz
s <- "
model{
# Probability of Answering Correctly
alpha ~ dunif(0,1) # Match
beta ~ dunif(0,alpha) # Mismatch
# Group Membership For People and Questions
for (i in 1:nx){
x[i] ~ dbern(0.5)
x1[i] <- x[i]+1
}
for (j in 1:nz){
z[j] ~ dbern(0.5)
z1[j] <- z[j]+1
}
# Probability Correct For Each Person-Question Combination By Groups
for (i in 1:nx){
for (j in 1:nz){
theta[i,j,1,1] <- alpha
theta[i,j,1,2] <- beta
theta[i,j,2,1] <- beta
theta[i,j,2,2] <- alpha
}
}
# Data Are Bernoulli By Rate
for (i in 1:nx){
for (j in 1:nz){
k[i,j] ~ dbern(theta[i,j,x1[i],z1[j]])
}
}
}
"
In [113]:
s_NA1 <- ""
In [114]:
s_NA2 <- ""
In [115]:
dset <- 1
if (dset==1)
{
k <- c(1,0,0,1,1,0,0,1,
1,0,0,1,1,0,0,1,
0,1,1,0,0,1,0,0,
0,1,1,0,0,1,1,0,
1,0,0,1,1,0,0,1,
0,0,0,1,1,0,0,1,
0,1,0,0,0,1,1,0,
0,1,1,1,0,1,1,0)
k <- matrix(k, nrow=8, byrow=T)
}
if (dset==2)
{
k <- c(1,0,0,1,1,0,0,1,
1,0,0,1,1,0,0,1,
0,1,1,0,0,1,0,0,
0,1,1,0,0,1,1,0,
1,0,0,1,1,0,0,1,
0,0,0,1,1,0,0,1,
0,1,0,0,0,1,1,0,
0,1,1,1,0,1,1,0,
1,0,0,1,NA,NA,NA,NA,
0,NA,NA,NA,NA,NA,NA,NA,
NA,NA,NA,NA,NA,NA,NA,NA)
k <- matrix(k, nrow=11, byrow=T)
}
if (dset==3)
{
k <- c(1,0,0,1,1,0,0,1,
1,0,0,1,1,0,0,1,
1,0,0,1,1,0,0,1,
1,0,0,1,1,0,0,1,
1,0,0,1,1,0,0,1,
1,0,0,1,1,0,0,1,
1,0,0,1,1,0,0,1,
1,0,0,1,1,0,0,1,
1,0,0,1,1,0,0,1,
1,0,0,1,1,0,0,1,
1,0,0,1,1,0,0,1,
1,0,0,1,1,0,0,1,
0,1,1,0,0,1,0,0,
0,1,1,0,0,1,1,0,
1,0,0,1,1,0,0,1,
0,0,0,1,1,0,0,1,
0,1,0,0,0,1,1,0,
0,1,1,1,0,1,1,0,
1,0,0,1,NA,NA,NA,NA,
0,NA,NA,NA,NA,NA,NA,NA,
NA,NA,NA,NA,NA,NA,NA,NA)
k <- matrix(k, nrow=21, byrow=T)
}
In [116]:
nx <- nrow(k)
nz <- ncol(k)
In [117]:
data <- list("nx","nz","k") # to be passed on to JAGS
In [118]:
inits <- list(
list(z = round(runif(nz)), x = round(runif(nx)), alpha=0.5, beta=0.5))
In [119]:
if (dset==1)
{
# parameters to be monitored:
parameters <- c("z", "x", "alpha", "beta")
# The following command calls JAGS with specific options.
# For a detailed description see the R2jags documentation.
samples <- jags(data, inits, parameters,
textConnection(s), n.chains=1, n.iter=2000,
n.burnin=1000, n.thin=1, DIC=T)
# Now the values for the monitored parameters are in the "samples" object,
# ready for inspection.
}
if (dset==2)
{
# parameters to be monitored:
parameters <- c("z", "x", "alpha", "beta", "NA.LP1", "NA.LP2", "NA.LP3")
# The following command calls JAGS with specific options.
# For a detailed description see the R2jags documentation.
# model.file ="TwoCountryQuiz_NA1.txt"
samples <- jags(data, inits, parameters,
textConnection(s_NA1), n.chains=1,
n.iter=2000, n.burnin=1000, n.thin=1, DIC=T)
# Now the values for the monitored parameters are in the "samples" object,
# ready for inspection.
}
if (dset==3)
{
# parameters to be monitored:
parameters <- c("z", "x", "alpha", "beta", "NA.LP1", "NA.LP2", "NA.LP3")
# The following command calls JAGS with specific options.
# For a detailed description see the R2jags documentation.
# model.file ="TwoCountryQuiz_NA2.txt"
samples <- jags(data, inits, parameters,
textConnection(s_NA2), n.chains=1,
n.iter=2000, n.burnin=1000, n.thin=1, DIC=T)
# Now the values for the monitored parameters are in the "samples" object,
# ready for inspection.
}
In [120]:
samples
In [121]:
plot(samples)
In [122]:
# Unfortunately, this model is not implemented! Search for "double mixture model"
# on the mailing list to find out why.
# If you have any suggestions about the model, let us know!
Now suppose that three extra people enter the room late, and begin to take the quiz. One of them (Late Person 1) has answered the first four questions, the next (Late Person 2) has only answered the first question, and the final new person (Late Person 3) is still sharpening their pencil, and has not started the quiz. This situation can be represented as an updated data set, now with missing data, as in Table 6.4. Interpret the inferences the model makes about the nationality of the late people, and whether or not they will get the unfinished questions correct.
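The TwoCountryQuiz_NA model files are not reproduced here (s_NA1 and s_NA2 are left empty), but one way to explore this exercise is to reuse the base model s on the dset == 2 data: as noted earlier, JAGS samples the NA entries of k automatically, so monitoring x gives the inferred nationality of the late people and monitoring k gives predictions for their unanswered questions.
# A minimal sketch, assuming dset == 2 and the matching k, nx, nz, data and inits:
parameters <- c("z", "x", "alpha", "beta", "k")
samples <- jags(data, inits, parameters,
                textConnection(s), n.chains=1, n.iter=2000,
                n.burnin=1000, n.thin=1, DIC=T)
samples$BUGSoutput$mean$x[9:11]    # group membership of the three late people
samples$BUGSoutput$mean$k[9:11, ]  # predicted correctness for their missing answers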
Finally, suppose that you are now given the correctness scores for a set of 10 new people, whose data were not previously available, but who form part of the same group of people we are studying. The updated data set is shown in Table 6.5. Interpret the inferences the model makes about the nationality of the new people. Revisit the inferences about the late people, and whether or not they will get the unfinished questions correct. Does the inference drawn by the model for the third late person match your intuition? There is a problem here. How could it be fixed?
In [123]:
# Malingering
s <- "
model{
# Each Person Belongs to One of Two Latent Groups
for (i in 1:p){
z[i] ~ dbern(0.5)
z1[i] <- z[i]+1
}
# Bona Fide Group has Unknown Success Rate Above Chance
psi[1] ~ dunif(0.5,1)
# Malingering Group has Unknown Success Rate Below Bona Fide
psi[2] ~ dunif(0,psi[1])
# Data are Binomial with Group Rate for Each Person
for (i in 1:p){
theta[i] <- psi[z1[i]]
k[i] ~ dbin(theta[i],n)
}
}
"
In [124]:
k <- c(45,45,44,45,44,45,45,45,45,45,30,20,6,44,44,27,25,17,14,27,35,30)
p <- length(k) # number of people
n <- 45 # number of questions
In [125]:
data <- list("p", "k", "n") # to be passed on to JAGS
In [126]:
myinits <- list(
list(psi = c(0.7,0.5), z = round(runif(p)))) # Initial group assignment
In [127]:
# parameters to be monitored:
parameters <- c("psi","z")
In [128]:
# The following command calls JAGS with specific options.
# For a detailed description see the R2jags documentation.
samples <- jags(data, inits=myinits, parameters,
textConnection(s), n.chains=1, n.iter=2000,
n.burnin=1000, n.thin=1, DIC=T)
In [129]:
samples
In [130]:
plot(samples)
In [132]:
model <- "
// Malingering
data {
int<lower=1> n;
int<lower=1> p;
int<lower=0,upper=n> k[p];
}
parameters {
vector<lower=0,upper=1>[2] psi;
}
transformed parameters {
vector[2] lp_parts[p];
for (i in 1:p) {
lp_parts[i, 1] <- log(.5) + binomial_log(k[i], n, psi[1]);
lp_parts[i, 2] <- log(.5) + binomial_log(k[i], n, psi[2]);
}
}
model {
// Bona Fide Group has Unknown Success Rate Above Chance
psi[1] ~ uniform(.5, 1);
// Malingering Group has Unknown Success Rate Below Bona Fide
psi[2] ~ uniform(0, psi[1]);
for (i in 1:p)
increment_log_prob(log_sum_exp(lp_parts[i]));
}
generated quantities {
int<lower=0,upper=1> z[p];
for (i in 1:p) {
vector[2] prob;
prob <- softmax(lp_parts[i]);
z[i] <- bernoulli_rng(prob[2]);
}
}"
k <- c(45, 45, 44, 45, 44, 45, 45, 45, 45, 45, 30,
20, 6, 44, 44, 27, 25, 17, 14, 27, 35, 30)
p <- length(k) # number of people
n <- 45 # number of questions
data <- list(p=p, k=k, n=n) # To be passed on to Stan
myinits <- list(
list(psi=c(.7, .5)))
parameters <- c("psi","z") # Parameters to be monitored
# The following command calls Stan with specific options.
# For a detailed description type "?rstan".
samples <- stan(model_code=model,
data=data,
init=myinits, # If not specified, gives random inits
pars=parameters,
iter=2000,
chains=1,
thin=1,
# warmup = 100, # Stands for burn-in; Default = iter/2
# seed = 123 # Setting seed; Default is random seed
)
# Now the values for the monitored parameters are in the "samples" object,
# ready for inspection.
print(samples, digits=3)
In [133]:
plot(samples)
In [134]:
# Malingering, with Individual Differences
s <- "
model{
# Each Person Belongs to One of Two Latent Groups
for (i in 1:p){
z[i] ~ dbern(phi) # phi is the Base Rate
z1[i] <- z[i]+1
}
# Relatively Uninformative Prior on Base Rate
phi ~ dbeta(5,5)
# Data are Binomial with Rate Given by
# Each Person's Group Assignment
for (i in 1:p){
k[i] ~ dbin(theta[i,z1[i]],n)
theta[i,1] ~ dbeta(alpha[1],beta[1])
theta[i,2] ~ dbeta(alpha[2],beta[2])
}
# Transformation to Group Mean and Precision
alpha[1] <- mubon * lambdabon
beta[1] <- lambdabon * (1-mubon)
# Additivity on Logit Scale
logit(mumal) <- logit(mubon) - mudiff
alpha[2] <- mumal * lambdamal
beta[2] <- lambdamal * (1-mumal)
# Priors
mubon ~ dbeta(1,1)
mudiff ~ dnorm(0,0.5)T(0,) # Constrained to be Positive
lambdabon ~ dunif(40,800)
lambdamal ~ dunif(4,100)
}
"
In [135]:
k <- c(45,45,44,45,44,45,45,45,45,45,30,20,6,44,44,27,25,17,14,27,35,30)
p <- length(k) # number of people
n <- 45 # number of questions
data <- list("p", "k", "n") # to be passed on to JAGS
myinits <- list(
list(z = round(runif(p)), mudiff=0.2),
list(z = round(runif(p)), mudiff=0.3),
list(z = round(runif(p)), mudiff=0.4))
# parameters to be monitored:
parameters <- c("theta","z","mubon","lambdabon",
"mumal","lambdamal","mudiff","phi")
# The following command calls JAGS with specific options.
# For a detailed description see the R2jags documentation.
samples <- jags(data, inits=myinits, parameters,
textConnection(s), n.chains=3, n.iter=10000,
n.burnin=1000, n.thin=1, DIC=T)
In [138]:
samples
In [139]:
plot(samples)
In [140]:
model <- "
// Malingering, with Individual Differences
data {
int<lower=1> n;
int<lower=1> p;
int<lower=0,upper=n> k[p];
}
parameters {
real<lower=0,upper=1> phi;
real<lower=0,upper=1> mubon;
real<lower=0> mudiff;
real<lower=40,upper=800> lambdabon;
real<lower=4,upper=100> lambdamal;
matrix<lower=0,upper=1>[p,2] theta;
}
transformed parameters {
vector[2] lp_parts[p];
vector<lower=0>[2] alpha;
vector<lower=0>[2] beta;
real<lower=0,upper=1> mumal;
// Additivity on Logit Scale
mumal <- inv_logit(logit(mubon) - mudiff);
// Transformation to Group Mean and Precision
alpha[1] <- mubon * lambdabon;
beta[1] <- lambdabon * (1 - mubon);
alpha[2] <- mumal * lambdamal;
beta[2] <- lambdamal * (1 - mumal);
// Data are Binomial with Rate Given by
// Each Person’s Group Assignment
for (i in 1:p) {
lp_parts[i,1] <- log1m(phi) + binomial_log(k[i], n, theta[i,1]);
lp_parts[i,2] <- log(phi) + binomial_log(k[i], n, theta[i,2]);
}
}
model {
// Priors
mubon ~ beta(1, 1); // can be removed
mudiff ~ normal(0, 1 / sqrt(.5))T[0,]; // Constrained to be Positive
// Relatively Uninformative Prior on Base Rate
phi ~ beta(5, 5);
for (i in 1:p)
theta[i] ~ beta(alpha, beta);
for (i in 1:p)
increment_log_prob(log_sum_exp(lp_parts[i]));
}
generated quantities {
int<lower=0,upper=1> z[p];
for (i in 1:p) {
vector[2] prob;
prob <- softmax(lp_parts[i]);
// Each Person Belongs to One of Two Latent Groups
z[i] <- bernoulli_rng(prob[2]);
}
}"
k <- c(45, 45, 44, 45, 44, 45, 45, 45, 45, 45, 30,
20, 6, 44, 44, 27, 25, 17, 14, 27, 35, 30)
p <- length(k) # number of people
n <- 45 # number of questions
data <- list(p=p, k=k, n=n) # To be passed on to Stan
myinits <- list(
list(phi=.5, mubon=.5, mudiff=1, lambdabon=400, lambdamal=50,
theta=matrix(rep(.5, p * 2), p, 2)))
# Parameters to be monitored
parameters <- c("mubon", "lambdabon", "mumal", "lambdamal", "mudiff",
"phi", "theta", "z")
# The following command calls Stan with specific options.
# For a detailed description type "?rstan".
samples <- stan(model_code=model,
data=data,
init=myinits, # If not specified, gives random inits
pars=parameters,
iter=3000,
chains=1,
thin=1,
# warmup = 100, # Stands for burn-in; Default = iter/2
# seed = 123 # Setting seed; Default is random seed
)
# Now the values for the monitored parameters are in the "samples" object,
# ready for inspection.
print(samples, digits=3)
In [141]:
plot(samples)
In [157]:
# Cheating Latent Mixture Model
s <- "
model{
# Each Person Belongs to One of Two Latent Groups
for (i in 1:p){
z[i] ~ dbern(phi) # phi is the Base Rate
z1[i] <- z[i]+1
}
# Relatively Uninformative Prior on Base Rate
phi ~ dbeta(5,5)
# Data are Binomial with Rate Given by
# Each Person's Group Assignment
for (i in 1:p){
k[i] ~ dbin(theta[i,z1[i]],n)
thetatmp[i,1] ~ dbeta(alpha[1],beta[1])
theta[i,1] <- max(.01,min(.99,thetatmp[i,1]))
thetatmp[i,2] ~ dbeta(alpha[2],beta[2])
theta[i,2] <- max(.01,min(.99,thetatmp[i,2]))
}
# Transformation to Group Mean and Precision
alpha[1] <- mubon * lambdabon
beta[1] <- lambdabon * (1-mubon)
# Additivity on Logit Scale
logit(muche) <- logit(mubon) + mudiff # Note the +
alpha[2] <- muche * lambdache
beta[2] <- lambdache * (1-muche)
# Priors
mubon ~ dbeta(1,1)
mudiff ~ dnorm(0,0.5)T(0,) # Constrained to be Positive
lambdabon ~ dunif(5,50)
lambdache ~ dunif(5,50)
# Correct Count
for (i in 1:p){
pct[i] <- equals(z[i],truth[i])
}
pc <- sum(pct[1:p])
}
"
In [158]:
cheat.dat <- read.table("cheat.csv",header=F,sep=",")
In [159]:
cheatt.dat <- read.table("cheatt.csv",header=F,sep="")
In [160]:
truth <- cheatt.dat$V1 #truth = 1 if cheater
truth
In [161]:
k <- apply(cheat.dat,1,sum) # total correct per participant
p <- length(k) # number of people
n <- 40 # total trials
data <- list("p", "k", "n", "truth") # to be passed on to JAGS
myinits <- list(
list(z = round(runif(p)), mudiff=0.1, phi=0.5, mubon=0.5, lambdabon=30, lambdache=25),
list(z = round(runif(p)), mudiff=0.15, phi=0.5, mubon=0.5, lambdabon=25, lambdache=30)
)
#myinits <- list(
# list(z = round(runif(p)), mudiff=0.1, phi=0.5, mubon=0.5, lambdabon=10, lambdache=10),
# list(z = round(runif(p)), mudiff=0.2, phi=0.5, mubon=0.5, lambdabon=10, lambdache=10)
# )
# parameters to be monitored:
parameters <- c("theta","z","mubon","lambdabon",
"muche","lambdache","mudiff","phi","alpha","beta","pc")
set.seed(3) # some chains result in "undefined real result" -- see box & exercise
# The following command calls JAGS with specific options.
# For a detailed description see the R2jags documentation.
samples <- jags(data, inits=myinits, parameters,
textConnection(s), n.chains=2, n.iter=6000,
n.burnin=3000, n.thin=1, DIC=T)
In [156]:
samples$BUGSoutput$summary
In [162]:
pc <- samples$BUGSoutput$sims.list$pc/p #to get proportion correct
mean(pc)
In [164]:
# plot 6.9
#make the two panel plot:
#windows(width=8,height=6) #this command works only under Windows!
layout(matrix(c(1,2),2,1))
layout.show(2)
par(cex.main = 1.5, mar = c(5, 6, 4, 5) + 0.1, mgp = c(3.5, 1, 0), cex.lab = 1.5,
font.lab = 2, cex.axis = 1.3, bty = "n", las=1)
bins <- c(-1:n)+.5
bonafide <- hist(k[truth==0], breaks=bins, plot=F)$counts
cheat <- hist(k[truth==1], breaks=bins, plot=F)$counts
counts <- rbind(bonafide, cheat)
barplot(counts, main=" ", xlab=" ", col=c("grey","white"),
legend.text = c("Bona Fide","Cheater"), args.legend = list(x="topleft"),
beside=TRUE, axes=F)
# bottom panel:
par(cex.main = 1.5, mar = c(5, 6, 4, 5) + 0.1, mgp = c(3.5, 1, 0), cex.lab = 1.5,
font.lab = 2, cex.axis = 1.3, bty = "n", las=1)
pc.line <- array()
for (i in 1:41)
{
pc.line[i] <- mean((k>=(i-1))==truth)
}
In [165]:
#dev.new() # so the plot below does not overwrite the plot above
plot(c(0:40), pc.line, type="l", lwd=2, xlim=c(0,40), ylim=c(0.4,1),
xlab="Number of Items Recalled Correctly",
ylab=" ", axes=F)
axis(1, at=c(0,seq(from=5,by=5,to=40)))
axis(2, at=c(.5,.75,1))
par(las=0)
mtext("Prop. Correct",side=2, line=2.5,cex=1.5)
# Now add the distribution:
pc.dens <- density(pc)
polygon(c(0,pc.dens$y,0,0), c(pc.dens$x[1]-.01,pc.dens$x,pc.dens$x[1]+.01,pc.dens$x[1]-.01), col="green")
In [166]:
# plot 6.10
par(cex.main = 1.5, mar = c(5, 6, 4, 5) + 0.1, mgp = c(3.5, 1, 0), cex.lab = 1.5,
font.lab = 2, cex.axis = 1.3, bty = "n", las=1)
plot(k,samples$BUGSoutput$mean$z,ylim=c(0,1),xlim=c(0,n), xlab= "Number of Items Recalled Correctly",
ylab="Cheater Classification", lwd=2, pch=4)
#in the code, z=0 is bonafide and z=1 is cheating
#so z gives the prob of being assigned to cheating group
In [167]:
model <- "
// Cheating Latent Mixture Model
data {
int<lower=1> n;
int<lower=1> p;
int<lower=1,upper=n> k[p];
int<lower=0,upper=1> truth[p];
}
parameters {
real<lower=0,upper=1> phi;
real<lower=0,upper=1> mubon;
real<lower=0> mudiff;
real<lower=5,upper=50> lambdabon;
real<lower=5,upper=50> lambdache;
vector<lower=0,upper=1>[p] theta;
}
transformed parameters {
vector[2] lp_parts[p];
vector<lower=0>[2] alpha;
vector<lower=0>[2] beta;
real<lower=0,upper=1> muche;
// Additivity on Logit Scale
muche <- inv_logit(logit(mubon) + mudiff);
// Transformation to Group Mean and Precision
alpha[1] <- mubon * lambdabon;
beta[1] <- lambdabon * (1 - mubon);
alpha[2] <- muche * lambdache;
beta[2] <- lambdache * (1 - muche);
// Data are Binomial with Rate Given by
// Each Person’s Group Assignment
for (i in 1:p) {
lp_parts[i,1] <- log1m(phi) + beta_log(theta[i], alpha[1], beta[1]);
lp_parts[i,2] <- log(phi) + beta_log(theta[i], alpha[2], beta[2]);
}
}
model {
// Priors
mubon ~ beta(1, 1); // can be removed
mudiff ~ normal(0, 1 / sqrt(.5))T[0,]; // Constrained to be Positive
// Relatively Uninformative Prior on Base Rate
phi ~ beta(5, 5);
for (i in 1:p)
increment_log_prob(log_sum_exp(lp_parts[i]));
k ~ binomial(n, theta);
}
generated quantities {
int<lower=0,upper=1> z[p];
real pc;
vector[p] pct;
for (i in 1:p) {
vector[2] prob;
prob <- softmax(lp_parts[i]);
// Each Person Belongs to One of Two Latent Groups
z[i] <- bernoulli_rng(prob[2]);
// Correct Count
pct[i] <- if_else(z[i] == truth[i], 1, 0);
}
pc <- sum(pct);
}"
cheat.dat <- read.table("cheat.csv", header=F, sep=",")
cheatt.dat <- read.table("cheatt.csv", header=F, sep="")
truth <- cheatt.dat$V1 # truth = 1 if cheater
k <- apply(cheat.dat, 1, sum) # total correct per participant
p <- length(k) # number of people
n <- 40 # total trials
data <- list(p=p, k=k, n=n, truth=truth) # To be passed on to Stan
myinits <- list(
list(mudiff=.1, phi=.5, mubon=.5, lambdabon=30, lambdache=25,
theta=rep(.5, p)),
list(mudiff=.15, phi=.5, mubon=.5, lambdabon=25, lambdache=30,
theta=rep(.5, p)))
# Parameters to be monitored:
parameters <- c("theta", "z", "mubon", "lambdabon", "muche", "lambdache",
"mudiff", "phi", "alpha", "beta", "pc")
# The following command calls Stan with specific options.
# For a detailed description type "?rstan".
samples <- stan(model_code=model,
data=data,
init=myinits, # If not specified, gives random inits
pars=parameters,
iter=1000,
chains=2,
thin=1,
warmup = 200, # Stands for burn-in; Default = iter/2
# seed = 123 # Setting seed; Default is random seed
)
# Now the values for the monitored parameters are in the "samples" object,
# ready for inspection.
In [168]:
print(samples)
In [169]:
plot(samples)
In [170]:
pc <- extract(samples)$pc / p # to get proportion correct
mean(pc)
In [178]:
# plot 6.9
#make the two panel plot:
#windows(width=8,height=6) #this command works only under Windows!
layout(matrix(c(1,2),2,1))
layout.show(2)
par(cex.main = 1.5, mar = c(5, 6, 4, 5) + 0.1, mgp = c(3.5, 1, 0), cex.lab = 1.5,
font.lab = 2, cex.axis = 1.3, bty = "n", las=1)
bins <- c(-1:n)+.5
bonafide <- hist(k[truth==0], breaks=bins, plot=F)$counts
cheat <- hist(k[truth==1], breaks=bins, plot=F)$counts
counts <- rbind(bonafide, cheat)
barplot(counts, main=" ", xlab=" ", col=c("grey","white"),
legend.text = c("Bona Fide","Cheater"), args.legend = list(x="topleft"),
beside=TRUE, axes=F)
# bottom panel:
par(cex.main = 1.5, mar = c(5, 6, 4, 5) + 0.1, mgp = c(3.5, 1, 0), cex.lab = 1.5,
font.lab = 2, cex.axis = 1.3, bty = "n", las=1)
pc.line <- array()
for (i in 1:41) {
pc.line[i] <- mean((k>=(i-1))==truth)
}
In [172]:
#dev.new() # so the plot below does not overwrite the plot above
plot(c(0:40), pc.line, type="l", lwd=2, xlim=c(0,40), ylim=c(0.4,1),
xlab="Number of Items Recalled Correctly",
ylab=" ", axes=F)
axis(1, at=c(0,seq(from=5,by=5,to=40)))
axis(2, at=c(.5,.75,1))
par(las=0)
mtext("Prop. Correct",side=2, line=2.5,cex=1.5)
# Now add the distribution:
pc.dens <- density(pc)
polygon(c(0,pc.dens$y,0,0), c(pc.dens$x[1]-.01,pc.dens$x,pc.dens$x[1]+.01,
pc.dens$x[1]-.01), col="green")
In [173]:
# plot 6.10
#windows()
par(cex.main = 1.5, mar = c(5, 6, 4, 5) + 0.1, mgp = c(3.5, 1, 0), cex.lab = 1.5,
font.lab = 2, cex.axis = 1.3, bty = "n", las=1)
plot(k, summary(samples)$summary[paste("z[", 1:p, "]", sep=""), 1], ylim=c(0,1),
xlim=c(0,n), lwd=2, pch=4, xlab= "Number of Items Recalled Correctly",
ylab="Cheater Classification")
# in the code, z=0 is bonafide and z=1 is cheating
# so z gives the prob of being assigned to cheating group