Example 3: Parameter-Free Bayesian Kernel Density Estimation


In [1]:
using Distributions
using Gadfly
using RDatasets;

First, simulate some data from $p(x) = \frac{1}{2}\phi(x | 2, 0.25) + \frac{1}{2}\phi(x | 10, 1)$ [eq 28, reparametrized with precision rather than variance]:


In [2]:
# Draw n samples from the two-component Gaussian mixture
#     p(x) = w1 * N(μ1, 1/ϕ1) + w2 * N(μ2, 1/ϕ2)
# where ϕ1, ϕ2 are precisions (inverse variances) and w1 + w2 = 1.
# Returns an n-by-1 Array{Float64}; on invalid input returns an empty 0-by-1 array.
TwoComponentGaussianMixtureSamples = function(n::Int64,
                                              w1::Float64, μ1::Float64, ϕ1::Float64,
                                              w2::Float64, μ2::Float64, ϕ2::Float64)
    # Validate with isapprox rather than exact float equality: e.g. 0.3 + 0.7 != 1.0
    # in binary floating point, which would spuriously reject valid weights.
    if !isapprox(w1 + w2, 1.0) || (ϕ1 <= 0) || (ϕ2 <= 0) || n <= 0
        # typed empty matrix (instead of the Any[] literal) keeps the return
        # type Array{Float64,2} on every path
        return zeros(0,1)
    end
    
    x = zeros(n,1)
    σ1 = ϕ1^-0.5 # transform precision to standard deviation
    σ2 = ϕ2^-0.5
    for ii = 1:n
        # pick the component with probability w1, then draw from that Gaussian
        if rand(Uniform()) <= w1
            x[ii] = rand(Normal(μ1, σ1))
        else
            x[ii] = rand(Normal(μ2, σ2))
        end
    end
    return x
end;

In [9]:
# sample 50 draws from this mixture
X = TwoComponentGaussianMixtureSamples(50, 0.5, 2., .25, 0.5, 10., 1.);

In [10]:
# cf the histogram underlying figure 6
plot(x=X, Geom.histogram(density=true, bincount=10), Guide.xlabel("x"), Guide.ylabel("density"))


Out[10]:
x -30 -25 -20 -15 -10 -5 0 5 10 15 20 25 30 35 40 -25 -24 -23 -22 -21 -20 -19 -18 -17 -16 -15 -14 -13 -12 -11 -10 -9 -8 -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 -40 -20 0 20 40 -25 -20 -15 -10 -5 0 5 10 15 20 25 30 35 -0.25 -0.20 -0.15 -0.10 -0.05 0.00 0.05 0.10 0.15 0.20 0.25 0.30 0.35 0.40 0.45 -0.20 -0.19 -0.18 -0.17 -0.16 -0.15 -0.14 -0.13 -0.12 -0.11 -0.10 -0.09 -0.08 -0.07 -0.06 -0.05 -0.04 -0.03 -0.02 -0.01 0.00 0.01 0.02 0.03 0.04 0.05 0.06 0.07 0.08 0.09 0.10 0.11 0.12 0.13 0.14 0.15 0.16 0.17 0.18 0.19 0.20 0.21 0.22 0.23 0.24 0.25 0.26 0.27 0.28 0.29 0.30 0.31 0.32 0.33 0.34 0.35 0.36 0.37 0.38 0.39 0.40 0.41 -0.2 0.0 0.2 0.4 -0.20 -0.18 -0.16 -0.14 -0.12 -0.10 -0.08 -0.06 -0.04 -0.02 0.00 0.02 0.04 0.06 0.08 0.10 0.12 0.14 0.16 0.18 0.20 0.22 0.24 0.26 0.28 0.30 0.32 0.34 0.36 0.38 0.40 0.42 density

In [44]:
# beginning with Xobs, simulate an additional N-1 observations from the coherent predictive distribution 
# Returns allX: a column vector holding the nobs original observations followed by
# the N-1 simulated ones. τ is the kernel bandwidth (variance) parameter.
genSamplePath = function(Xobs::Array{Float64}, N::Int64, τ::Float64)
    
    allX = vcat(Xobs, zeros(N-1,1))
    nobs = size(Xobs,1)
    
    # build up the η and ν parameters to ii = nobs
    # η, ν1, ν2 are running products driven by the ratio ρ; they are advanced once
    # per observation here so the forward loop below can continue the same recursion.
    # NOTE(review): recursion follows the paper this notebook reproduces (figure 6 /
    # eq 28 context) — confirm against the paper before altering.
    η = 1
    ν1 = 1
    ν2 = 1
    for ii = 1:nobs
        ρ  = (N + nobs - ii) / (N + nobs - ii + 1)
        η  = η * ρ^-1
        ν1 = ν1 * (2 - ρ) / ρ^2
        ν2 = ν2 * ρ^-2
    end
    
    
    # now do the forward simulations
    for ii = (nobs + 1):(nobs + N - 1)
        u = sample(allX[1:(ii - 1)]) # could do a bunch of these in parallel      
        
        # advance the η/ν recursion one step for this draw
        ρ  = (N + nobs - ii) / (N + nobs - ii + 1)
        η  = η * ρ^-1
        ν1 = ν1 * (2 - ρ) / ρ^2
        ν2 = ν2 * ρ^-2
        ν = ν1 - ν2
        # moment-match a lognormal: these μ, σ give a LogNormal with
        # mean = η - 1 and variance = ν
        μ = 2 * log(η - 1) - 0.5 * log(ν + (η - 1)^2)
        σ = sqrt(log(1 + ν / (η - 1)^2))
                    
        s = rand(LogNormal(μ, σ)) # could do a bunch of these in parallel too
        
        # new point: Gaussian kernel centered at a uniformly resampled previous
        # point u, with variance τ scaled by the random inflation factor (s + 1)
        allX[ii,1] = rand(Normal(u, sqrt(τ * (s + 1)))) # and here too
    end

    return allX
end;

In [12]:
# given a set of points Xobs and a bandwidth parameter τ (a variance), return a
# closure that evaluates the Gaussian kernel density estimate at any point(s) x
genKDEfunction = function(Xobs::Array{Float64}, τ::Float64)
    npts = size(Xobs, 1)
    σ = sqrt(τ) # kernel standard deviation, shared by every component
    kdef = function(x)
        # accumulate the kernel contributions from each data point, then average
        total = zero(x)
        for jj = 1:npts
            total = total .+ pdf(Normal(Xobs[jj], σ), x)
        end
        return total ./ npts
    end
    return kdef
end;

In [13]:
# Draw `nsamples` posterior density curves: each draw extends Xobs to a full
# sample path of length N + nobs - 1 via genSamplePath, then evaluates the
# resulting Gaussian KDE (bandwidth τ) on xgrid. Returns a
# length(xgrid)-by-nsamples matrix, one curve per column.
genPosteriorCurves = function(Xobs::Array{Float64}, xgrid::FloatRange{Float64}, N::Int64, τ::Float64, nsamples::Int64)
    curves = zeros(size(xgrid,1), nsamples)
    for jj = 1:nsamples
        pathX = genSamplePath(Xobs, N, τ)
        curves[:,jj] = genKDEfunction(pathX, τ)(xgrid)
    end
    return curves
end;

In [14]:
# Convert a matrix of posterior curves (one per column) into a vector of Gadfly
# line layers over xgrid, ready to be splatted into plot().
genLayersForPlotting = function(xgrid::FloatRange{Float64}, posterior_curves::Array{Float64})
    ncurves = size(posterior_curves, 2)
    layerstack = Layer[]
    for jj = 1:ncurves
        # layer(...) returns a one-element collection; keep its single Layer
        lyr = layer(x=xgrid, y=posterior_curves[:,jj], Geom.line)
        push!(layerstack, lyr[1])
    end
    return layerstack
end;

In [17]:
# overlay the sampled curves, the mean curve, with the original data histogram 
# Xobs: the original observations (drawn as a translucent density histogram);
# posterior_curves: length(xgrid)-by-nsamples matrix of sampled KDE curves.
# The column-wise mean curve is drawn in black on top of the individual samples.
genKDEPlot = function(Xobs::Array{Float64}, xgrid::FloatRange{Float64}, posterior_curves::Array{Float64})
    layers = genLayersForPlotting(xgrid, posterior_curves)
    plot(layer(x=Xobs, Geom.histogram(density=true, bincount=10), 
         Theme(default_color=colorant"rgba(1,1,1,0.1)", bar_highlight=colorant"grey")),
     layer(x=xgrid, y=mean(posterior_curves, 2), Geom.line, Theme(default_color=colorant"black")),
     layers,
     Guide.xlabel("x"), Guide.ylabel("density"))
end;

In [45]:
# sample from the posterior distribution and generate figure 6
# evaluation grid spanning both mixture components (means 2 and 10)
xgrid = -5:.1:15
N = 1000 # number additional points to simulate
τ = 0.08 # bandwidth parameter
nsamples = 1000 # number Monte Carlo draws to simulate
posterior_curves = genPosteriorCurves(X, xgrid, N, τ, nsamples)
genKDEPlot(X, xgrid, posterior_curves)


Out[45]:
x -30 -25 -20 -15 -10 -5 0 5 10 15 20 25 30 35 40 -25 -24 -23 -22 -21 -20 -19 -18 -17 -16 -15 -14 -13 -12 -11 -10 -9 -8 -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 -40 -20 0 20 40 -25 -20 -15 -10 -5 0 5 10 15 20 25 30 35