Hamiltonian Monte Carlo on the GPU

In this Jupyter notebook we show how to use the GPU seamlessly in a scripting mode with Alea GPU V3. We develop an matrix expression GPU expression library and use it to perform Hamiltonian Monte Carlo simulation on the GPU. The highlights of the notebook are:

  • A light weight GPU matrix expression library which is capable to do elementwise kernel fusion to minimize creation of temporaries and reduce number of GPU kernels to be called

  • Using the power of F# computation expression and opeator overloading to get concise syntax close to mathematical notation

  • Showing how to develop Hamiltonian Monte Carlo from scratch and apply it to sample the posterior distribution of a multinomial regression model

  • Illustrating the concpets with the MNIST data set

Background

Bayesian inference requires algorithms capable to fit models with thousands of parameters and complex nonlinearities. Hamiltonian Monte Carlo initially introduced by Duan, Kennedy, Pendleton and Roweth in 1987 has proven to be extremely successful. Hamiltonian Monte Carlo (HMC) is a variation of the Markov Chain Monte Carlo (MCMC) methods which use the derivative of the posterior distribution to select the step direction and size. Therefore, HMC preferentially choose directions of increasing probability.

There are several papers explaining HMC in detail.

  • Radford M. Neal: MCMC using Hamiltonian dynamics, Handbook of Markov Chain Monte Carlo, Chapter 5, 2011
  • Michael Betancourt, Simon Byrne, Sam Livingstone, and Mark Girolami: The Geometric Foundations of Hamiltonian Monte Carlo, 2014
  • Michael Betancourt: A Conceptual Introduction to Hamiltonian Monte Carlo, 2017

Let's Get Started!

Before you continue you should install the F# kernel for Jupyter.

We use Paket to get the newest version 3.0.3 beta2 NuGet packages of Alea GPU. You can run Alea GPU free on any CUDA capable GeForce or Quadro GPU. In case you want to run it on an enterprise GPU such as a Tesla K80 or the brand new NVIDIA P100 you need a license.

Unfortunately as of now (January 2017), it is not possible to run this notebook on Azure Notebooks with server side execution because the Azure Notebooks service does not yet provide GPU instances.


In [1]:
#load "Paket.fsx"
Paket.Version [ ("Alea", "3.0.3-beta2") ]
Paket.Package [ "NUnit" ]


Out[1]:
<null>

In [2]:
#load "packages/Alea/Alea.fsx"
#r "packages/Alea/lib/net45/Alea.Parallel.dll"
#r "packages/NUnit/lib/net45/nunit.framework.dll"

In [3]:
#load "XPlot.Plotly.Paket.fsx"
#load "XPlot.Plotly.fsx"
open XPlot.Plotly


Inspect the GPU

We first check what GPU is available. As we only need one GPU for our computations, we select the default GPU.


In [4]:
open System
open System.IO
open System.IO.Compression
open System.Linq
open System.Threading.Tasks
open System.Net
open System.Diagnostics
open System.Collections.Generic
open Microsoft.FSharp.Quotations
open Alea
open Alea.CSharp
open Alea.Parallel
open Alea.cuBLAS
open Alea.cuRAND

let gpu = Gpu.Default
let s = sprintf "GPU is %A, Number of Cores %A, GPU DRAM is %.3f GB, Process is %d bit"
           gpu
           gpu.Device.Cores
           ((float) gpu.Device.TotalMemory / 1024. / 1024. / 1024.)
           (IntPtr.Size * 8)
           
{ Html = s }


Out[4]:
GPU is [0|6.0|Tesla P100-PCIE-16GB], Number of Cores 3584, GPU DRAM is 15.879 GB, Process is 64 bit

Utilities


In [5]:
let divUp num den = (num + den - 1) / den

let array2DAsLists (a:'T[,]) =
    List.init (a.GetLength 0) (fun i -> List.init (a.GetLength 1) (fun j -> a.[i, j]))

// flatten a 2D array to an array in column major storage order
let flattenArray2D (a:'T[,]) =
    let numRows = a.GetLength 0
    let numCols = a.GetLength 1
    let aflat = Array.zeroCreate (numRows*numCols)
    let mutable k = 0
    for j = 0 to numCols - 1 do
        for i = 0 to numRows - 1 do
            aflat.[k] <- a.[i, j]
            k <- k + 1
    aflat

// get a column 
let columnVec (a:'T[,]) col =
    let numRows = a.GetLength 0
    let columnVec = Array.zeroCreate numRows
    for i = 0 to numRows - 1 do
        columnVec.[i] <- a.[i, col]
    columnVec
    
// reconstruct 2D array from a flat array in column major storage order 
let array2DColMajor numRows numCols (aflat:'T[]) =
    let a = Array2D.zeroCreate numRows numCols
    let mutable k = 0
    for j = 0 to numCols - 1 do
        for i = 0 to numRows - 1 do
            a.[i, j] <- aflat.[k]
            k <- k + 1
    a

// reconstruct 2D array from a flat array in row major storage order 
let array2DRowMajor numRows numCols (aflat:'T[]) =
    let a = Array2D.zeroCreate numRows numCols
    let mutable k = 0
    for i = 0 to numRows - 1 do
        for j = 0 to numCols - 1 do
            a.[i, j] <- aflat.[k]
            k <- k + 1
    a
    
// matrix product a*b with a of dimension m x k, b of dimension k x n, c of dimension m x n, all in Fortran column major storage
let inline multiplyCpu m n k (a:'T[]) (b:'T[]) =
    let lda = m
    let ldb = k
    let ldc = m
    let c = Array.zeroCreate (m*n)
    for row = 0 to m - 1 do
        for col = 0 to n - 1 do
            let mutable sum = LanguagePrimitives.GenericZero
            for l = 0 to k - 1 do 
                sum <- sum + a.[row + l*lda] * b.[l + col*ldb]
            c.[row + col*ldc] <- sum
    c

// matrix product a^t*b with a of dimension k x m, b of dimension k x n, c of dimension m x n, all in Fortran column major storage
let inline multiplyTranspCpu m n k (a:'T[]) (b:'T[]) =
    let lda = k
    let ldb = k
    let ldc = m
    let c = Array.zeroCreate (m*n)
    for row = 0 to m - 1 do
        for col = 0 to n - 1 do
            let mutable sum = LanguagePrimitives.GenericZero
            for l = 0 to k - 1 do 
                sum <- sum + a.[l + row*lda] * b.[l + col*ldb]
            c.[row + col*ldc] <- sum
    c
    
// transpose a matrix in Fortran column major storage format
let inline transpose numRows numCols (aflat:'T[]) =
    let atflat = Array.zeroCreate (numRows*numCols)
    for i = 0 to numRows - 1 do 
        for j = 0 to numCols - 1 do
            atflat.[i*numCols + j] <- aflat.[j*numRows + i] 
    atflat

let inline error (a:'T[]) (b:'T[]) = Array.map2 (fun a b -> abs (a - b)) a b |> Array.max

GPU Matrix Expression Library


In [6]:
let [<Literal>] UseFortranOrder = true

type Session(gpu:Gpu) =
    inherit DisposableObject()

    let session = new Alea.Session(gpu)
    let blas = new cuBLAS.Blas(gpu)
    let rand = cuRAND.Generator.CreateGpu(gpu, cuRAND.RngType.PSEUDO_DEFAULT)

    let memoryPoolAll = List<DeviceMemory<byte>>()
    let memoryPoolBin = List<DeviceMemory<byte>>()
    let mutable disposed = false

    override this.Dispose(disposing) =
        if disposing then
            lock this <| fun _ ->
                if not disposed then
                    memoryPoolAll |> Seq.iter (fun memory -> memory.Dispose())
                    memoryPoolAll.Clear()
                    memoryPoolBin.Clear()
                    blas.Dispose()
                    rand.Dispose()
                    session.Dispose()
                    disposed <- true
        base.Dispose(disposing)

    member this.Gpu = gpu
    member this.Session = session
    member this.BLAS = blas
    member this.RAND = rand

    member this.Allocate<'T>(size:int) =
        lock this <| fun _ ->
            if not disposed then
                let size = (nativeint size) * (nativeint (Gpu.SizeOf<'T>()))
                let idx = memoryPoolBin.FindIndex(fun memory -> memory.Size >= size)
                if idx < 0 then // not found
                    let memory = gpu.AllocateDevice<byte>(size)
                    memoryPoolAll.Add(memory)
                    deviceptr<'T>(memory.Ptr.Handle)
                else
                    let memory = memoryPoolBin.[idx]
                    memoryPoolBin.RemoveAt(idx)
                    deviceptr<'T>(memory.Ptr.Handle)
            else failwith "Session is disposed!"

    member this.Release(ptr:deviceptr<'T>) =
        lock this <| fun _ ->
            if not disposed then
                let idx = memoryPoolAll.FindIndex(fun memory -> memory.Handle = ptr.Handle)
                if idx >= 0 then // find
                    let memory = memoryPoolAll.[idx]
                    let compare = System.Comparison<DeviceMemory<byte>>
                                    (fun memory1 memory2 ->
                                        let x = memory1.Size
                                        let y = memory2.Size
                                        if x < y then -1 elif x > y then 1 else 0)
                    memoryPoolBin.Add(memory)
                    memoryPoolBin.Sort(compare)

type Calculation<'T>(f:Session -> 'T) =
    member this.Invoke b = f b

type CalculationBuilder() =
    member this.Bind(x:Calculation<'A>, res:'A -> Calculation<'B>) =
        Calculation(fun b -> let r = x.Invoke(b) in res(r).Invoke(b))
    member this.Return(x:'A) = Calculation(fun b -> x)
    member this.ReturnFrom(x:'A) = x
    member this.Zero() = Calculation(fun b -> ())
    member this.For(elements:seq<'A>, forBody:'A -> Calculation<unit>) =
        Calculation(fun b -> elements |> Seq.iter (fun e -> (forBody e).Invoke(b); GC.Collect(); GC.WaitForPendingFinalizers()))
    member this.Combine(partOne:Calculation<unit>, partTwo:Calculation<'A>) =
        Calculation(fun b -> partOne.Invoke(b); partTwo.Invoke(b))
    member this.Delay(restOfComputation:unit -> Calculation<'A>) =
        Calculation(fun b -> restOfComputation().Invoke(b))
    member this.Using<'A, 'B when 'A :> IDisposable>(x:'A, res:'A -> Calculation<'B>) =
        Calculation(fun b -> try res(x).Invoke(b) finally x.Dispose())

let gpuRun<'T> session (calculation:Calculation<'T>) =
    calculation.Invoke session 

let gpucalc = CalculationBuilder()

type GpuMatFill<'T> =
    | Undefined
    | PreAllocated of dmem:DeviceMemory<'T>
    | FillFunc of fill:(deviceptr<'T> -> unit)
    | Elementwise of emit:Func<int, 'T>
    | Transpose of mat:GpuMat<'T>

    member this.Fill (session:Alea.Session) (rows:int) (cols:int) (ptr:deviceptr<'T>) =
        match this with
        | Undefined -> ()
        | PreAllocated _ -> ()
        | FillFunc fill -> fill ptr
        | Elementwise emit -> session.For(0, rows*cols, (fun i -> ptr.Set(i, emit.Invoke i)))
        | Transpose mat -> failwith "Transpose is TODO."

and GpuMat<'T>(session:Session, rows:int, cols:int, initFill:GpuMatFill<'T>) =
    inherit DisposableObject()
    let size = rows * cols
    let mutable _fill = initFill
    let mutable ptrOpt =
        match _fill with
        | PreAllocated memory -> Some memory.Ptr
        | _ -> None

    override this.Dispose(disposing) =
        match ptrOpt with 
        | Some ptr -> 
            session.Release(ptr)
            ptrOpt <- None
        | _ -> ()
        GC.KeepAlive(initFill)
        base.Dispose(disposing)

    member inline this.HasMemory = ptrOpt.IsSome

    member inline this.Fill (ptr:deviceptr<'T>) =
        _fill.Fill session.Session rows cols ptr

    member inline this.Ptr =
        match ptrOpt with
        | Some ptr -> ptr
        | None ->
            let ptr = session.Allocate<'T>(size)
            this.Fill ptr
            // this memory is owned by this mat, so we change the fill method
            _fill <- Elementwise (Func<_,_> (fun i -> ptr.[i])) 
            ptrOpt <- Some ptr
            ptr

    member inline this.ElementwiseEmit (targetRows:int) (targetCols:int) =
        if rows = targetRows && cols = targetCols then            
            match _fill with
            | Elementwise emit -> emit
            | _ -> 
                let ptr = this.Ptr
                Func<_,_> (fun i -> ptr.[i])
        elif rows = 1 && cols = 1 then
            match _fill with
            | Elementwise emit -> Func<_,_> (fun i -> emit.Invoke 0)
            | _ ->
                let ptr = this.Ptr
                Func<_,_> (fun i -> ptr.[0])
        else
            failwithf "Broadcast from (%d,%d) to (%d,%d) isn't supported." rows cols targetRows targetCols

    member inline this.MatrixForBlas =
        match UseFortranOrder with
        | true ->
            match _fill with
            | Transpose mat -> mat.Ptr, cuBLAS.Operation.T, this.Cols
            | _ -> this.Ptr, cuBLAS.Operation.N, this.Rows
        | false ->
            failwith "TODO: Add a kernel to do transpose"

    member this.Session = session
    member this.Gpu = session.Gpu
    member this.Rows = rows
    member this.Cols = cols
    member this.Size = size

    member this.T =
        let fill = Transpose this
        new GpuMat<'T>(session, cols, rows, fill)

    member this.ToHost() =
        let ptr = this.Ptr
        if not UseFortranOrder || rows = 1 || cols = 1 then
            let array = Array2D.zeroCreate<'T> rows cols
            Gpu.Copy(session.Gpu, ptr, array, 0L, int64 size)
            array
        else
            // we need transpose the matrix to get C order for
            let array = Array2D.zeroCreate<'T> cols rows
            Gpu.Copy(session.Gpu, ptr, array, 0L, int64 size)
            Array2D.init rows cols (fun row col -> array.[col, row])

    static member inline (.<-) (lhs:GpuMat<float>, rhs:GpuMat<float>) =
        rhs.Fill lhs.Ptr

    static member inline ElementwiseTransform(a:GpuMat<'T>, op:Func<'T, 'T>) =
        let session = a.Session
        let rows = a.Rows
        let cols = a.Cols
        let emitA = a.ElementwiseEmit rows cols
        let fill = Elementwise (Func<_,_> (fun i ->
            let mutable a = emitA.Invoke i
            let result = op.Invoke(a)
            // workaround to avoid tail instruction
            a <- __default_value<'T>()
            result))
        new GpuMat<'T>(session, rows, cols, fill)

    static member inline ElementwiseTransform(a:GpuMat<'T>, b:GpuMat<'T>, op:Func<'T, 'T, 'T>) =
        let session = a.Session
        let rows = max a.Rows b.Rows
        let cols = max a.Cols b.Cols
        let emitA = a.ElementwiseEmit rows cols
        let emitB = b.ElementwiseEmit rows cols
        let fill = Elementwise (Func<_,_> (fun i ->
            let mutable a = emitA.Invoke i
            let mutable b = emitB.Invoke i
            let result = op.Invoke(a, b)
            // workaround to avoid tail instruction
            a <- __default_value<'T>()
            b <- __default_value<'T>()
            result))
        new GpuMat<'T>(session, rows, cols, fill)

    static member inline ElementwiseTransform(a:GpuMat<'T>, b:'T, op:Func<'T, 'T, 'T>) =
        let session = a.Session
        let rows = a.Rows
        let cols = a.Cols
        let emitA = a.ElementwiseEmit rows cols
        let fill = Elementwise (Func<_,_> (fun i ->
            let mutable a = emitA.Invoke i
            let mutable b = b
            let result = op.Invoke(a, b)
            // workaround to avoid tail instruction
            a <- __default_value<'T>()
            b <- __default_value<'T>()
            result))
        new GpuMat<'T>(session, rows, cols, fill)

    static member inline ElementwiseTransform(a:'T, b:GpuMat<'T>, op:Func<'T, 'T, 'T>) =
        let session = b.Session
        let rows = b.Rows
        let cols = b.Cols
        let emitB = b.ElementwiseEmit rows cols
        let fill = Elementwise (Func<_,_> (fun i ->
            let mutable a = a
            let mutable b = emitB.Invoke i
            let result = op.Invoke(a, b)
            // workaround to avoid tail instruction
            a <- __default_value<'T>()
            b <- __default_value<'T>()
            result))
        new GpuMat<'T>(session, rows, cols, fill)

    static member (~-) (a:GpuMat<float>) =
        GpuMat<float>.ElementwiseTransform(a, (fun a -> -a))

    static member (.+) (a:GpuMat<float>, b:GpuMat<float>) =
        GpuMat<float>.ElementwiseTransform(a, b, (fun a b -> a + b))

    static member (.+) (a:float, b:GpuMat<float>) =
        GpuMat<float>.ElementwiseTransform(a, b, (fun a b -> a + b))

    static member (.+) (a:GpuMat<float>, b:float) =
        GpuMat<float>.ElementwiseTransform(a, b, (fun a b -> a + b))

    static member (.-) (a:GpuMat<float>, b:GpuMat<float>) =
        GpuMat<float>.ElementwiseTransform(a, b, (fun a b -> a - b))

    static member (.-) (a:float, b:GpuMat<float>) =
        GpuMat<float>.ElementwiseTransform(a, b, (fun a b -> a - b))

    static member (.*) (a:GpuMat<float>, b:GpuMat<float>) =
        GpuMat<float>.ElementwiseTransform(a, b, (fun a b -> a * b))

    static member (.*) (a:float, b:GpuMat<float>) =
        GpuMat<float>.ElementwiseTransform(a, b, (fun a b -> a * b))

    static member (./) (a:GpuMat<float>, b:GpuMat<float>) =
        GpuMat<float>.ElementwiseTransform(a, b, (fun a b -> a / b))

    static member (./) (a:GpuMat<float>, b:float) =
        GpuMat<float>.ElementwiseTransform(a, b, (fun a b -> a / b))

    static member ( * ) (a:GpuMat<float>, b:GpuMat<float>) =
        let session = a.Session
        let rowsA = a.Rows
        let rowsB = b.Rows
        let rowsC = rowsA
        let colsA = a.Cols
        let colsB = b.Cols
        let colsC = colsB

        let fill = 
            match UseFortranOrder with
            | true ->
                FillFunc <| fun (cptr:deviceptr<float>) ->
                    let ldc = rowsC
                    let aptr, opA, lda = a.MatrixForBlas
                    let bptr, opB, ldb = b.MatrixForBlas
                    session.BLAS.Gemm(opA, opB, rowsC, colsC, rowsB, 1.0, aptr, lda, bptr, ldb, 0.0, cptr, ldc)
                    GC.KeepAlive(a)
                    GC.KeepAlive(b)
            | false ->
                failwith "TODO"

        new GpuMat<float>(session, rowsC, colsC, fill)

let undef (rows:int) (cols:int) =
    Calculation<GpuMat<float>> <| fun session ->
        new GpuMat<float>(session, rows, cols, Undefined)

let zeros (rows:int) (cols:int) =
    Calculation<GpuMat<float>> <| fun session ->
        let fill = Elementwise (Func<_,_> (fun i -> 0.0))
        new GpuMat<float>(session, rows, cols, fill)

let ones (rows:int) (cols:int) =
    Calculation<GpuMat<float>> <| fun session ->
        let fill = Elementwise (Func<_,_> (fun i -> 1.0))
        new GpuMat<float>(session, rows, cols, fill)

let map (gmat:GpuMat<'T>) (op:Func<'T, 'T>) =
    GpuMat<'T>.ElementwiseTransform(gmat, op)

let gpuLog (gmat:GpuMat<float>) =
    GpuMat<float>.ElementwiseTransform(gmat, Func<_,_> log)

let reduce (mat:GpuMat<'T>) (op:Func<'T, 'T, 'T>) =
    let session = mat.Session
    let fill = GpuMatFill<'T>.FillFunc <| fun (ptr:deviceptr<'T>) ->
        let emit = mat.ElementwiseEmit mat.Rows mat.Cols
        session.Session.Aggregate(mat.Size, emit, (fun v -> ptr.Set(0, v)), op)
        GC.KeepAlive(mat)
    new GpuMat<'T>(session, 1, 1, fill)

let sum (mat:GpuMat<float>) =
    reduce mat (Func<_,_,_> (+))

let randn (rows:int) (cols:int) (mean:float) (stddev:float) =
    Calculation<GpuMat<float>> <| fun session ->
        let fill = FillFunc <| fun (ptr:deviceptr<float>) ->
            session.RAND.GenerateNormal(ptr, (rows*cols) |> uint64, mean, stddev)
        new GpuMat<float>(session, rows, cols, fill)

let randnInplace (mat:GpuMat<float>) (mean:float) (stddev:float) =
    let session = mat.Session
    session.RAND.GenerateNormal(mat.Ptr, mat.Size |> uint64, mean, stddev)

let session () =
    Calculation<Session> <| fun session -> session

let toHost (mat:GpuMat<'T>) =
    mat.ToHost()

let head (mat:GpuMat<'T>) = 
    mat.ToHost().[0,0]

let toDevice (host:'T[,]) =
    let rows = host.GetLength 0
    let cols = host.GetLength 1
    Calculation<GpuMat<'T>> <| fun session ->
        match UseFortranOrder with
        | true ->
            let flattenedArray2D =
                let aflat = Array.zeroCreate (rows*cols)
                let mutable k = 0
                for j = 0 to cols - 1 do
                    for i = 0 to rows - 1 do
                        aflat.[k] <- host.[i, j]
                        k <- k + 1
                aflat
            let memory = session.Gpu.AllocateDevice(flattenedArray2D)
            let fill = PreAllocated memory
            new GpuMat<'T>(session, rows, cols, fill)
        | false -> 
            failwith "TODO for C order"

Using the Matrix Expression Library


In [7]:
let calc = gpucalc {
    let! a = ones 4 5
    let! b = ones 3 5

    let c = a * b.T

    return (toHost c) 
}

let session = new Session(Gpu.Default)
let result = calc |> gpuRun session 
result


Out[7]:
[[5.0; 5.0; 5.0]
 [5.0; 5.0; 5.0]
 [5.0; 5.0; 5.0]
 [5.0; 5.0; 5.0]]

In [8]:
let n = 100
let m = 50
let k = 75

let calc = gpucalc {
    let! A = randn m k 0.0 1.0
    let! B = randn k n 0.0 1.0
    let! C = undef m n

    C .<- A * B

    let hC = toHost C |> flattenArray2D
    let hhC = multiplyCpu m n k (toHost A |> flattenArray2D) (toHost B |> flattenArray2D)

    return (error hC hhC)
}

let session = new Session(Gpu.Default)
let err = calc |> gpuRun session 
err


Out[8]:
1.065814104e-14

In [9]:
let n = 100
let m = 50
let k = 75

let calc = gpucalc {
    let! A = randn k m 0.0 1.0
    let! B = randn k n 0.0 1.0
    let! C = undef m n

    C .<- A.T * B

    let hC = toHost C |> flattenArray2D
    let hhC = multiplyTranspCpu m n k (toHost A |> flattenArray2D) (toHost B |> flattenArray2D)

    return (error hC hhC)
}

let session = new Session(Gpu.Default)
let err = calc |> gpuRun session 
err


Out[9]:
1.065814104e-14

MNIST Dataset

Download and Prepare Dataset


In [10]:
let url = @"http://yann.lecun.com/exdb/mnist/"
let trainImagesFile = @"train-images-idx3-ubyte"
let trainLabelsFile = @"train-labels-idx1-ubyte"
let testImagesFile = @"t10k-images-idx3-ubyte"
let testLabelsFile = @"t10k-labels-idx1-ubyte"

let (@@) a b = System.IO.Path.Combine(a, b)

let decompress fileName =
    let fileToDecompress = new FileInfo(fileName + ".gz")  
    use originalFileStream = fileToDecompress.OpenRead()
    use decompressedFileStream = File.Create(fileName)
    use decompressionStream = new GZipStream(originalFileStream, CompressionMode.Decompress)
    decompressionStream.CopyTo(decompressedFileStream)

let get url file downloadTo =
    use client = new WebClient()
    client.DownloadFile(url + file + ".gz", (downloadTo @@ file) + ".gz")
    decompress (downloadTo @@ file)

let downLoad url downloadTo =
    let files = [trainImagesFile; trainLabelsFile; testImagesFile; testLabelsFile]
    Directory.CreateDirectory downloadTo |> ignore
    files |> List.iter (fun file -> if not <| File.Exists(downloadTo @@ file) then get url file downloadTo)
    downloadTo

let readData (brImages:BinaryReader) (brLabels:BinaryReader) numSamples =
    let npix, nclasses = 28, 10
    let images = Array2D.zeroCreate<float32> numSamples (npix*npix)
    let labels = Array2D.zeroCreate<float32> numSamples nclasses
    for i = 0 to numSamples - 1 do
        for x = 0 to npix - 1 do 
            for y = 0 to npix - 1 do 
                images.[i, x*npix + y] <- (brImages.ReadByte() |> float32)/255.0f
        labels.[i, brLabels.ReadByte() |> int] <- 1.0f
    images, labels

let skipImages (brImages:BinaryReader) =
    brImages.ReadInt32() |> ignore // skip magic
    brImages.ReadInt32() |> ignore // skip num images
    brImages.ReadInt32() |> ignore // skip rows
    brImages.ReadInt32() |> ignore // skip cols

let skipLabels (brLabels:BinaryReader) =
    brLabels.ReadInt32() |> ignore // skip magic
    brLabels.ReadInt32() |> ignore // skip num labels

let loadData path =
    use ifsTestLabels = new FileStream(path @@ testLabelsFile, FileMode.Open)
    use ifsTestImages = new FileStream(path @@ testImagesFile, FileMode.Open)
    use ifsTrainLabels = new FileStream(path @@ trainLabelsFile, FileMode.Open)
    use ifsTrainImages = new FileStream(path @@ trainImagesFile, FileMode.Open)
    use brTestLabels = new BinaryReader(ifsTestLabels)
    use brTestImages = new BinaryReader(ifsTestImages)
    use brTrainLabels = new BinaryReader(ifsTrainLabels)
    use brTrainImages = new BinaryReader(ifsTrainImages)          
    skipImages brTestImages
    skipLabels brTestLabels
    skipImages brTrainImages
    skipLabels brTrainLabels

    let numTrain = 55000
    let numTest = 10000
    let numValidation = 60000 - numTrain

    let testImages, testLabels = readData brTestImages brTestLabels numTest
    let trainImages, trainLabels = readData brTrainImages brTrainLabels numTrain
    let validationImages, validationLabels = readData brTrainImages brTrainLabels numValidation
    testImages, testLabels, trainImages, trainLabels, validationImages, validationLabels
    
let path = downLoad url @"data/MNIST/"
let testImages, testLabels, trainImages, trainLabels, validationImages, validationLabels = loadData path
trainImages.GetLength(0), trainImages.GetLength(1), trainLabels.GetLength(0), trainLabels.GetLength(1)


Out[10]:
(55000, 784, 55000, 10)

Visualize Dataset


In [11]:
let imageClass (imageLabels:float32[,]) i =
    List.init 10 (fun j -> imageLabels.[i, j]) |> List.findIndex (fun j -> j > 0.0f)

let imageArray2D (images:float32[,]) i =
    let npix = 28
    let img = Array2D.zeroCreate<float> npix npix
    for x = 0 to npix - 1 do 
        for y = 0 to npix - 1 do 
            img.[npix - 1 - x, y] <- images.[i, x*npix + y] |> float
    img

let imageList (images:float32[,]) i = 
    imageArray2D images i |> array2DAsLists

let stack npix (images:float[,] list) nx ny =
    if images.Length <> nx*ny then failwith "wrong dimensions"
    let image = Array2D.zeroCreate (nx*npix) (ny*npix)
    for i = 0 to nx - 1 do
        for j = 0 to ny - 1 do 
            for x = 0 to npix - 1 do 
                for y = 0 to npix - 1 do
                    image.[i*npix + x, j*npix + y] <- images.[i*ny + j].[x, y]
    image
    
let checkerBoard (images:float32[,]) from nx ny =
    let npix = 28
    let images = [from..(from+nx*ny-1)] |> List.map (fun i -> imageArray2D images i)
    stack npix images nx ny |> array2DAsLists
    
let heatmap img labels w h =
    let layout = Layout(title = sprintf "Image %A" labels)
    Heatmap(z = img, colorscale = "YIGnBu")
    |> Chart.Plot
    |> Chart.WithLayout layout
    |> Chart.WithWidth w
    |> Chart.WithHeight h

In [12]:
let i = 133
let img = imageList trainImages i
let c = imageClass trainLabels i
heatmap img c 500 500


Out[12]:

In [13]:
let i, nx, ny = 133, 5, 5
let img = checkerBoard trainImages i nx ny
let c = [i..i+nx*nx-1] |> List.map (fun i -> imageClass trainLabels i)
heatmap img c 800 800


Out[13]:

Hamiltonian Monte Carlo

Hamiltonian Monte Carlo (HMC) is a popular Markov Chain Monte Carlo method as it is highly efficent and widly applicable. The idea behind the algorithm is to propose new points by simulating the dynamics fo a particle using a potential engery landscape deduced from the target distribution. The simulation is done using the Hamiltonian dynamics formualtion, which results in several useful properties of the HMC method.

Hamiltonian Dynamics

In the Hamiltonian dynamics the state of a physical system is described by a pair $(q,p)$, where $q$ is the position vector and $p$ is the momentum vector. The evolution of the system over time is described by Hamilton's equations

$$ \begin{align} \frac{d q_i}{dt} &= \frac{\partial H}{\partial p_i}, \\ \frac{d p_i}{dt} &= - \frac{\partial H}{\partial q_i}, \end{align} $$

where $H(t,q,p)$ is the Hamiltonian of the system.

HMC is based on a frictionless particle governed by the potential energy $U(q)$ and kinetic energy $K(p)$. In this case the Hamiltionian is given by $H(q,p) = U(q) + K(p)$, which is constant in time. The kinetic energy is given by $$ K(p) = \frac{1}{2} p^{\top} M^{-1} p $$ with $M$ the mass matrix.

The key advantage of the Hamiltonian dynamics over other formulations is that solutions of Hamiltion's equations exhibit three important properties:

  1. Reversibility: For every $s > 0$ he mapping $T_s$ from a state $q(t), p(t)$ to a state at time $t+s$ is a bijection. Hence by running time backward we can determine previous states. This is achieved by negating both time derivatives in Hamilton's equations.

  2. Volume preservation: $T_s$ conserves volume in the $(q,p)$ space.

  3. Conservation of the Hamiltonian: We have $\frac{dH(q,p)}{dt} = 0$ hence the Hamiltonian is invariant in time.

All these properties are useful in the application of HMC. However, not all of them can be preserved in numerical solutions. The leap frog method generates numerical solutions which maintain the reversibility and the volume preservation and also approximately converve the Hamiltonian.

Given a step size $\varepsilon$ the leap from method performs discrete updates $$ \begin{align} p_{i,t+1/2} &= p_{i,t} - \frac{\varepsilon}{2} \frac{\partial U}{\partial q}\left(q_{t}\right), \\ q_{i,t+1} &= q_{i,t} + \varepsilon \frac{\partial K}{\partial p}\left(p_{t+1/2}\right), \\ p_{i,t+1} &= p_{i,t+1/2} - \frac{\varepsilon}{2} \frac{\partial U}{\partial q}\left(q_{t+1}\right). \end{align} $$ First a half step for the momentum variables is computed, which is then used for a full position update, followed by a second momentum half step.

Hamiltonian Monte Carlo Algorithm

To apply the Hamiltonian dynamics we have to specify an appropriate energy function. In statistical mechanics the probability density $f_{target}(q)$ for observing a particle at position $q$ with engergy $E(q)$ is $f(q) \propto \exp(-E(q))$.

Given a target distribution $f_{target}(q)$ we reverse this relationship to find the appropriate potential energy $U(q)$ by $$ U(q) = - \log(f_{target}(q)) $$ where we can drop constants. In particular $f_{target}(q)$ does not need to be a normalized probability distribution. Note that $U(q)$ is just the negative log-likelihood, which directs particle movement towards low negative log-likelihood. With this potential energy the Hamiltonian becomes $$ H(q,p) = U(q) + K(p) $$ where commonly $K(p) = \frac{1}{2} p^{\top} M^{-1} p$. The additive structure of the Hamiltonian implies that the canonical distribution of $(q,p)$ factorizes $$ p(q,p) \propto \exp(H(q,p)) \propto f_{target}(q) \exp(-K(p)). $$

The HMC algorithm works as follows: given a current state $s_{t-1} = (q_{t-1}, p_{t-1})$ do

  1. Sample new momentum $p^*_{t-1}$ from $f_{kin} \propto \exp(-K(p))$.
  2. Simulate Hamiltonian dynamics starting from state $s^*_{t-1} = (q_{t-1}, p^*_{t-1})$ e.g. using $L$ leapfrog steps, generating a new state $(q^*_t, p^*_t)$.
  3. Generating a proposal position-momentum state $s^*_t = (q^*_t, -p^*_t)$.
  4. Compute the acceptance probability $p_{acc} = \min (1, \exp(-H(s^*_t) + H(s^*_{t-1}))$.
  5. Accept the move from $s^*_{t-1}$ to $s^*_t$ with probability $p_{acc}$.

Note that the proposal distribution is symetric because of the momentum negation in the third step and the reversibiity of the integration method, which is needed in the Metropolis-Hastings acceptance probability calculation.

Bayesian Multinomial Regression

Let $$ x_i = (x_{i1}, \ldots, x_{ip})^{\top}, \quad y_i = (y_{i1}, \ldots, y_{ic})^{\top}, y_{ik} \in \{0,1\} $$ be predictors and class indicators. with $$ X = (x_1, \ldots, x_n)^{\top} \in \mathbb{R}^{n \times p}, \quad Y = (y_1, \ldots, y_n)^{\top} \in \mathbb{R}^{n \times c} $$
the matrix of predictors and class indicators. The goal is to classify new observations into one of the $c$ classes. Let $$ B = (\beta_1, \ldots \beta_c) \in \mathbb{R}^{p \times c} $$ be the matrix of regression coefficients, one column for each class. The relationship between $x_i$ and $y_i$ is modeled with the softmax function $\psi : \mathbb{R}^{n \times c} \rightarrow \mathbb{R}^{n \times c}$ $$ P(y_{ik} = 1 \mid x_i, B) = \psi(X B)_{ik} = \frac{\exp\left(x_i^{\top} \beta_k \right)}{\sum_{j=1}^c \exp\left(x_i^{\top} \beta_j\right)}, $$ which transforms each row of $X B$ into a multinomial probability. The log-likelihood for $n$ observations is then $$ l(B\mid X, Y) = \sum_{i=1}^n \sum_{k=1}^c y_{ik} \log \psi(x_i, B). $$ For identifiability purpose we set $\beta_c = 0$, which is necessary because the multinomial probabilities add up to one.

It is common to select a Cauchy distribution as prior for the regression parameters $B$. Then the log posterior kernel is $$ \log(\kappa(B\mid X, Y)) = \sum_{i=1}^n \sum_{k=1}^c y_{ik} \log(\psi(X B)_{ik}) + \sum_{i=1}^n \sum_{k=1}^p - \log (1 + \beta_{ik}^2). $$

This expression can be written entirly in terms of sum reduction, elementwise operation and matrix multiplication. If $\Gamma_{ik} = -\log(1 + \beta_{ik}^2)$. Then $$ \log(\kappa(B\mid X, Y)) = \mathrm{sum}(Y \odot \psi(X B)) + \mathrm{sum}(\Gamma) $$ wiht $\odot$ denoting elementwise multiplication. The same holds for the derivative with respect to the parameters $B$ $$ \frac{\partial \log(\kappa(B\mid X, Y))}{\partial B} = X^{\top} (Y - \psi(XB)) + \Gamma $$ Both expressions can be conveniently implemented on the GPU so that all the values remain on the GPU all the time.

Custom GPU Kernels

To actually implement the above expressions we need some custom GPU kernels.


In [14]:
let softmaxCpu numRows numCols (values:float32[]) =
    let output = Array.zeroCreate (numRows*numCols)
    for row = 0 to numRows-1 do
        let mutable output = values
        let mutable sum = 0.0f    
        let mutable maxVal = values.[row]
        for col = 0 to numCols - 1 do 
            let v = values.[col*numRows + row]
            if v > maxVal then maxVal <- v
        for col = 0 to numCols - 1 do
            sum <- sum + exp(values.[col*numRows + row] - maxVal)
        for col = 0 to numCols - 1 do 
            if sum = 0.0f then
                printfn "division by zero"
            output.[col*numRows + row] <- exp(values.[col*numRows + row] - maxVal) / sum 
    output

[<ReflectedDefinition>]
let softmaxKernel32 numRows numCols (values:deviceptr<float32>) (results:deviceptr<float32>) = 
    let row = blockIdx.y*blockDim.y + threadIdx.y
    let mutable results = results
    let mutable sum = 0.0f    
    if row < numRows then 
        let mutable maxVal = values.[row]
        for col = 0 to numCols - 1 do 
            let v = values.[col*numRows + row]
            if v > maxVal then maxVal <- v
        for col = 0 to numCols - 1 do
            sum <- sum + exp(values.[col*numRows + row] - maxVal)
        for col = 0 to numCols - 1 do 
            results.[col*numRows + row] <- exp(values.[col*numRows + row] - maxVal) / sum 
            
[<ReflectedDefinition>]
let softmaxKernel numRows numCols (values:deviceptr<float>) (results:deviceptr<float>) = 
    let row = blockIdx.y*blockDim.y + threadIdx.y
    let mutable results = results
    let mutable sum = 0.0
    if row < numRows then 
        let mutable maxVal = values.[row]
        for col = 0 to numCols - 1 do 
            let v = values.[col*numRows + row]
            if v > maxVal then maxVal <- v
        for col = 0 to numCols - 1 do
            sum <- sum + exp(values.[col*numRows + row] - maxVal)
        for col = 0 to numCols - 1 do 
            results.[col*numRows + row] <- exp(values.[col*numRows + row] - maxVal) / sum 

[<ReflectedDefinition>]
let argMaxRowKernel numRows numCols (values:deviceptr<float>) (results:deviceptr<float>) = 
    let row = blockIdx.y*blockDim.y + threadIdx.y
    let mutable results = results
    if row < numRows then 
        let mutable maxVal = values.[row]
        let mutable argMax = 0
        for col = 0 to numCols - 1 do 
            let v = values.[col*numRows + row]
            if v > maxVal then 
                maxVal <- v
                argMax <- col
        results.[row] <- float argMax

[<ReflectedDefinition>]
let predictionErrorKernel numRows numCols (y:deviceptr<float>) (prediction:deviceptr<float>) (results:deviceptr<float>) = 
    let row = blockIdx.y*blockDim.y + threadIdx.y
    let mutable results = results
    if row < numRows then 
        let col = prediction.[row] |> int
        results.[row] <- 1.0 - y.[col*numRows + row]
                     
[<ReflectedDefinition>]
let setColumnKernel numRows numCols (values:deviceptr<'T>) (results:deviceptr<'T>) col v = 
    let row = blockIdx.y*blockDim.y + threadIdx.y
    let mutable results = results
    if row < numRows then
         results.[col*numRows + row] <- v

Integrating Custom GPU Kernels with Matrix Expression Library


In [15]:
let addIntercept (a:float32[,]) =
    let aintercept = Array2D.zeroCreate (a.GetLength 0) (1 + a.GetLength 1)
    for i = 0 to a.GetLength(0) - 1 do
        for j = 0 to a.GetLength(1) do
            aintercept.[i, j] <- if j = 0 then 1.0f else a.[i, j-1]
    aintercept

let softmaxf (a:GpuMat<float32>) = 
    if UseFortranOrder then
        let fill (ptr:deviceptr<float32>) =
            let blockSize = 64
            let block = new dim3(1, blockSize)
            let gridSize = divUp a.Rows blockSize
            let grid = new dim3(gridSize, gridSize)
            let lp = LaunchParam(grid, block)   
            a.Gpu.Launch(Action<_,_,_,_>softmaxKernel32, lp, a.Rows, a.Cols, a.Ptr, ptr)
        let fill = FillFunc fill
        new GpuMat<float32>(a.Session, a.Rows, a.Cols, fill)
    else
        failwith "softmax kernel requires Fortran storage order"

let softmax (a:GpuMat<float>) =
    if UseFortranOrder then
        let fill (ptr:deviceptr<float>) =
            let blockSize = 64
            let block = new dim3(1, blockSize)
            let gridSize = divUp a.Rows blockSize
            let grid = new dim3(gridSize, gridSize)
            let lp = LaunchParam(grid, block)   
            a.Gpu.Launch(Action<_,_,_,_> softmaxKernel, lp, a.Rows, a.Cols, a.Ptr, ptr)
        let fill = FillFunc fill
        new GpuMat<float>(a.Session, a.Rows, a.Cols, fill)
    else
        failwith "softmax kernel requires Fortran storage order"

let argMaxRows (a:GpuMat<float>) =
    if UseFortranOrder then
        let fill (ptr:deviceptr<float>) =
            let blockSize = 64
            let block = new dim3(1, blockSize)
            let gridSize = divUp a.Rows blockSize
            let grid = new dim3(gridSize, gridSize)
            let lp = LaunchParam(grid, block)   
            a.Gpu.Launch(Action<_,_,_,_> argMaxRowKernel, lp, a.Rows, a.Cols, a.Ptr, ptr)
        let fill = FillFunc fill
        new GpuMat<float>(a.Session, a.Rows, a.Cols, fill)
    else
        failwith "softmax kernel requires Fortran storage order"

let predictionError (y:GpuMat<float>) (prediction:GpuMat<float>) =
    if UseFortranOrder then
        let fill (ptr:deviceptr<float>) =
            let blockSize = 64
            let block = new dim3(1, blockSize)
            let gridSize = divUp y.Rows blockSize
            let grid = new dim3(gridSize, gridSize)
            let lp = LaunchParam(grid, block)   
            y.Gpu.Launch(Action<_,_,_,_,_> predictionErrorKernel, lp, y.Rows, y.Cols, y.Ptr, prediction.Ptr, ptr)
        let fill = FillFunc fill
        new GpuMat<float>(y.Session, y.Rows, 1, fill)
    else
        failwith "softmax kernel requires Fortran storage order"

let setColumn (a:GpuMat<'T>) (col:int) v = 
    if UseFortranOrder then
        let fill (ptr:deviceptr<'T>) =
            let blockSize = 64
            let block = new dim3(1, blockSize)
            let gridSize = divUp a.Rows blockSize
            let grid = new dim3(gridSize, gridSize)
            let lp = LaunchParam(grid, block)   
            a.Gpu.Launch(Action<_,_,_,_,_,_> setColumnKernel, lp, a.Rows, a.Cols, a.Ptr, ptr, col, v)   
        let fill = FillFunc fill
        new GpuMat<'T>(a.Session, a.Rows, a.Cols, fill)
    else
        failwith "softmax kernel requires Fortran storage order"

let maskf (a:GpuMat<float32>) = setColumn a (a.Cols-1) 0.0f

let mask (a:GpuMat<float>) = setColumn a (a.Cols-1) 0.0

Hamiltonian Monte Carlo to Sample Model Posterior


In [16]:
// beta (p, c)
let cauchyPriorLogDensity (beta:GpuMat<float>) =
    sum (-gpuLog (1.0 .+ beta .* beta))

let multinomialLogLik (y:GpuMat<float>) (softmaxVal:GpuMat<float>) =
    sum (y .* gpuLog (softmaxVal .+ 1e-12))      

let priorGradBeta (beta:GpuMat<float>) = 
    -2.0 .* beta ./ (1.0 .+ beta .* beta)

let logLikGradBeta (x:GpuMat<float>) (y:GpuMat<float>) (softmaxVal:GpuMat<float>) =
    x.T * (y .- softmaxVal)  

// x (n, p), y (n, c)
let calc (random:Random) (hX:float[,]) (hY:float[,]) L nBurnIn nSamples epsBurnIn epsSim T annealRate verbose = gpucalc {
    let n = hX.GetLength 0
    let p = hX.GetLength 1
    let c = hY.GetLength 1

    let! x = toDevice hX 
    let! y = toDevice hY

    let! beta = randn p c 0.0 0.01

    let! momentum = undef p c
    let! softmaxVal = undef n c
    let! betaAv = zeros p c
    let! betaGrad = undef p c
    let! logLik = undef 1 1
    let! priorLogLik = undef 1 1
    let! momSquareSum = undef 1 1
    let! betaOld = undef p c
    let! softmaxAv = zeros n c
    let! predictions = undef n 1
    let! error = undef 1 1

    let hamiltonianMonteCarloSample eps =
        let t = System.Diagnostics.Stopwatch.StartNew()
        randnInplace momentum 0.0 1.0
        momentum .<- mask momentum

        softmaxVal .<- softmax (x * beta)
        logLik .<- multinomialLogLik y softmaxVal
        priorLogLik .<- cauchyPriorLogDensity beta

        momSquareSum .<- sum (momentum .* momentum)
        let initLogLik = head logLik 
        let currentK = head momSquareSum 
        let currentU = initLogLik + head priorLogLik 

        betaOld .<- beta
        betaGrad .<- (logLikGradBeta x y softmaxVal) .+ (priorGradBeta beta)
        betaGrad .<- mask betaGrad 

        momentum .<- momentum .+ (eps / 2.0) .* betaGrad

        for i = 0 to L-1 do
            beta .<- beta .+ eps .* momentum

            // update gradient
            betaGrad .<- (logLikGradBeta x y (softmax (x * beta))) .+ (priorGradBeta beta)
            betaGrad .<- mask betaGrad

            if i <> L - 1 then
                momentum .<- momentum .+ eps .* betaGrad

        momentum .<- momentum .+ (eps / 2.0) .* betaGrad
        softmaxVal .<- softmax (x * beta)

        logLik .<- multinomialLogLik y softmaxVal
        priorLogLik .<- cauchyPriorLogDensity beta

        let finalLogLik = head logLik 
        let proposedU = finalLogLik + head priorLogLik 

        momSquareSum .<- sum (momentum .* momentum)
        let proposedK = 0.5 * head momSquareSum 

        let diff = ((proposedU - proposedK) - (currentU - currentK)) / T
        let alpha = max diff 0.0

        let u = log(random.NextDouble()) 
        let result, accept = if u < alpha then beta, 1 else betaOld, 0

        t.Stop()
        if verbose then
            printfn "Current value of log-kernel %f" currentU
            printfn "Proposed value of log-kernel %f" proposedU     
            printfn "Current momentum %f" currentK
            printfn "Proposed momentum %f " proposedK      
            printfn "Total diff %A" diff
            printfn "Current log likelihood %f" initLogLik
            printfn "Proposed log-like %f" finalLogLik
            printfn "Alpha  %f, uniform random number %f" alpha u    
            printfn "----- %A -----" t.Elapsed

        result, accept  

    let hamiltonianMonteCarloSimulation () = 

        let rec burnIn n eps t accepted i =
            if i < n then 
                let beta, accept = hamiltonianMonteCarloSample epsBurnIn
                burnIn n eps (1.0 + t*annealRate) (accepted + accept) (i+1) 
            else
                accepted

        let acceptedBurnIn = burnIn nBurnIn epsBurnIn T 0 0

        let rec sample n eps t accepted (betas:float[,] list) i =
            if i < n then 
                let beta, accept = hamiltonianMonteCarloSample epsSim
                betaAv .<- betaAv .+ beta
                softmaxVal .<- softmax (x * beta)
                softmaxAv .<- softmaxAv .+ softmaxVal

                sample n eps t (accepted + accept) ((toHost beta) :: betas) (i+1)
            else
                List.rev betas, accepted

        let betas, accepted = sample nSamples epsSim 1.0 0 [] 0  
        
        betaAv .<- betaAv ./ (float nSamples)
        softmaxAv .<- softmaxAv ./ (float nSamples)
        predictions .<- argMaxRows softmaxAv
        error .<- sum (predictionError y predictions)
        let error = (head error) / (float y.Rows)
        let accuracy = 1.0 - error

        (toHost betaAv), betas, (toHost softmaxAv), accepted, error, accuracy

    return hamiltonianMonteCarloSimulation()
}

Run Simulation


In [33]:
let run () =
    let random = Random 42
    let verbose = true
    let nBurnIn = 100     // burn-in iterations before sampling starts
    let nSamples = 100    // number of posterior samples
    let L = 100
    let epsBurnIn = 1e-4  // step size during burning, typically larger than during sampling
    let epsSim = 1e-5
    let T = 1000.0
    let annealRate = 0.9

    let hX = addIntercept testImages 
    let hY = testLabels

    let calc = calc random (hX |> Array2D.map float) (hY |> Array2D.map float) L nBurnIn nSamples epsBurnIn epsSim T annealRate verbose

    use session = new Session(Gpu.Default)
    let betaAv, betas, softmaxAv, accepted, error, accuracy = gpuRun session calc
    betaAv, betas, softmaxAv, accepted, error, accuracy

In [34]:
let betaAv, betas, softmaxAv, accepted, error, accuracy = run()

In [39]:
accepted, accuracy


Out[39]:
(500, 0.8919)

In [19]:
let betaForClass (betaAv:float[,]) cls =
    let beta = columnVec betaAv cls |> Array.take (betaAv.GetLength(0) - 1) // chop off the intercept
    array2DColMajor 28 28 beta

In [37]:
let cls = 1
heatmap (betaForClass betaAv cls) (sprintf "Average Betas %d" cls) 500 500


Out[37]:

In [31]:
let images = List.init (betaAv.GetLength 1) (fun cls -> betaForClass betaAv cls)
let stacked = stack 28 images 5 2 |> array2DAsLists 
heatmap stacked "Average Betas" 500 1000


Out[31]:

In [ ]: