In this Jupyter notebook we show how to use the GPU seamlessly in a scripting mode with Alea GPU V3. We demonstrate the new highly efficient GPU primitives that will come with our next release 3.1 of Alea GPU, such as highly optimized parallel sort, scan, reduce, copy if, stream compactification, merge, join, etc. We use the parallel-for and parallel aggretation method to implement a Monte Carlo simulation algorithm, which approximates the value of $\pi$.
Monte Carlo simulation methods are computational algorithms which rely on repeated random sampling to estimate a results. Monte Carlo simulation can be used to calculate the value of $\pi$ as follows
$$ \dfrac{\text{area of unit circle}}{\text{area of } [-1,1]^2} = \dfrac{\pi}{4} = \dfrac{\text{hits in unit circle}}{\text{randomly generated points in } [-1,1]^2}. $$More precisely if $A \subset \mathbb{R}^n$, $f : A \rightarrow \mathbb{R}$ is an integrable function and $x^{(i)} \in A$, $i=1, \ldots, n$ are uniformly distributed points then
$$ \intA f(x) dx \approx \frac{\mathrm{vol}(A)}{n} \sum{i=1}^n f(x^{(i)}).
To find the formula stated in the first equation apply the above equation with $A = [-1,1]^2$ and $f(x) = \mathbb{1}_{\{x_1^2 + x_2^2 \leq 1\}}$ the indicator function of the unit circle.
Before you continue you should install the F# kernel for Jupyter.
We use Paket to get the newest version 3.1 beta NuGet packages of Alea GPU. You can run Alea GPU free on any CUDA capable GeForce or Quadro GPU. In case you want to run it on an enterprise GPU such as a Tesla K80 or the brand new NVIDIA P100 you need a license.
Unfortunately as of now (January 2017), it is not possible to run this notebook on Azure Notebooks with server side execution because the Azure Notebooks service does not yet provide GPU instances.
In [1]:
#load "Paket.fsx"
Paket.Version [ ("Alea", "3.0.3-beta2") ]
Paket.Package [ "NUnit" ]
Out[1]:
In [2]:
#load "packages/Alea/Alea.fsx"
#r "packages/Alea/lib/net45/Alea.Parallel.dll"
#r "packages/NUnit/lib/net45/nunit.framework.dll"
In [3]:
#load "XPlot.Plotly.Paket.fsx"
#load "XPlot.Plotly.fsx"
open XPlot.Plotly
In [4]:
open System
open System.Threading.Tasks;
open Alea
open Alea.CSharp
open Alea.Parallel
let gpu = Gpu.Default
let s = sprintf "GPU is %A, Number of Cores %A, GPU DRAM is %.3f GB, Process is %d bit"
gpu
gpu.Device.Cores
((float) gpu.Device.TotalMemory / 1024. / 1024. / 1024.)
(IntPtr.Size * 8)
{ Html = s }
Out[4]:
Monte Carlo simulation experiments often rely on independent samples and are therefore embarassingly parallel. This means that we can implement them in two stages:
The function estimatePi
follows this paradigm and applies a technique called batch sampling. Generating random numbers in a parallel computing context requires some care: in order to get correct results it is important to assure that the generated random numbers do not overlap. This is achieved by jumping to the right position in the random stream with an offset
let offset = (uint64 batchSize) * (uint64 iBatch)
session.RandomUniform(points, offset)
Not every random number generator is designed for parallel execution. The interested reader can find more details in this paper.
After all the batches are completed and stored their mean values in the array pis
we copy this array to the CPU and perform a final average on the CPU.
In [5]:
let estimatePiTwoStage verbose rng seed batchSize batchs =
let t = System.Diagnostics.Stopwatch.StartNew()
use session = new Session(gpu)
session.UsingPseudoRandom(rngType=rng, seed=seed)
let points = session.Allocate<double2> batchSize
let values = session.Allocate<float> batchSize
let pis = session.Allocate<float> batchs
for iBatch = 1 to batchs do
session.RandomUniform(points)
session.For(0, batchSize, (fun i ->
let point = points.[i]
let d = point.x * point.x + point.y * point.y
values.[i] <- if d < 1.0 then 4.0 else 0.0))
session.Aggregate(batchSize,
(fun i -> values.[i]),
(fun value -> pis.[iBatch - 1] <- value / (float batchSize)),
(fun a b -> a + b))
let pis = Gpu.CopyToHost pis
let pi = pis |> Array.average
t.Stop()
if verbose then
printfn "%A" pis
printfn "PI = %f, %A" pi t.Elapsed
pi, t.Elapsed.TotalMilliseconds, ((pis |> Array.distinct).Length)
In [19]:
let estimatePiCompact verbose rng seed batchSize batchs =
let t = System.Diagnostics.Stopwatch.StartNew()
use session = new Session(gpu)
session.UsingPseudoRandom(rngType=rng, seed=seed)
let points = session.Allocate<double2> batchSize
let pis = session.Allocate<float> batchs
for iBatch = 1 to batchs do
session.RandomUniform(points)
session.Aggregate(batchSize,
(fun i ->
let point = points.[i]
let d = point.x * point.x + point.y * point.y
if d < 1.0 then 4.0 else 0.0),
(fun value -> pis.[iBatch - 1] <- value / (float batchSize)),
(fun a b -> a + b))
let pis = Gpu.CopyToHost pis
let pi = pis |> Array.average
t.Stop()
if verbose then
printfn "%A" pis
printfn "PI = %f, %A" pi t.Elapsed
pi, t.Elapsed.TotalMilliseconds, ((pis |> Array.distinct).Length)
In [20]:
let estimatePiCpu verbose rngType seed batchSize batchs =
let t = System.Diagnostics.Stopwatch.StartNew()
let points = Array.zeroCreate<float> (2*batchSize)
let values = Array.zeroCreate<float> batchSize
let pis = Array.zeroCreate<float> batchs
use rng = cuRAND.Generator.CreateCpu rngType
for iBatch = 1 to batchs do
// skip ahead to the right offset to generate non-overlapping blocks of random numbers
let offset = 2UL * (uint64 batchSize) * (uint64 iBatch)
rng.SetGeneratorOffset offset
rng.GenerateUniform(points)
Parallel.For(0, batchSize, (fun i ->
let pointx = points.[i]
let pointy = points.[batchSize + 1]
let d = pointx * pointx + pointy * pointy
values.[i] <- if d < 1.0 then 4.0 else 0.0)) |> ignore
pis.[iBatch - 1] <- values |> Array.average
let pi = pis |> Array.average
t.Stop()
if verbose then
printfn "%A" pis
printfn "PI = %f, %A" pi t.Elapsed
pi, t.Elapsed.TotalMilliseconds, ((pis |> Array.distinct).Length)
In [22]:
let estimatePi batches batchSize impl rng =
let cuRANDType () =
match rng with
| PseudoRandomType.XORWOW -> cuRAND.RngType.PSEUDO_XORWOW
| PseudoRandomType.MRG32K3A -> cuRAND.RngType.PSEUDO_MRG32K3A
| PseudoRandomType.PHILOX4_32_10 -> cuRAND.RngType.PSEUDO_PHILOX4_32_10
| _ -> failwith "TODO"
let seed = 1UL
let pi, t, l =
match impl with
| "gpu compact" -> estimatePiCompact false rng seed batchSize batches
| "gpu two stage" -> estimatePiTwoStage false rng seed batchSize batches
| "cpu" -> estimatePiCpu false (cuRANDType()) seed batchSize batches
| _ -> failwithf "unknown implementation %s" impl
pi, t, l, rng, impl
In [24]:
let batches, batchSize = if Gpu.Default.Device.Cores <= 512 then 50, 1000000 else 50, 10000000
// JIT to get more accurate timings
estimatePi batches batchSize "gpu compact" PseudoRandomType.XORWOW
estimatePi batches batchSize "gpu two stage" PseudoRandomType.XORWOW
let results =
[
estimatePi batches batchSize "gpu compact" PseudoRandomType.XORWOW
estimatePi batches batchSize "gpu two stage" PseudoRandomType.XORWOW
estimatePi batches batchSize "cpu" PseudoRandomType.XORWOW
estimatePi batches batchSize "gpu compact" PseudoRandomType.MRG32K3A
estimatePi batches batchSize "gpu two stage" PseudoRandomType.MRG32K3A
estimatePi batches batchSize "cpu" PseudoRandomType.MRG32K3A
estimatePi batches batchSize "gpu compact" PseudoRandomType.PHILOX4_32_10
estimatePi batches batchSize "gpu two stage" PseudoRandomType.PHILOX4_32_10
estimatePi batches batchSize "cpu" PseudoRandomType.PHILOX4_32_10
]
let cpuTimes =
results
|> List.filter (fun (pi, t, l, rng, impl) -> impl = "cpu")
|> List.map (fun (pi, t, l, rng, impl) -> rng, t)
|> Map.ofList
type data = { RngType: string; Implementation: string; Value: float; Timing: string; Speedup: string }
results
|> List.map (fun (pi, t, l, rng, impl) ->
{
RngType = sprintf "%A" rng
Implementation = impl
Value = pi
Timing = sprintf "%.2f s" (t / 1000.0)
Speedup = sprintf "%.1f" ((cpuTimes |> Map.find rng) / t)
})
|> Util.Table
Out[24]:
In [25]:
let layout =
Layout(title = "CPU / GPU Performance",
xaxis=Xaxis(title = "Random number generators", showgrid = false, zeroline = false),
yaxis=Yaxis(title = "Speedup", showline = false), showlegend = true)
let grouped = results |> List.groupBy (fun (pi, t, l, rng, impl) -> impl)
let bars =
grouped |> List.map(fun (impl, s) ->
let x = s |> List.map (fun (pi, t, l, rng, impl) -> sprintf "%A" rng)
let y = s |> List.map (fun (pi, t, l, rng, impl) -> (cpuTimes |> Map.find rng) / t)
Bar(x = x, y = y, name = impl))
bars
|> Seq.toList
|> Chart.Plot
|> Chart.WithLayout layout
Out[25]:
In [ ]: