In this Jupyter notebook we show how to use the GPU seamlessly in scripting mode with Alea GPU V3 to calculate the empirical cumulative distribution function for a very large sample size. The purpose of this notebook is to demonstrate the new, highly efficient GPU primitives that will ship with the upcoming 3.1 release of Alea GPU, such as optimized parallel sort, scan, reduce, copy-if, stream compaction, merge, and join.
The empirical cumulative distribution function $\hat{F}_n(t)$ for the samples $x_1, \ldots, x_n$ is defined as
$$\hat{F}_n(t) = \frac{\text{number of elements in the sample} \leq t}{n} = \frac{1}{n}\sum_{i=1}^{n} \mathbf{1}_{x_i \leq t}$$
It is an estimator of the true distribution function from which the samples $x_1, \ldots, x_n$ are generated. More details can be found on Wikipedia.
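As a small illustration of this definition (the function name and sample values below are hypothetical, not part of the original notebook), the ECDF can be evaluated directly by counting; the rest of the notebook does this far more efficiently with sorting and binary search:

// Naive ECDF straight from the definition: count the samples <= t and divide by n.
// Each evaluation is O(n), so this is only meant to illustrate the formula above.
let ecdfNaive (samples: float[]) (t: float) =
    let count = samples |> Array.filter (fun x -> x <= t) |> Array.length
    float count / float samples.Length

// For example, ecdfNaive [| 0.1; 0.4; 0.7; 0.9 |] 0.5 evaluates to 0.5.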
Before you continue you should install the F# kernel for Jupyter.
We use Paket to get the latest beta NuGet packages of Alea GPU. You can run Alea GPU for free on any CUDA-capable GeForce or Quadro GPU. If you want to run it on an enterprise GPU such as a Tesla K80 or the new NVIDIA P100, you need a license.
Unfortunately, as of now (January 2017), this notebook cannot be run on Azure Notebooks with server-side execution, because the Azure Notebooks service does not yet provide GPU instances.
In [1]:
#load "Paket.fsx"
Paket.Version [ ("Alea", "3.0.3-beta2") ]
Paket.Package [ "NUnit" ]
Out[1]:
In [2]:
#load "packages/Alea/Alea.fsx"
#r "packages/Alea/lib/net45/Alea.Parallel.dll"
#r "packages/NUnit/lib/net45/nunit.framework.dll"
In [3]:
#load "XPlot.Plotly.Paket.fsx"
#load "XPlot.Plotly.fsx"
open XPlot.Plotly
In [4]:
open System
open Alea
open Alea.CSharp
open Alea.Parallel
let gpu = Gpu.Default
let s = sprintf "GPU is %A, Number of Cores %A, GPU DRAM is %.3f GB, Process is %d bit"
            gpu
            gpu.Device.Cores
            ((float) gpu.Device.TotalMemory / 1024. / 1024. / 1024.)
            (IntPtr.Size * 8)
{ Html = s }
Out[4]:
To sample the empirical cumulative distribution function we use binary search. We write a binary search function that is used on both the CPU and the GPU, giving us full code reuse. In order to compile it for the GPU, we add the [ReflectedDefinition]
attribute, so that the F# compiler generates a quotation for this function.
In [5]:
[<ReflectedDefinition>]
let inline binarySearch (n:int) (v:int -> 'T) (x:'T) =
    let mutable l = 0
    let mutable u = n - 1
    while u - 1 > l do
        let m = int((uint32(l) + uint32(u)) >>> 1)
        if x < (v m) then u <- m
        else l <- m
    l
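As a quick CPU-side sanity check (the array and probe value here are hypothetical, purely for illustration), binarySearch takes the element count, an indexing function and a probe value, and for a probe inside the data range it returns the index of the last element that does not exceed the probe:

// Illustrative check of binarySearch on a small sorted array.
let sorted = [| 1.0; 2.0; 3.0; 5.0; 8.0 |]
let idx = binarySearch sorted.Length (fun i -> sorted.[i]) 4.0
// idx = 2, since sorted.[2] = 3.0 is the last value <= 4.0;
// dividing such an index by the sample count approximates the ECDF.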
In [6]:
let ecdfCpu numSamples numPlotPoints (gen:float[] -> unit) =
    let numbers = Array.zeroCreate<float> numSamples
    let plotX = Array.zeroCreate<float> numPlotPoints
    let plotY = Array.zeroCreate<float> numPlotPoints
    gen numbers
    // start measuring time
    GC.Collect()
    GC.WaitForPendingFinalizers()
    let t = System.Diagnostics.Stopwatch.StartNew()
    // first sort the samples
    Array.sortInPlace numbers
    // grab the min and max values
    let min = numbers.[0]
    let max = numbers.[numSamples - 1]
    let dist = max - min
    // binary search to create the ecdf
    for i = 0 to numPlotPoints - 1 do
        let x = min + (dist / (float numPlotPoints)) * (float i)
        let v i = numbers.[i]
        let y = binarySearch numSamples v x
        let y = (float y) / (float numSamples)
        plotX.[i] <- x
        plotY.[i] <- y
    t.Stop()
    // to get more accurate timings, we keep these arrays alive so the measurement is not interrupted by a GC collection
    GC.KeepAlive(numbers)
    GC.KeepAlive(plotX)
    GC.KeepAlive(plotY)
    plotX, plotY, t.Elapsed
In [7]:
let ecdfGpu numSamples numPlotPoints (gen:Session -> float[] -> unit) =
    // create a GPU session, which manages the temporary memory required by the different algorithms
    use session = new Session(gpu)
    let numbers = session.Allocate<float>(numSamples)
    let minmax = session.Allocate<float>(2)
    let plotX = session.Allocate<float>(numPlotPoints)
    let plotY = session.Allocate<float>(numPlotPoints)
    gen session numbers
    // start measuring time, we synchronize the gpu first
    session.Gpu.Synchronize()
    GC.Collect()
    GC.WaitForPendingFinalizers()
    let t = System.Diagnostics.Stopwatch.StartNew()
    // first sort the samples
    session.Sort(numbers, numbers, (fun a b -> a < b))
    // grab the min and max values
    session.Gpu.Launch((fun () ->
        minmax.[0] <- numbers.[0]
        minmax.[1] <- numbers.[numSamples - 1]),
        LaunchParam(1, 1))
    session.For(0, numPlotPoints, (fun i ->
        let min = minmax.[0]
        let max = minmax.[1]
        let dist = max - min
        let x = min + (dist / (float numPlotPoints)) * (float i)
        let v i = numbers.[i]
        let y = binarySearch numSamples v x
        let y = (float y) / (float numSamples)
        plotX.[i] <- x
        plotY.[i] <- y
    ))
    // synchronize the gpu, then stop the timer
    gpu.Synchronize()
    t.Stop()
    // to get more accurate timings, we keep these arrays alive so the measurement is not interrupted by a GC collection
    GC.KeepAlive(numbers)
    GC.KeepAlive(minmax)
    GC.KeepAlive(plotX)
    GC.KeepAlive(plotY)
    Gpu.CopyToHost(plotX), Gpu.CopyToHost(plotY), t.Elapsed
In [8]:
let genGpuUniform (session:Session) (numbers:float[]) =
    session.UsingPseudoRandom(seed=42UL)
    session.RandomUniform(numbers)
let genGpuNormal (session:Session) (numbers:float[]) =
    session.UsingPseudoRandom(seed=42UL)
    session.RandomNormal(numbers, 0.0, 1.0)
let genGpuLogNormal (session:Session) (numbers:float[]) =
    session.UsingPseudoRandom(seed=42UL)
    session.RandomLogNormal(numbers, 0.0, 1.0)
let genCpuUniform (data:float[]) =
    use rng = cuRAND.Generator.CreateCpu(cuRAND.RngType.PSEUDO_DEFAULT)
    rng.SetPseudoRandomGeneratorSeed(42UL)
    rng.GenerateUniform(data)
let genCpuNormal (data:float[]) =
    use rng = cuRAND.Generator.CreateCpu(cuRAND.RngType.PSEUDO_DEFAULT)
    rng.SetPseudoRandomGeneratorSeed(42UL)
    rng.GenerateNormal(data, 0.0, 1.0)
let genCpuLogNormal (data:float[]) =
    use rng = cuRAND.Generator.CreateCpu(cuRAND.RngType.PSEUDO_DEFAULT)
    rng.SetPseudoRandomGeneratorSeed(42UL)
    rng.GenerateLogNormal(data, 0.0, 1.0)
let layout title x y =
    Layout(title = title,
           xaxis = Xaxis(title = x, showgrid = false, zeroline = false),
           yaxis = Yaxis(title = y, showline = false),
           showlegend = true)
Let's estimate the empirical cumulative distribution function on the CPU and GPU for the normal distribution. Because we are only interested in the visualization, we choose a moderate sample size: one million samples already give sufficient accuracy.
In [9]:
let numSamples = 1024*1024
let numPlotPoints = 1000
In [10]:
let x, y, _ = ecdfCpu numSamples numPlotPoints genCpuNormal
Scatter(name = "Normal", x = x, y = y, mode = "lines")
|> Chart.Plot
|> Chart.WithLayout (layout "Empirical Cumulative Distribution Function on CPU" "x" "probability")
Out[10]:
In [11]:
let x, y, _ = ecdfGpu numSamples numPlotPoints genGpuNormal
Scatter(name = "Normal", x = x, y = y, mode = "lines")
|> Chart.Plot
|> Chart.WithLayout (layout "Empirical Cumulative Distribution Function on GPU" "x" "probability")
Out[11]:
In [12]:
let x, y, _ = ecdfCpu numSamples numPlotPoints genCpuLogNormal
Scatter(name = "Log Normal", x = x, y = y, mode = "lines")
|> Chart.Plot
|> Chart.WithLayout (layout "Empirical Cumulative Distribution Function on CPU" "x" "probability")
Out[12]:
In [13]:
let x, y, _ = ecdfGpu numSamples numPlotPoints genGpuLogNormal
Scatter(name = "Log Normal", x = x, y = y, mode = "lines")
|> Chart.Plot
|> Chart.WithLayout (layout "Empirical Cumulative Distribution Function on GPU" "x" "probability")
Out[13]:
In [14]:
let megaBytes = if Gpu.Default.Device.Cores <= 512 then 10 else 100
let numSamples = 1024*1024*megaBytes
let _, _, cpuTime = ecdfCpu numSamples 1000 genCpuNormal
let _, _, gpuTime = ecdfGpu numSamples 1000 genGpuNormal
let speedup = cpuTime.TotalMilliseconds / gpuTime.TotalMilliseconds
type data = { Device: string; Timing: string; Speedup: float }
let records =
    [|
        { Device = "CPU"; Timing = sprintf "%.2f ms" cpuTime.TotalMilliseconds; Speedup = 1.0 }
        { Device = "GPU"; Timing = sprintf "%.2f ms" gpuTime.TotalMilliseconds; Speedup = speedup }
    |]
records |> Util.Table
Out[14]:
In [15]:
let megaBytes = if Gpu.Default.Device.Cores <= 512 then [1..10] else [100..100..900]
let gpuPerformance =
    seq {
        for scale in megaBytes do
            let numSamples = scale * 1024 * 1024
            let _, _, time = ecdfGpu numSamples 1000 genGpuNormal
            yield (scale, time.TotalMilliseconds)
    } |> Seq.toList
type data = { MegaBytes: int; Timing: string; MegaBytesPerSec: string }
gpuPerformance
|> List.map (fun (s, t) ->
    {
        MegaBytes = s
        Timing = sprintf "%.2f ms" t
        MegaBytesPerSec = sprintf "%.2f MB/sec" ((float s) / t * 1000.0)
    })
|> Util.Table
Out[15]:
In [16]:
let layout =
    Layout(title = "Timing in ms",
           xaxis = Xaxis(title = "Size in MB", showgrid = false, zeroline = false),
           yaxis = Yaxis(title = "Timing (ms)", showline = false))
Bar(x = (gpuPerformance |> List.map (fun (mb, t) -> sprintf "%d MB" mb)),
    y = (gpuPerformance |> List.map (fun (mb, t) -> t)))
|> Chart.Plot
|> Chart.WithLayout layout
Out[16]:
In [17]:
let layout =
    Layout(title = "Performance in MB per second",
           xaxis = Xaxis(title = "Size in MB", showgrid = false, zeroline = false),
           yaxis = Yaxis(title = "MB per second", showline = false))
Bar(x = (gpuPerformance |> List.map (fun (s, t) -> sprintf "%d MB" s)),
    y = (gpuPerformance |> List.map (fun (s, t) -> (float s) / t * 1000.0)))
|> Chart.Plot
|> Chart.WithLayout layout
Out[17]:
In [18]:
let megaBytes = if Gpu.Default.Device.Cores <= 512 then [1..10] else [100..100..900]
let performanceCpuGpu numSamples numPlotTimes =
    let _, _, cpuTime = ecdfCpu numSamples numPlotTimes genCpuNormal
    let _, _, gpuTime = ecdfGpu numSamples numPlotTimes genGpuNormal
    let cpuTime = cpuTime.TotalMilliseconds
    let gpuTime = gpuTime.TotalMilliseconds
    let speedup = cpuTime / gpuTime
    cpuTime, gpuTime, speedup
let gpuCpuPerformance =
    seq {
        for scale in megaBytes do
            let numSamples = scale * 1024 * 1024
            let cpuTime, gpuTime, speedup = performanceCpuGpu numSamples 1000
            yield (scale, cpuTime, gpuTime, speedup)
    } |> Seq.toList
type data = { MegaBytes: int; GPUTiming: string; CPUTiming: string; Speedup: string }
gpuCpuPerformance |> List.map (fun (s, ct, gt, speedup) ->
    {
        MegaBytes = s
        GPUTiming = sprintf "%.2f ms" gt
        CPUTiming = sprintf "%.2f ms" ct
        Speedup = sprintf "%.2f" speedup
    }) |> Util.Table
Out[18]:
In [19]:
let layout =
Layout(title = "CPU / GPU Performance in MB per second",
xaxis=Xaxis(title = "Size in MB", showgrid = false, zeroline = false),
yaxis=Yaxis(title = "MB per second", showline = false), showlegend = true)
seq {
let x = gpuCpuPerformance |> List.map (fun (s, _, _, _) -> sprintf "%d MB" s)
let yCpu = gpuCpuPerformance |> List.map (fun (s, t, _, _) -> (float s) / t * 1000.0)
let yGpu = gpuCpuPerformance |> List.map (fun (s, _, t, _) -> (float s) / t * 1000.0)
yield Bar(x = x, y = yCpu, name = "CPU")
yield Bar(x = x, y = yGpu, name = "GPU")
}
|> Seq.toList
|> Chart.Plot
|> Chart.WithLayout layout
Out[19]:
In [ ]: