A comparison of methods of random choice

random.choice vs. a random.multinomial based implementation of the same weighted choice. Also compare with a GNU Scientific Library based implementation.

Context: random.choice is only available in numpy >= 1.7, so I was trying to find a simple substitute for machines running older numpy versions.


In [26]:
import numpy as np
%load_ext Cython


The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython

GSL based multinomial called using CythonGSL wrapper


In [27]:
%%cython -l gsl

cimport cython
from cython_gsl cimport *

import numpy as np
from numpy cimport *

cdef gsl_rng *r = gsl_rng_alloc(gsl_rng_mt19937)

def multinomial(ndarray[double, ndim=1] p, unsigned int N):
    cdef:
       size_t K = p.shape[0]
       ndarray[uint32_t, ndim=1] n = np.empty_like(p, dtype='uint32')
    
    # void gsl_ran_multinomial (const gsl_rng * r, size_t K, unsigned int N, const double p[], unsigned int n[])
    gsl_ran_multinomial(r, K, N, <double*> p.data, <unsigned int *> n.data)
    
    return n

GSL based multinomial called directly


In [30]:
%%cython -l gsl

cimport cython

import numpy as np
from numpy cimport *

cdef extern from "gsl/gsl_rng.h":
    ctypedef struct gsl_rng_type
    ctypedef struct gsl_rng
    
    cdef gsl_rng_type *gsl_rng_mt19937
    
    gsl_rng *gsl_rng_alloc ( gsl_rng_type * T) nogil

cdef extern from "gsl/gsl_randist.h":
    void gsl_ran_multinomial ( gsl_rng * r,  size_t K,
                               unsigned int N,  double p[],
                               unsigned int n[] ) nogil
    void gsl_rng_set (const gsl_rng * r, unsigned long int s) nogil

cdef gsl_rng *r = gsl_rng_alloc(gsl_rng_mt19937)

def seed_directgsl(unsigned long int seed):
    gsl_rng_set(r, seed)

def multinomial_directgsl(ndarray[double, ndim=1] p, unsigned int N):
    cdef:
       size_t K = p.shape[0]
       ndarray[uint32_t, ndim=1] n = np.empty_like(p, dtype='uint32')
    
    # void gsl_ran_multinomial (const gsl_rng * r, size_t K, unsigned int N, const double p[], unsigned int n[])
    gsl_ran_multinomial(r, K, N, <double*> p.data, <unsigned int *> n.data)
    
    return n

In [69]:
def choice(p):
    n = np.random.random(p.shape)
    pcum = p.cumsum()
    return pcum.searchsorted(n)

Test equivalence of results


In [8]:
p = np.array([0.5, 0.3, 0.2])
prng = np.random.RandomState(3)
print prng.choice(3, size = 3, p = p)
print np.repeat(np.arange(3), prng.multinomial(3, p))
print np.repeat(np.arange(3), multinomial(p, 3))
print np.repeat(np.arange(3), multinomial_directgsl(p, 3))


[1 1 0]
[0 0 2]
[0 0 0]
[0 0 0]

In [65]:
N = 100000
print np.bincount(np.random.choice(3, size = 3 * N, p = p))/(3.0 * N)
print np.bincount(np.asarray([np.repeat(np.arange(3), np.random.multinomial(3, p)) for i in range(N)]).flatten())/(3.0 *N)
print np.bincount(np.asarray([np.repeat(np.arange(3), multinomial(p, 3)) for i in range(N)]).flatten())/(3.0 *N)
print np.bincount(np.asarray([np.repeat(np.arange(3), multinomial_directgsl(p, 3)) for i in range(N)]).flatten())/(3.0 *N)
print np.bincount(np.asarray([choice(p) for i in range(N)]).flatten())/(3.0 *N)


[ 0.50114667  0.29833     0.20052333]
[ 0.49891     0.29993667  0.20115333]
[ 0.50167333  0.29891     0.19941667]
[ 0.50043333  0.29975333  0.19981333]
[ 0.50031333  0.30023333  0.19945333]

Conclusion: All methods are statistically equivalent. They do not give the same results for the same random seed, though.

Time execution times


In [70]:
p = np.array([0.5, 0.3, 0.2])
%timeit -n 10000 prng.choice(3, size = 3, p = p)
%timeit -n 10000 np.repeat(np.arange(3), prng.multinomial(3, p))
%timeit -n 10000 np.repeat(np.arange(3), multinomial(p, 3))
%timeit -n 10000 np.repeat(np.arange(3), multinomial_directgsl(p, 3))
%timeit -n 10000 choice(p)


10000 loops, best of 3: 56.7 µs per loop
10000 loops, best of 3: 1.87 µs per loop
10000 loops, best of 3: 2.97 µs per loop
10000 loops, best of 3: 2.97 µs per loop
10000 loops, best of 3: 2.07 µs per loop

The multinomial based method is (surprisingly?) an order of magnitude faster. This is probably fixed in the bleeding edge version of numpy (see https://github.com/numpy/numpy/issues/4188).


In [71]:
N = 10000
p = np.random.rand(N)
p /= np.sum(p)
N_arange = np.arange(N)
%timeit -n 100 prng.choice(N, size = N, p = p)
%timeit -n 100 np.repeat(N_arange, prng.multinomial(N, p))
%timeit -n 100 np.repeat(N_arange, multinomial(p, N))
%timeit -n 100 np.repeat(N_arange, multinomial_directgsl(p, N))
%timeit -n 100 choice(p)


100 loops, best of 3: 1.34 ms per loop
100 loops, best of 3: 1.04 ms per loop
100 loops, best of 3: 699 µs per loop
100 loops, best of 3: 694 µs per loop
100 loops, best of 3: 1.23 ms per loop

For large N the gsl multinomial function is significantly faster than using np.random

Seeding of gsl multinomial generator


In [51]:
p = np.array([0.5, 0.3, 0.2])
print(multinomial_directgsl(p, 3))
print(multinomial_directgsl(p, 3))

seed_directgsl(10)
print(multinomial_directgsl(p, 3))
seed_directgsl(10)
print(multinomial_directgsl(p, 3))


[0 2 1]
[1 1 1]
[2 1 0]
[2 1 0]