random.choice vs. a random.multinomial based implementation of the same weighted choice. Also compare with a GNU Scientific Library based implementation.
Context: random.choice is only available in numpy >= 1.7, so I was trying to find a simple substitute for machines running older numpy versions.
In [26]:
import numpy as np
%load_ext Cython
In [27]:
%%cython -l gsl
cimport cython
from cython_gsl cimport *
import numpy as np
from numpy cimport *
cdef gsl_rng *r = gsl_rng_alloc(gsl_rng_mt19937)
def multinomial(ndarray[double, ndim=1] p, unsigned int N):
cdef:
size_t K = p.shape[0]
ndarray[uint32_t, ndim=1] n = np.empty_like(p, dtype='uint32')
# void gsl_ran_multinomial (const gsl_rng * r, size_t K, unsigned int N, const double p[], unsigned int n[])
gsl_ran_multinomial(r, K, N, <double*> p.data, <unsigned int *> n.data)
return n
In [30]:
%%cython -l gsl
cimport cython
import numpy as np
from numpy cimport *
cdef extern from "gsl/gsl_rng.h":
ctypedef struct gsl_rng_type
ctypedef struct gsl_rng
cdef gsl_rng_type *gsl_rng_mt19937
gsl_rng *gsl_rng_alloc ( gsl_rng_type * T) nogil
cdef extern from "gsl/gsl_randist.h":
void gsl_ran_multinomial ( gsl_rng * r, size_t K,
unsigned int N, double p[],
unsigned int n[] ) nogil
void gsl_rng_set (const gsl_rng * r, unsigned long int s) nogil
cdef gsl_rng *r = gsl_rng_alloc(gsl_rng_mt19937)
def seed_directgsl(unsigned long int seed):
gsl_rng_set(r, seed)
def multinomial_directgsl(ndarray[double, ndim=1] p, unsigned int N):
cdef:
size_t K = p.shape[0]
ndarray[uint32_t, ndim=1] n = np.empty_like(p, dtype='uint32')
# void gsl_ran_multinomial (const gsl_rng * r, size_t K, unsigned int N, const double p[], unsigned int n[])
gsl_ran_multinomial(r, K, N, <double*> p.data, <unsigned int *> n.data)
return n
In [69]:
def choice(p):
n = np.random.random(p.shape)
pcum = p.cumsum()
return pcum.searchsorted(n)
In [8]:
p = np.array([0.5, 0.3, 0.2])
prng = np.random.RandomState(3)
print prng.choice(3, size = 3, p = p)
print np.repeat(np.arange(3), prng.multinomial(3, p))
print np.repeat(np.arange(3), multinomial(p, 3))
print np.repeat(np.arange(3), multinomial_directgsl(p, 3))
In [65]:
N = 100000
print np.bincount(np.random.choice(3, size = 3 * N, p = p))/(3.0 * N)
print np.bincount(np.asarray([np.repeat(np.arange(3), np.random.multinomial(3, p)) for i in range(N)]).flatten())/(3.0 *N)
print np.bincount(np.asarray([np.repeat(np.arange(3), multinomial(p, 3)) for i in range(N)]).flatten())/(3.0 *N)
print np.bincount(np.asarray([np.repeat(np.arange(3), multinomial_directgsl(p, 3)) for i in range(N)]).flatten())/(3.0 *N)
print np.bincount(np.asarray([choice(p) for i in range(N)]).flatten())/(3.0 *N)
Conclusion: All methods are statistically equivalent. They do not give the same results for the same random seed, though.
In [70]:
p = np.array([0.5, 0.3, 0.2])
%timeit -n 10000 prng.choice(3, size = 3, p = p)
%timeit -n 10000 np.repeat(np.arange(3), prng.multinomial(3, p))
%timeit -n 10000 np.repeat(np.arange(3), multinomial(p, 3))
%timeit -n 10000 np.repeat(np.arange(3), multinomial_directgsl(p, 3))
%timeit -n 10000 choice(p)
The multinomial based method is (surprisingly?) an order of magnitude faster. This is probably fixed in the bleeding edge version of numpy (see https://github.com/numpy/numpy/issues/4188).
In [71]:
N = 10000
p = np.random.rand(N)
p /= np.sum(p)
N_arange = np.arange(N)
%timeit -n 100 prng.choice(N, size = N, p = p)
%timeit -n 100 np.repeat(N_arange, prng.multinomial(N, p))
%timeit -n 100 np.repeat(N_arange, multinomial(p, N))
%timeit -n 100 np.repeat(N_arange, multinomial_directgsl(p, N))
%timeit -n 100 choice(p)
For large N the gsl multinomial function is significantly faster than using np.random
In [51]:
p = np.array([0.5, 0.3, 0.2])
print(multinomial_directgsl(p, 3))
print(multinomial_directgsl(p, 3))
seed_directgsl(10)
print(multinomial_directgsl(p, 3))
seed_directgsl(10)
print(multinomial_directgsl(p, 3))