Simple sampling problem

Sampling discrete distributions

First we need to generate a discrete distributions of $n_{item} = 5$ elements, each weights is generated from a random uniform distribution. We will add that this distribution will be normalized.


In [1]:
# Some useful package
import random                    
import numpy                     
import math                       
import time                      
import operator                   
import pylab    
import matplotlib.pyplot as plt
%matplotlib inline

n_item = 5

#First of all : generate the random distribution
Normalization = 0
Proba = []
for i in range(n_item):
    Proba.append(random.uniform(0,1))
    Normalization=Normalization+Proba[i]

# Now we normalize the distribution
for i in range(n_item):
    Proba[i]=Proba[i]/Normalization      

#Some graphical informations
print ("i   Prob(i)   ")
for i in range(n_item):
    print (i, ' ',Proba[i])


i   Prob(i)   
0   0.3895127374151168
1   0.2589629569341132
2   0.17811303769720963
3   0.0421764786661813
4   0.13123478928737917

Now that we have define our distribution, we can write the "accept/reject" procedure. The goal is to sample from the distribution.


In [2]:
def sample_AR(proba,valeu_max,N):
    # N will be n_item
    ok=False
    xr=0
    while ok==False :
        # Definition of the several squares
        x = random.uniform(0,N)
        y = random.uniform(0,valeu_max)
        xr = int(math.floor(x))
        if Proba[xr]>y :
            ok=True   
    
    return xr

# Computing the max value with help of operator module
index, valeur_max = max(enumerate(Proba), key=operator.itemgetter(1)) 


list_proba =[]
for i in range(10000):
    list_proba.append(sample_AR(Proba,valeur_max,n_item))
       
# Ploting the histogram and theoritical proportions
x = [a for a in range(n_item)]
y = [Proba[a] for a in range(n_item)]
pylab.figure()
pylab.hist(list_proba, bins=n_item, range=(-0.5, n_item-0.5), normed=True)
pylab.plot(x, y,'ro', ms=10)
pylab.title("Histogram (sampling) vs Distribution (dots)")
pylab.xlabel('$i$',fontsize=20)
pylab.ylabel('$\Proba(i)$',fontsize=20)
pylab.show()


We can note that the histogram fits very well the theoritical distributions. This can be a sort of proof that our AR procedure is working.

Now we want to implement another kind of procedure : the Tower Sampling. First of all we will need to compute : $P(x < i)$, the cumulative function.


In [3]:
cumu = [0.0] 
for l in range(n_item):
    cumu.append(cumu[l] + Proba[l])    
    
    
#Some graphical illustrations

print ("i   Prob(i)               Prob<i")
for i in range(n_item):
    print (i, ' ',Proba[i], ' ', cumu[i])


i   Prob(i)               Prob<i
0   0.3895127374151168   0.0
1   0.2589629569341132   0.3895127374151168
2   0.17811303769720963   0.64847569434923
3   0.0421764786661813   0.8265887320464397
4   0.13123478928737917   0.868765210712621

Now we can code the TS procedure.


In [4]:
#Procedure in order to achieve the research of the corresponding i
def research(x, cumulative):
    kmin = 0
    kmax = len(cumulative)
    while True:
        k = int((kmin + kmax) / 2)
        if cumulative[k] < x:
            kmin = k
        elif cumulative[k - 1] > x:
            kmax = k
        else:
            return k - 1
        
        
        
def sample_TS(cumulative):
    x = random.random() 
    sample = research(x, cumulative)
    return sample

list_proba2 =[]
for i in range(10000):
    list_proba2.append(sample_TS(cumu))

#Graphical stuff
x = [a for a in range(n_item)]
y = [Proba[a] for a in range(n_item)]
pylab.figure()
pylab.hist(list_proba2, bins=n_item, range=(-0.5, n_item-0.5), normed=True)
pylab.plot(x, y,'ro', ms=10)
pylab.title("Histogram (sampling) vs Distribution (dots)")
pylab.xlabel('$i$',fontsize=20)
pylab.ylabel('$\Proba(i)$',fontsize=20)
pylab.show()


Like with the AR procedure, the sampling seems to work fine. We want now to compute and compare the time used by the two procedures.


In [35]:
n_item = numpy.arange(1,500,5)
time_ar = numpy.zeros(len(n_item))
time_ts = numpy.zeros(len(n_item))


for i,N in enumerate(n_item):
    
    Normalization = 0
    Proba = []
    for j in range(N):
       Proba.append(random.uniform(0,1))
       Normalization=Normalization+Proba[j]

# Now we normalize the distribution
    for j in range(N):
       Proba[j]=Proba[j]/Normalization      
    
    
    cumu = [0.0] 
    for l in range(N):
        cumu.append(cumu[l] + Proba[l])   
    
    # Computing the max value with help of operator module
    index, valeur_max = max(enumerate(Proba), key=operator.itemgetter(1)) 
    start = time.time()
    list_proba =[]
    for j in range(10000):
        list_proba.append(sample_AR(Proba,valeur_max,N))
    end = time.time()
    time_ar[i] =(end - start)
    
    start = time.time()
    list_proba =[]
    for j in range(10000):
        list_proba.append(sample_TS(cumu))
    end = time.time()
    time_ts[i] = (end - start)
      

plt.plot(n_item,time_ar,'bo',label='Time AR')
plt.xlabel('$n_{item}$',fontsize=20)
plt.ylabel('Time used',fontsize=20)
plt.plot(n_item,time_ts,'r+',label='Time TS')
plt.legend(loc=2)
plt.title('Numerically estimate of the time used by the procedure')
plt.show()


We can note that in the case of AR procedure, the time used seems to be a linear function of $n_{item}$. However for the TS procedure it seems to remain constant. For small values of $n_{item}$, the two procedures are equivalent but for large values it's clearly better to chosse the TS procedure.

Why do we have this difference ?

If $n_{item}$ is increasing, the rejection area in the procedure AR is increasing too ( léger shcéma à mettre). That's why we have an increase of time needed. For the procedure TS, we don't have this rejection area, because the box shape a tower, thus there is no reason to have suche a strong increase with $n_{item}$.

The goal now is to observe the several behavior of $T_{used}$ for the AR procedure if we change the initial distribution.


In [ ]:
# Ploting the histogram and theoritical proportions
x = [a for a in range(n_item)]
y = [Proba[a] for a in range(n_item)]
pylab.figure()
pylab.hist(list_proba, bins=n_item, range=(-0.5, n_item-0.5), normed=True)
pylab.plot(x, y,'ro', ms=10)
pylab.title("Histogram (sampling) vs Distribution (dots)")
pylab.xlabel('$i$',fontsize=20)
pylab.ylabel('$\Proba(i)$',fontsize=20)
pylab.show()

In [ ]:
# Exponential distribution

n_item = numpy.arange(1,2000,50)
time_ar = numpy.zeros(len(n_item))
time_ts = numpy.zeros(len(n_item))
time_ar1= numpy.zeros(len(n_item))

for i,N in enumerate(n_item):
    
    Normalization = 0
    Proba = []
    for j in range(N):
       Proba.append(-numpy.log(random.uniform(0,1)))
       Normalization=Normalization+Proba[j]

# Now we normalize the distribution
    for j in range(N):
       Proba[j]=Proba[j]/Normalization      
    
    
    cumu = [0.0] 
    for l in range(N):
        cumu.append(cumu[l] + Proba[l])   
    
    # Computing the max value with help of operator module
    index, valeur_max = max(enumerate(Proba), key=operator.itemgetter(1)) 
    start = time.time()
    list_proba =[]
    for j in range(10000):
        list_proba.append(sample_AR(Proba,valeur_max,N))
    end = time.time()
    time_ar[i] =(end - start)
    
    start = time.time()
    list_proba =[]
    for j in range(10000):
        list_proba.append(sample_TS(cumu))
    end = time.time()
    time_ts[i] = (end - start)
      
for i,N in enumerate(n_item):
    
    Normalization = 0
    Proba2 = []
    for j in range(N):
       Proba2.append(random.uniform(0,1))
       Normalization=Normalization+Proba[j]

# Now we normalize the distribution
    for j in range(N):
       Proba2[j]=Proba[j]/Normalization      
    
    
       
    
    start = time.time()
    list_proba =[]
    for j in range(10000):
        list_proba.append(sample_AR(Proba2,valeur_max,N))
    end = time.time()
    time_ar1[i] =(end - start)
    
    
    
plt.plot(n_item,time_ar,'bo',label='Time AR')
plt.plot(n_item,time_ar1,'go',label='Time AR')
plt.xlabel('$n_{item}$',fontsize=20)
plt.ylabel('Time used',fontsize=20)
plt.plot(n_item,time_ts,'r+',label='Time TS')
plt.legend(loc=2)
plt.title('Numerically estimate of the time used by the procedure')
plt.show()

In [ ]:
# Ploting the histogram and theoritical proportions
x = [a for a in range(n_item)]
y = [Proba[a] for a in range(n_item)]
pylab.figure()
pylab.hist(list_proba, bins=n_item, range=(-0.5, n_item-0.5), normed=True)
pylab.plot(x, y,'ro', ms=10)
pylab.title("Histogram (sampling) vs Distribution (dots)")
pylab.xlabel('$i$',fontsize=20)
pylab.ylabel('$\Proba(i)$',fontsize=20)
pylab.show()

In [28]:
# -1/2 distribution

n_item = numpy.arange(1,100,5)
time_ar = numpy.zeros(len(n_item))
time_ts = numpy.zeros(len(n_item))
time_ar1= numpy.zeros(len(n_item))

for i,N in enumerate(n_item):
    
    Normalization = 0
    Proba = []
    for j in range(N):
       Proba.append((random.uniform(0,1))**(-1/2))
       Normalization=Normalization+Proba[j]

# Now we normalize the distribution
    for j in range(N):
       Proba[j]=Proba[j]/Normalization      
    
    
    cumu = [0.0] 
    for l in range(N):
        cumu.append(cumu[l] + Proba[l])   
    
    
    start = time.time()
    list_proba =[]
    for j in range(10000):
        list_proba.append(sample_AR(Proba,valeur_max,N))
    end = time.time()
    time_ar[i] =(end - start)
    
    start = time.time()
    list_proba =[]
    for j in range(10000):
        list_proba.append(sample_TS(cumu))
    end = time.time()
    time_ts[i] = (end - start)
      
for i,N in enumerate(n_item):
    
    Normalization = 0
    Proba = []
    for j in range(N):
       Proba.append(random.uniform(0,1))
       Normalization=Normalization+Proba[j]

# Now we normalize the distribution
    for j in range(N):
       Proba[j]=Proba[j]/Normalization      
    
    
    cumu = [0.0] 
    for l in range(N):
        cumu.append(cumu[l] + Proba[l])   
    
    
    start = time.time()
    list_proba =[]
    for j in range(10000):
        list_proba.append(sample_AR(Proba,valeur_max,N))
    end = time.time()
    time_ar1[i] =(end - start)
    
    
    
plt.plot(n_item,time_ar,'bo',label='Time AR')
plt.plot(n_item,time_ar1,'go',label='Time AR')
plt.xlabel('$n_{item}$',fontsize=20)
plt.ylabel('Time used',fontsize=20)
plt.plot(n_item,time_ts,'r+',label='Time TS')
plt.legend(loc=2)
plt.title('Numerically estimate of the time used by the procedure')
plt.show()



In [ ]: