First we need to generate a discrete distributions of $n_{item} = 5$ elements, each weights is generated from a random uniform distribution. We will add that this distribution will be normalized.
In [1]:
# Some useful package
import random
import numpy
import math
import time
import operator
import pylab
import matplotlib.pyplot as plt
%matplotlib inline
n_item = 5
#First of all : generate the random distribution
Normalization = 0
Proba = []
for i in range(n_item):
Proba.append(random.uniform(0,1))
Normalization=Normalization+Proba[i]
# Now we normalize the distribution
for i in range(n_item):
Proba[i]=Proba[i]/Normalization
#Some graphical informations
print ("i Prob(i) ")
for i in range(n_item):
print (i, ' ',Proba[i])
Now that we have define our distribution, we can write the "accept/reject" procedure. The goal is to sample from the distribution.
In [2]:
def sample_AR(proba,valeu_max,N):
# N will be n_item
ok=False
xr=0
while ok==False :
# Definition of the several squares
x = random.uniform(0,N)
y = random.uniform(0,valeu_max)
xr = int(math.floor(x))
if Proba[xr]>y :
ok=True
return xr
# Computing the max value with help of operator module
index, valeur_max = max(enumerate(Proba), key=operator.itemgetter(1))
list_proba =[]
for i in range(10000):
list_proba.append(sample_AR(Proba,valeur_max,n_item))
# Ploting the histogram and theoritical proportions
x = [a for a in range(n_item)]
y = [Proba[a] for a in range(n_item)]
pylab.figure()
pylab.hist(list_proba, bins=n_item, range=(-0.5, n_item-0.5), normed=True)
pylab.plot(x, y,'ro', ms=10)
pylab.title("Histogram (sampling) vs Distribution (dots)")
pylab.xlabel('$i$',fontsize=20)
pylab.ylabel('$\Proba(i)$',fontsize=20)
pylab.show()
We can note that the histogram fits very well the theoritical distributions. This can be a sort of proof that our AR procedure is working.
Now we want to implement another kind of procedure : the Tower Sampling. First of all we will need to compute : $P(x < i)$, the cumulative function.
In [3]:
cumu = [0.0]
for l in range(n_item):
cumu.append(cumu[l] + Proba[l])
#Some graphical illustrations
print ("i Prob(i) Prob<i")
for i in range(n_item):
print (i, ' ',Proba[i], ' ', cumu[i])
Now we can code the TS procedure.
In [4]:
#Procedure in order to achieve the research of the corresponding i
def research(x, cumulative):
kmin = 0
kmax = len(cumulative)
while True:
k = int((kmin + kmax) / 2)
if cumulative[k] < x:
kmin = k
elif cumulative[k - 1] > x:
kmax = k
else:
return k - 1
def sample_TS(cumulative):
x = random.random()
sample = research(x, cumulative)
return sample
list_proba2 =[]
for i in range(10000):
list_proba2.append(sample_TS(cumu))
#Graphical stuff
x = [a for a in range(n_item)]
y = [Proba[a] for a in range(n_item)]
pylab.figure()
pylab.hist(list_proba2, bins=n_item, range=(-0.5, n_item-0.5), normed=True)
pylab.plot(x, y,'ro', ms=10)
pylab.title("Histogram (sampling) vs Distribution (dots)")
pylab.xlabel('$i$',fontsize=20)
pylab.ylabel('$\Proba(i)$',fontsize=20)
pylab.show()
Like with the AR procedure, the sampling seems to work fine. We want now to compute and compare the time used by the two procedures.
In [35]:
n_item = numpy.arange(1,500,5)
time_ar = numpy.zeros(len(n_item))
time_ts = numpy.zeros(len(n_item))
for i,N in enumerate(n_item):
Normalization = 0
Proba = []
for j in range(N):
Proba.append(random.uniform(0,1))
Normalization=Normalization+Proba[j]
# Now we normalize the distribution
for j in range(N):
Proba[j]=Proba[j]/Normalization
cumu = [0.0]
for l in range(N):
cumu.append(cumu[l] + Proba[l])
# Computing the max value with help of operator module
index, valeur_max = max(enumerate(Proba), key=operator.itemgetter(1))
start = time.time()
list_proba =[]
for j in range(10000):
list_proba.append(sample_AR(Proba,valeur_max,N))
end = time.time()
time_ar[i] =(end - start)
start = time.time()
list_proba =[]
for j in range(10000):
list_proba.append(sample_TS(cumu))
end = time.time()
time_ts[i] = (end - start)
plt.plot(n_item,time_ar,'bo',label='Time AR')
plt.xlabel('$n_{item}$',fontsize=20)
plt.ylabel('Time used',fontsize=20)
plt.plot(n_item,time_ts,'r+',label='Time TS')
plt.legend(loc=2)
plt.title('Numerically estimate of the time used by the procedure')
plt.show()
We can note that in the case of AR procedure, the time used seems to be a linear function of $n_{item}$. However for the TS procedure it seems to remain constant. For small values of $n_{item}$, the two procedures are equivalent but for large values it's clearly better to chosse the TS procedure.
If $n_{item}$ is increasing, the rejection area in the procedure AR is increasing too ( léger shcéma à mettre). That's why we have an increase of time needed. For the procedure TS, we don't have this rejection area, because the box shape a tower, thus there is no reason to have suche a strong increase with $n_{item}$.
The goal now is to observe the several behavior of $T_{used}$ for the AR procedure if we change the initial distribution.
In [ ]:
# Ploting the histogram and theoritical proportions
x = [a for a in range(n_item)]
y = [Proba[a] for a in range(n_item)]
pylab.figure()
pylab.hist(list_proba, bins=n_item, range=(-0.5, n_item-0.5), normed=True)
pylab.plot(x, y,'ro', ms=10)
pylab.title("Histogram (sampling) vs Distribution (dots)")
pylab.xlabel('$i$',fontsize=20)
pylab.ylabel('$\Proba(i)$',fontsize=20)
pylab.show()
In [ ]:
# Exponential distribution
n_item = numpy.arange(1,2000,50)
time_ar = numpy.zeros(len(n_item))
time_ts = numpy.zeros(len(n_item))
time_ar1= numpy.zeros(len(n_item))
for i,N in enumerate(n_item):
Normalization = 0
Proba = []
for j in range(N):
Proba.append(-numpy.log(random.uniform(0,1)))
Normalization=Normalization+Proba[j]
# Now we normalize the distribution
for j in range(N):
Proba[j]=Proba[j]/Normalization
cumu = [0.0]
for l in range(N):
cumu.append(cumu[l] + Proba[l])
# Computing the max value with help of operator module
index, valeur_max = max(enumerate(Proba), key=operator.itemgetter(1))
start = time.time()
list_proba =[]
for j in range(10000):
list_proba.append(sample_AR(Proba,valeur_max,N))
end = time.time()
time_ar[i] =(end - start)
start = time.time()
list_proba =[]
for j in range(10000):
list_proba.append(sample_TS(cumu))
end = time.time()
time_ts[i] = (end - start)
for i,N in enumerate(n_item):
Normalization = 0
Proba2 = []
for j in range(N):
Proba2.append(random.uniform(0,1))
Normalization=Normalization+Proba[j]
# Now we normalize the distribution
for j in range(N):
Proba2[j]=Proba[j]/Normalization
start = time.time()
list_proba =[]
for j in range(10000):
list_proba.append(sample_AR(Proba2,valeur_max,N))
end = time.time()
time_ar1[i] =(end - start)
plt.plot(n_item,time_ar,'bo',label='Time AR')
plt.plot(n_item,time_ar1,'go',label='Time AR')
plt.xlabel('$n_{item}$',fontsize=20)
plt.ylabel('Time used',fontsize=20)
plt.plot(n_item,time_ts,'r+',label='Time TS')
plt.legend(loc=2)
plt.title('Numerically estimate of the time used by the procedure')
plt.show()
In [ ]:
# Ploting the histogram and theoritical proportions
x = [a for a in range(n_item)]
y = [Proba[a] for a in range(n_item)]
pylab.figure()
pylab.hist(list_proba, bins=n_item, range=(-0.5, n_item-0.5), normed=True)
pylab.plot(x, y,'ro', ms=10)
pylab.title("Histogram (sampling) vs Distribution (dots)")
pylab.xlabel('$i$',fontsize=20)
pylab.ylabel('$\Proba(i)$',fontsize=20)
pylab.show()
In [28]:
# -1/2 distribution
n_item = numpy.arange(1,100,5)
time_ar = numpy.zeros(len(n_item))
time_ts = numpy.zeros(len(n_item))
time_ar1= numpy.zeros(len(n_item))
for i,N in enumerate(n_item):
Normalization = 0
Proba = []
for j in range(N):
Proba.append((random.uniform(0,1))**(-1/2))
Normalization=Normalization+Proba[j]
# Now we normalize the distribution
for j in range(N):
Proba[j]=Proba[j]/Normalization
cumu = [0.0]
for l in range(N):
cumu.append(cumu[l] + Proba[l])
start = time.time()
list_proba =[]
for j in range(10000):
list_proba.append(sample_AR(Proba,valeur_max,N))
end = time.time()
time_ar[i] =(end - start)
start = time.time()
list_proba =[]
for j in range(10000):
list_proba.append(sample_TS(cumu))
end = time.time()
time_ts[i] = (end - start)
for i,N in enumerate(n_item):
Normalization = 0
Proba = []
for j in range(N):
Proba.append(random.uniform(0,1))
Normalization=Normalization+Proba[j]
# Now we normalize the distribution
for j in range(N):
Proba[j]=Proba[j]/Normalization
cumu = [0.0]
for l in range(N):
cumu.append(cumu[l] + Proba[l])
start = time.time()
list_proba =[]
for j in range(10000):
list_proba.append(sample_AR(Proba,valeur_max,N))
end = time.time()
time_ar1[i] =(end - start)
plt.plot(n_item,time_ar,'bo',label='Time AR')
plt.plot(n_item,time_ar1,'go',label='Time AR')
plt.xlabel('$n_{item}$',fontsize=20)
plt.ylabel('Time used',fontsize=20)
plt.plot(n_item,time_ts,'r+',label='Time TS')
plt.legend(loc=2)
plt.title('Numerically estimate of the time used by the procedure')
plt.show()
In [ ]: