In [1]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import binom
In [51]:
'''Computes the probability mass function from the cumulative density'''
def cdf_to_pmf(X):
return np.concatenate(([X[0]], np.diff(X)))
def pmf_to_cdf(X):
return X.cumsum()
'''Computes the cumulative density function of min{X,Y}'''
def cdf_min(X,Y):
return 1 - (1-X)*(1-Y)
'''Returns the pmf of the sum of two pmfs'''
def sum_cdf(X,Y):
X_pmf = cdf_to_pmf(X)
Y_pmf = cdf_to_pmf(Y)
return np.convolve(X_pmf,Y_pmf).cumsum()
def sum_pmf(X,Y):
return np.convolve(X,Y)
'''Constructs the cdf for the shift distance t with paramter n and p'''
def shift_distance_cdf(t, n, p):
S = binom.cdf(range(n), n-1, p)
S = np.concatenate(([0]*t*2, S))
return S
'''Computes the cdf of the minimum of several cdfs'''
def cdf_min_a(M):
m = M.shape[0]
X = M[0,:]
for i in range(1,m):
X = cdf_min(X, M[i,:])
return X
'''Computes the distribution of the shift distance for strings with length n'''
def sd_cdf(n, p):
Ks = range(n+1)
t_max = int(np.floor(n/2))
# start with the distribution of t=0, that is, the Hamming distance
SD_ts = np.array([binom.cdf(Ks, n, p)])
for t in range(1,t_max+1):
cdf_sdt = np.pad(binom.cdf(Ks, n-t, p), [2*t,0], 'constant', constant_values=[0])[:-2*t]
SD_ts = np.vstack([SD_ts, cdf_sdt])
return cdf_min_a(SD_ts)
def recursive_sd_cdf(n, p):
if (n <= 1):
return (np.array([1-p,1]))
if (n == 2):
return (np.array([(1-p)**2, 1-p*p, 1]))
else:
sd_n = sd_cdf(n,3/4)
n_2 = int(np.floor(n/2))
cdf_n_2 = recursive_sd_cdf(n_2,p)
cdf_rec = sum_cdf(cdf_n_2, cdf_n_2)
cdf = cdf_min(cdf_rec, sd_n)
return cdf
In [3]:
for k in range(11):
n = np.power(2,k)
Ks = np.arange(n+1)
cdf = recursive_sd_cdf(n,3/4)
print("{0}\t{1}".format(n, np.sum(Ks*cdf_to_pmf(cdf))/n))
In [4]:
for n in range(2,128,2):
SD_n = sd_cdf(n,3/4)
SD_n_2 = sd_cdf(int(n/2), 3/4)
SD_rec = sum_cdf(SD_n_2, SD_n_2)
cdf = cdf_min(SD_n, SD_rec)
tau_n = np.sum(cdf_to_pmf(cdf)*np.arange(n+1)) / n
print("{0}\t{1}".format(n,tau_n))
In [5]:
p = 3/4
ns = [16]
for n in ns:
Ks = range(n+1)
X = binom.cdf(range(n+1), n, p)
Z = cdf_min(X,X)
X_pmf = cdf_to_pmf(X)
Z_pmf = cdf_to_pmf(Z)
print("E[X] = {0}".format(np.sum(X_pmf*Ks)))
print("E[Z] = {0}".format(np.sum(Z_pmf*Ks)))
plt.plot(Ks, X_pmf, label='X')
plt.plot(Ks, Z_pmf, label='min{X,X}')
plt.legend()
In [6]:
ns = range(16,4099,32)
As = []
for n in ns:
Ks = range(n+1)
SDs = binom.cdf(Ks, n, p)
ts = [1,2]
for t in ts:
St = shift_distance_cdf(t, n, p)
St = St[:n+1]
Z = cdf_min(St, St)
SDs = np.vstack((SDs, Z))
St_pmf = cdf_to_pmf(St)
Z_pmf = cdf_to_pmf(Z)
SD_cdf = cdf_min_a(SDs)
SD_pmf = cdf_to_pmf(SD_cdf)
#print("n = {0}\n E[SD] = {1:.2f}\n E[X] = {2}".format(n, np.sum(Ks*SD_pmf), n*p))
a_n = np.sum(Ks*SD_pmf)/n
As.append(a_n)
#print(" alpha_n = {0}".format(a_n))
plt.plot(ns,As)
Out[6]:
In [94]:
## Matrix of distributions
n = 8
p = 3/4
M = np.zeros([n+1,n+1,n+1])
# on celles (0,i) and (i,0) the content is deterministc
for i in range(n+1):
M[0,i,i] = 1
M[i,0,i] = 1
# it is convinient to have the distributions [1-p,p] and [0, 1] once for all
delta_d = np.zeros(n+1)
delta_d[0] = 1-p
delta_d[1] = p
one_d = np.zeros(n+1)
one_d[1] = 1
# M[i,j,:] represents the distribution on cell (i,j)
for i in range(1,n+1):
for j in range(1,n+1):
# form the distribution of the 3 cells
diag = pmf_to_cdf(sum_pmf(M[i-1,j-1,:], delta_d)[:n+1])
horiz = pmf_to_cdf(sum_pmf(M[i-1,j,:], one_d)[:n+1])
vert = pmf_to_cdf(sum_pmf(M[i,j-1,:], one_d)[:n+1])
min_hv = cdf_min(horiz, vert)
M[i,j,:] = cdf_to_pmf(cdf_min(diag, min_hv))
#M[i,j,:] = cdf_to_pmf(cdf_min(M[i-1,j-1,:], diag))
In [97]:
for i in range(n+1):
dist = M[i,i,:]
print("{0}\t{1:.6f}".format(i, np.sum(dist*np.arange(n+1))/i))
M_Mean = np.zeros([n+1,n+1])
for i in range(n+1):
for j in range(n+1):
M_Mean[i,j] = np.sum(M[i,j,:]*np.arange(n+1))
M_Mean
Out[97]:
In [98]:
np.savetxt("/tmp/dist_exp.txt", M_Mean, fmt="%.4f")
In [ ]:
In [ ]: