In [13]:
from __future__ import division
import numpy as np

def simulate_genome(length, alpha):
    # Generate a binary genome string in which each position is 1 with
    # probability alpha and 0 with probability 1 - alpha.
    read_arr = np.random.choice([0, 1], length, p=[1 - alpha, alpha])
    return ''.join(str(x) for x in read_arr)
def count_occurrences(word, read):
    # str.count returns the number of non-overlapping occurrences of word.
    return read.count(word)
NN = 100000           # number of reads per iteration
L = 100               # read length
alpha = 0.5           # probability of a 1 at each genome position
max_iterations = 1000
word = '101'
nn = 100000           # genome length
c = NN * L / nn       # coverage

counts = {}
counts[nn] = []
for i in range(max_iterations):
    genome = simulate_genome(nn, alpha)
    # Valid start positions for a read of length L are 0 .. nn - L.
    read_start_positions = np.random.choice(nn - L + 1, NN)
    occurrences = 0
    for p in read_start_positions:
        read = genome[p:p + L]
        occurrences += count_occurrences(word, read)
    counts[nn].append(occurrences)
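One subtlety in the counting step: Python's str.count returns non-overlapping matches, so a read like '10101' contributes only one occurrence of '101' even though two overlapping copies are present. If overlapping occurrences should be counted instead, a minimal sketch of an overlap-aware counter (the name count_overlapping is mine, not from the original analysis) is:

def count_overlapping(word, read):
    # Advance the search start by one position after each match so that
    # overlapping occurrences are counted as well.
    count, start = 0, 0
    while True:
        start = read.find(word, start) + 1
        if start > 0:
            count += 1
        else:
            return count

For example, count_overlapping('101', '10101') returns 2, while '10101'.count('101') returns 1.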
In [20]:
a = alpha
b = 1 - alpha
# Analytical expression for Var(Y_n)/n^2 at coverage c.
var = c**2 * (a**2 * b * (a + b - 3 * a**2 * b + 2 * a * b**2 + 2 * a * b + 2 * b**2)) / (nn * (a + b)**2)
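Written out in full (a direct transcription of the expression above, with $a = \alpha$ and $b = 1 - \alpha$):

$$\frac{\mathrm{Var}(Y_n)}{n^2} = \frac{c^2\, a^2 b \left(a + b - 3a^2 b + 2ab^2 + 2ab + 2b^2\right)}{n\,(a + b)^2}$$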
In [25]:
print 'Yn/n\t Simulated: {}\t Analytical: {}'.format(np.mean(counts[nn]) / nn, c * alpha**2 * (1 - alpha))
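As a quick sanity check on the analytical value, assuming independent positions (my reading of the simulation above): the word '101' starts at any given position with probability $\alpha \cdot (1-\alpha) \cdot \alpha = \alpha^2(1-\alpha)$, and each of the $NN$ reads contains roughly $L$ candidate start positions, so

$$\frac{E[Y_n]}{n} \approx \frac{NN \cdot L \cdot \alpha^2(1-\alpha)}{n} = c\,\alpha^2(1-\alpha),$$

which is the analytical expression printed above.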
In [26]:
print 'Var(Yn)/n^2\t Simulated: {}\t Analytical: {}'.format(np.var(counts[nn]) / nn**2, var)
Thus, as seen above, our analytical and simulated estimates of $\frac{Y_n}{n}$ are 12.83 and 9.83 respectively. I used 1000 iterations, so the agreement should improve as the number of iterations increases. Similarly, the estimate for $\frac{\mathrm{Var}(Y_n)}{n^2}$ is reasonably close (simulated: 0.006, analytical: 0.02).