In [2]:
%matplotlib inline
import numpy as np
import pandas as p
import matplotlib.pyplot as plt
import seaborn as sns
import math
import itertools
import random
sns.set_style("white")
sns.set(style="ticks")
from matplotlib import rc
rc('text', usetex=True)
sns.set_color_codes()
In [8]:
import operator as op
def ncr(n, r):
if r > n: return 0
# print n, r, n-r
r = min(r, n-r)
if r == 0: return 1
numer = reduce(op.mul, xrange(n, n-r, -1))
denom = reduce(op.mul, xrange(1, r+1))
return numer//denom
def strandlength_distribution(NumCells,r,MaxLength):
b = r
g = 1/(float(NumCells)+1)
geometric = [( ((b*(1-g))/(b+g*(1-b)))**k * (g/(b+g*(1-b))) ) for k in range(MaxLength)]
geometric.append(1-sum(geometric))
return geometric
def pull_from_strandlength_distribution(N,NumCells,r,MaxLength):
unifs = np.random.uniform(0,1,N)
geometric = strandlength_distribution(NumCells,r,MaxLength)
geometric_cum = np.cumsum(geometric)
lengths = [np.where(unif < geometric_cum)[0][0] for unif in unifs]
return lengths
## Naive method of simulating all strands ##
def old_motif_freq(b,motif,numCells,r,maxLength):
sl_dist = strandlength_distribution(numCells,r,maxLength)
motiffreq = 0
for k in range(len(motif),maxLength+1):
lengthlist = map(''.join, itertools.product('10', repeat=k))
lengthlist = [string for string in lengthlist if motif in string]
strand_probs = [b**(s.count('0'))*(1-b)**(s.count('1')) for s in lengthlist]
motiffreq = motiffreq + sl_dist[k]*sum([strand_probs[s] for s in range(len(lengthlist)) if motif in lengthlist[s]])
return motiffreq, motiffreq/(1-sl_dist[0])
## Faster Brute force method of simulating the strands ##
def motif_freq(b,motif,numCells,r,maxLength):
sl_dist = strandlength_distribution(numCells,r,maxLength)
motiffreq = 0
motif_containing_strands = [motif]
last_length = len(motif)
last_length_list = [motif]
for k in range(1,maxLength+1-len(motif)):
# k is non-motif digits
lengthlist = map(''.join, itertools.product('10', repeat=k))
new_length_0 = [s + '0' for s in last_length_list]
new_length_1 = [s + '1' for s in last_length_list]
new_length = [s + motif for s in lengthlist]
last_length_list = new_length_0 + new_length_1
combined = last_length_list + new_length
last_length_list = list(np.unique(last_length_list + new_length))
motif_containing_strands += last_length_list
strand_probs = [sl_dist[len(s)]*b**(s.count('0'))*(1-b)**(s.count('1')) for s in motif_containing_strands]
motiffreq = sum(strand_probs)
return motiffreq, motiffreq/(1-sl_dist[0])
## Analytical Derivation for non-overlapping motifs ##
def motif_freq_analytical(b,motif,numCells,r,maxLength):
sl_dist = strandlength_distribution(numCells,r,maxLength)
m0 = motif.count('0')
l = len(motif)
motiffreq = 0
for lam in range(l,maxLength+1):
this_length_prob = 0
this_length_prob += (lam-l+1)*b**m0*(1-b)**(l-m0)*sum([ncr(lam-l,k)*b**k*(1-b)**(lam-l-k) for k in range(lam-l+1)])
if np.floor(lam/l) >= 2:
for beta in range(2,np.int(np.floor(lam/l))+1):
this_length_prob += sum([(-1)**(beta+1)*ncr(lam-beta*l+beta,beta)*b**(beta*m0)*(1-b)**(beta*(l-m0))*sum([ncr(lam-beta*l,k)*b**k*(1-b)**(lam-beta*l-k) for k in range(lam-beta*l+1)])])
motiffreq += sl_dist[lam]*this_length_prob
# print motiffreq
return motiffreq, motiffreq/(1-sl_dist[0])
def multiple_overlap_detector(motif):
l = len(motif)
o_len = [0]
o0 = [0]
for iterator in range(1,l):
if motif[:l-iterator] == motif[iterator:]:
o_len.append(l-iterator)
o0.append(motif[:l-iterator].count('0'))
if len(o_len) == 0:
o_len.append(0)
o0.append(0)
return o_len, o0
## Analytical Derivation for motifs with multiple overlaps ##
def motif_freq_analytical_multipleoverlap(b,motif,numCells,r,maxLength):
sl_dist = strandlength_distribution(numCells,r,maxLength)
m0 = motif.count('0')
l = len(motif)
o_len_list, o0_list = multiple_overlap_detector(motif)
motiffreq = 0
for lam in range(l,maxLength+1):
this_length_prob = 0
for o_len, o0 in zip(o_len_list,o0_list):
if o_len == 0:
if np.floor(lam/l) >= 1:
for beta in range(1,np.int(np.floor(lam/l))+1):
this_length_prob += (-1)**(beta+1)*ncr(lam-beta*l+beta,beta)*b**(beta*m0)*(1-b)**(beta*(l-m0))
else:
for alpha in range(1,np.int(np.floor((lam-(l+l-o_len))/l))+2):
if alpha+np.int(np.floor((lam-alpha*l)/(l-o_len))) >= alpha:
for beta in range(alpha+1,alpha+np.int(np.floor((lam-alpha*l)/(l-o_len)))+1):
this_length_prob += (-1)**(beta+1)*(alpha**(beta-alpha)*ncr(lam-alpha*l-(beta-alpha)*(l-o_len)+alpha,alpha))*b**(alpha*m0+(beta-alpha)*(m0-o0))*(1-b)**(alpha*(l-m0)+(beta-alpha)*(l-m0-(o_len-o0)))
motiffreq += sl_dist[lam]*this_length_prob
return motiffreq, motiffreq/(1-sl_dist[0])
def cells_with_motif(b,motif,numCells,r,maxLength,numStrands):
motif_prob = motif_freq_analytical(b,motif,numCells,r,maxLength)[0]
p_cell_with_motif = 1-(1-motif_prob)**numStrands
return p_cell_with_motif
def steady_state_motif_dist(b,motif,numCells,r,maxLength,numStrands):
motif_prob, non_empty_motif_freq = motif_freq_analytical(b,motif,numCells,r,maxLength)
p_cell_with_motif = 1-(1-motif_prob)**numStrands
p_cell_with_motif = cells_with_motif(0.5,motif,numCells,r,maxLength,numStrands)
ss_motif_freq = p_cell_with_motif*non_empty_motif_freq
return ss_motif_freq
In [28]:
ax = plt.figure()
plt.subplot(221)
plt.plot(np.linspace(0.0,1,30),[motif_freq(b,'10000',100,0.05,5)[1] for b in np.linspace(0.0,1,30)],'bo',label='Brute Force Count')
plt.plot(np.linspace(0.0,1,100),[motif_freq_analytical_multipleoverlap(b,'10000',100,0.05,5)[1] for b in np.linspace(0.0,1,100)],'b',label='Analytical')
plt.plot(np.linspace(0.0,1,30),[motif_freq(b,'10000',100,0.05,10)[1] for b in np.linspace(0.0,1,30)],'go',label='Brute Force Count')
plt.plot(np.linspace(0.0,1,100),[motif_freq_analytical_multipleoverlap(b,'10000',100,0.05,10)[1] for b in np.linspace(0.0,1,100)],'g',label='Analytical')
plt.plot(np.linspace(0.0,1,30),[motif_freq(b,'10000',100,0.05,15)[1] for b in np.linspace(0.0,1,30)],'ro',label='Brute Force Count')
plt.plot(np.linspace(0.0,1,100),[motif_freq_analytical_multipleoverlap(b,'10000',100,0.05,15)[1] for b in np.linspace(0.0,1,100)],'r',label='Analytical')
plt.plot(np.linspace(0.0,1,30),[motif_freq(b,'10000',100,0.05,20)[1] for b in np.linspace(0.0,1,30)],'mo',label='Count L=20')
plt.plot(np.linspace(0.0,1,100),[motif_freq_analytical_multipleoverlap(b,'10000',100,0.05,20)[1] for b in np.linspace(0.0,1,100)],'m',label='Analytical L=20')
plt.ylabel('Prob. strand has motif')
plt.title('10000')
plt.subplot(222)
# plt.figure()
plt.plot(np.linspace(0.0,1,30),[motif_freq(b,'10101',100,0.05,5)[1] for b in np.linspace(0.0,1,30)],'bo',label='Brute Force Count')
plt.plot(np.linspace(0.0,1,100),[motif_freq_analytical_multipleoverlap(b,'10101',100,0.05,5)[1] for b in np.linspace(0.0,1,100)],'b',label='Analytical')
plt.plot(np.linspace(0.0,1,30),[motif_freq(b,'10101',100,0.05,10)[1] for b in np.linspace(0.0,1,30)],'go',label='Brute Force Count')
plt.plot(np.linspace(0.0,1,100),[motif_freq_analytical_multipleoverlap(b,'10101',100,0.05,10)[1] for b in np.linspace(0.0,1,100)],'g',label='Analytical')
plt.plot(np.linspace(0.0,1,30),[motif_freq(b,'10101',100,0.05,15)[1] for b in np.linspace(0.0,1,30)],'ro',label='Brute Force Count')
plt.plot(np.linspace(0.0,1,100),[motif_freq_analytical_multipleoverlap(b,'10101',100,0.05,15)[1] for b in np.linspace(0.0,1,100)],'r',label='Analytical')
plt.plot(np.linspace(0.0,1,30),[motif_freq(b,'10101',100,0.05,20)[1] for b in np.linspace(0.0,1,30)],'mo',label='Count L=20')
plt.plot(np.linspace(0.0,1,100),[motif_freq_analytical_multipleoverlap(b,'10101',100,0.05,20)[1] for b in np.linspace(0.0,1,100)],'m',label='Analytical L=20')
plt.title('10101')
# plt.figure()
plt.subplot(223)
plt.plot(np.linspace(0.0,1,30),[motif_freq(b,'101',100,0.05,5)[1] for b in np.linspace(0.0,1,30)],'bo',label='Brute Force Count')
plt.plot(np.linspace(0.0,1,100),[motif_freq_analytical_multipleoverlap(b,'101',100,0.05,5)[1] for b in np.linspace(0.0,1,100)],'b',label='Analytical')
plt.plot(np.linspace(0.0,1,30),[motif_freq(b,'101',100,0.05,10)[1] for b in np.linspace(0.0,1,30)],'go',label='Brute Force Count')
plt.plot(np.linspace(0.0,1,100),[motif_freq_analytical_multipleoverlap(b,'101',100,0.05,10)[1] for b in np.linspace(0.0,1,100)],'g',label='Analytical')
plt.plot(np.linspace(0.0,1,30),[motif_freq(b,'101',100,0.05,15)[1] for b in np.linspace(0.0,1,30)],'ro',label='Brute Force Count')
plt.plot(np.linspace(0.0,1,100),[motif_freq_analytical_multipleoverlap(b,'101',100,0.05,15)[1] for b in np.linspace(0.0,1,100)],'r',label='Analytical')
plt.plot(np.linspace(0.0,1,30),[motif_freq(b,'101',100,0.05,20)[1] for b in np.linspace(0.0,1,30)],'mo',label='Count L=20')
plt.plot(np.linspace(0.0,1,100),[motif_freq_analytical_multipleoverlap(b,'101',100,0.05,20)[1] for b in np.linspace(0.0,1,100)],'m',label='Analytical L=20')
plt.title('101')
plt.ylabel('Prob. strand has motif')
plt.xlabel('$b$')
# plt.figure()
ax = plt.subplot(224)
plt.plot(np.linspace(0.0,1,30),[motif_freq(b,'11001',100,0.05,5)[1] for b in np.linspace(0.0,1,30)],'bo',label='Count L=5')
plt.plot(np.linspace(0.0,1,100),[motif_freq_analytical_multipleoverlap(b,'11001',100,0.05,5)[1] for b in np.linspace(0.0,1,100)],'b',label='Analytical L=5')
plt.plot(np.linspace(0.0,1,30),[motif_freq(b,'11001',100,0.05,10)[1] for b in np.linspace(0.0,1,30)],'go',label='Count L=10')
plt.plot(np.linspace(0.0,1,100),[motif_freq_analytical_multipleoverlap(b,'11001',100,0.05,10)[1] for b in np.linspace(0.0,1,100)],'g',label='Analytical L=10')
plt.plot(np.linspace(0.0,1,30),[motif_freq(b,'11001',100,0.05,15)[1] for b in np.linspace(0.0,1,30)],'ro',label='Count L=15')
plt.plot(np.linspace(0.0,1,100),[motif_freq_analytical_multipleoverlap(b,'11001',100,0.05,15)[1] for b in np.linspace(0.0,1,100)],'r',label='Analytical L=15')
plt.plot(np.linspace(0.0,1,30),[motif_freq(b,'11001',100,0.05,20)[1] for b in np.linspace(0.0,1,30)],'mo',label='Count L=20')
plt.plot(np.linspace(0.0,1,100),[motif_freq_analytical_multipleoverlap(b,'11001',100,0.05,20)[1] for b in np.linspace(0.0,1,100)],'m',label='Analytical L=20')
# ax.text(-0.1, 0, 'd',
# size=20, weight='bold')
plt.title('11001')
plt.xlabel('$b$')
# ax.axis["right"].set_visible(False)
# ax.axis["top"].set_visible(False)
sns.despine(top=True,right=True)
plt.legend(loc='center left',ncol=1,fontsize=13,bbox_to_anchor=(1, 1.2))
plt.savefig('analytical_comparison_many.pdf',bbox_inches='tight')
In [14]:
multiple_overlap_detector('0000')
Out[14]: