Plotting how a diffusion function will alter the GC distribution of fragments

  • Buoyant density profile of heterogeneous DNA:
    • Gaussian distribution
      • mean = 0.098[G+C] + 1.66
      • sigma = sigma[GC] + sigma[diffusion]
      • sigma[diffusion] = 44.5 (kb) / fragment_length

In [37]:
from ggplot import *
import pandas as pd
import itertools
import scipy
import matplotlib.cm as cm

In [7]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['xlim', 'ylim']
`%matplotlib` prevents importing * from pylab and numpy

In [8]:
def diffusion_simple(frag_len):
    return 44.5 / frag_len

In [9]:
frag_lens = pd.DataFrame({'frag_lens' : range(100, 50000, 100)})

In [13]:
ggplot(frag_lens, aes(x='frag_lens')) +\
    stat_function(fun=lambda x: 44 / x, color='blue') +\
    stat_function(fun=lambda x: 44.5 / x, color='red') +\
    stat_function(fun=lambda x: 45 / x, color='green') +\
    geom_vline(xintercept=4000) +\
    labs(x='fragment length', y='sigma of BD distribution')


Out[13]:
<ggplot: (8765531572233)>

Notes:

  • The slight GC bias (44 for G+C of < 30 & 45 for G+C of > 70) doesn't make much of a difference.

Simulating normal distributions for binned fragment lengths


In [133]:
frag_lens = range(4000, 50000, 1000)
sigma = [sqrt(44500.0 / x) for x in frag_lens]

data = pd.DataFrame({'x':np.arange(-10, 11)})
cols = cm.rainbow(np.linspace(0,1,len(frag_lens)))

p = ggplot(data, aes(x='x')) +\
    labs(x='sigma: [%G+C]', y='total')

for sig,col in zip(sigma,cols):
    p = p + stat_function(fun=dnorm, args={'mean':0.0, 'var':sig}, color=col)
    
p


Out[133]:
<ggplot: (8765524166277)>

In [130]:
frag_lens = range(4000, 50000, 1000)
sigma = [sqrt(44500.0 / x) for x in frag_lens]
sigma = [x/100.0 * 0.098 for x in sigma]

data = pd.DataFrame({'x':np.arange(-0.015,0.016,0.001)})
cols = cm.rainbow(np.linspace(0,1,len(frag_lens)))

p = ggplot(data, aes(x='x')) +\
    labs(x='sigma: [BD]', y='total')

for sig,col in zip(sigma,cols):
    p = p + stat_function(fun=dnorm, args={'mean':0.0, 'var':sig}, color=col)
    
p


Out[130]:
<ggplot: (8765523864885)>

lengths of 4-14 kb


In [129]:
frag_lens = range(1000, 14000, 1000)
sigma = [sqrt(44500.0 / x) for x in frag_lens]

data = pd.DataFrame({'x':np.arange(-20,21)})
cols = cm.rainbow(np.linspace(0,1,len(frag_lens)))

p = ggplot(data, aes(x='x')) +\
    labs(x='Gaussian of homogenous G+C fragments, [%G+C]', y='total')

for sig,col in zip(sigma,cols):
    p = p + stat_function(fun=dnorm, args={'mean':0.0, 'var':sig}, color=col)
    
p


Out[129]:
<ggplot: (8765530605633)>

Notes:

  • even at frag size of 4kb, approx. swing of %G+C of +/- 3.35% (BD range of ~0.003)
  • At frag size of 1kb, approx. swing of %G+C of +/- 6.7% (BD range of ~0.006)

Plotting example distributions of fragments of same G+C, but different lengths


In [144]:
frag_lens = [500, 1000, 4000, 10000, 50000]
sigma = [sqrt(44500.0 / x) for x in frag_lens]

GC = [50] * len(frag_lens)


data = pd.DataFrame({'x':np.arange(0,101)})
cols = cm.rainbow(np.linspace(0,1,len(frag_lens)))

p = ggplot(data, aes(x='x')) +\
    labs(x='%GC', y='total') +\
    theme_bw()


for i,BD in enumerate(GC):
    p = p + stat_function(fun=dnorm, args={'mean':GC, 'var':sigma[i]}, color=cols[i])
    
p


Out[144]:
<ggplot: (8765529394821)>

In [143]:
frag_lens = [500, 1000, 4000, 10000, 50000]
sigma = [sqrt(44500.0 / x) for x in frag_lens]
sigma = [x/100.0 * 0.098 for x in sigma]

GC = [0.5] * len(frag_lens)
BD = [x * 0.098 + 1.66 for x in GC]

data = pd.DataFrame({'x':np.arange(1.66,1.76,0.01)})
cols = cm.rainbow(np.linspace(0,1,len(frag_lens)))

p = ggplot(data, aes(x='x')) +\
    labs(x='Buoyant density', y='total') +\
    theme_bw()


for i,BD in enumerate(BD):
    p = p + stat_function(fun=dnorm, args={'mean':BD, 'var':sigma[i]}, color=cols[i])
    
p


Out[143]:
<ggplot: (8765530724689)>

Simulating convolution of G+C heterogeneous fragments & diffusion (lengths: 1-50 kb)


In [178]:
GC_mean = 50
GC_sigma = 5
frag_lens = [1000, 4000, 10000, 50000, 2000000]
frag_len_sigma = [44500.0/x for x in frag_lens]

data = pd.DataFrame({'x':np.arange(0,101)})
cols = cm.rainbow(np.linspace(0,1,len(frag_lens)))

p = ggplot(data, aes(x='x')) +\
    labs(x='%G+C', y='density') +\
    theme_bw()

for i in xrange(len(frag_lens)):
    sigma_total = GC_sigma + frag_len_sigma[i]
    p = p + stat_function(fun=dnorm, args={'mean':GC_mean, 'var':sigma_total}, color=cols[i])
    
p


Out[178]:
<ggplot: (8765522462153)>

In [182]:
# just 4kb vs 2000kb
GC_mean = 50
GC_sigma = 5
frag_lens = [4000,20000000]
frag_len_sigma = [44500.0/x for x in frag_lens]

data = pd.DataFrame({'x':np.arange(0,101)})
cols = cm.rainbow(np.linspace(0,1,len(frag_lens)))

p = ggplot(data, aes(x='x')) +\
    labs(x='%G+C', y='density') +\
    theme_bw()

for i in xrange(len(frag_lens)):
    sigma_total = GC_sigma + frag_len_sigma[i]
    p = p + stat_function(fun=dnorm, args={'mean':GC_mean, 'var':sigma_total}, color=cols[i])
    
p


Out[182]:
<ggplot: (8765529357841)>

Simulating G+C of fragments while accounting for diffusion


In [14]:
def diffusion(GC, frag_len):
    if GC < 30:
        return 44 / frag_len
    elif GC > 70:
        return 45 / frag_len
    else:
        return 44.5 / frag_len

In [27]:
# fragment GC as a normal distribution
n = 10000
frag_mean = 5000
frag_stdev = 1000

gc_vals = np.random.normal(loc=50, scale=10, size=n)
frag_lens = np.random.normal(loc=frag_mean, scale=frag_stdev, size=n)

In [28]:
# getting sigmas for each fragment
sigmas = itertools.starmap(diffusion, zip(gc_vals, frag_lens))
sigmas = [x for x in sigmas]

In [29]:
# sampling a GC value for each normal distribution with fragment
def GC_wDif(GC, sigma):
    return np.random.normal(loc=GC, scale=sigma, size=1)

GC_wDiff = itertools.starmap(GC_wDif, zip(gc_vals, sigmas))
GC_wDiff = [x for x in GC_wDiff]

In [30]:
df = pd.DataFrame({'GC' : gc_vals, 'frag_lens' : frag_lens, 'GC_wDiff' : GC_wDiff})

In [31]:
# plotting
ggplot(df, aes(x='GC', y='GC_wDiff')) +\
    stat_smooth()


Out[31]:
<ggplot: (8765531631653)>

In [20]:
#x_vals = pd.DataFrame({'x' : np.arange(100,10000, 100)})

sigma_vals = [44.5 / x for x in [100, 1000, 10000, 50000, 100000]]

print sigma_vals


data = pd.DataFrame({'x':np.arange(-5,6)})
    
def dnorm(x, mean, var):
    return scipy.stats.norm(mean,var).pdf(x)    
    
ggplot(data, aes(x='x')) +\
    stat_function(fun=dnorm, args={'mean':0.0, 'var':0.445}, color='blue') +\
    stat_function(fun=dnorm, args={'mean':0.0, 'var':0.0445}, color='red') +\
    stat_function(fun=dnorm, args={'mean':0.0, 'var':0.00445}, color='green')


[0.445, 0.0445, 0.00445, 0.00089, 0.000445]
Out[20]:
<ggplot: (8765531222113)>

In [21]:
def gc2bd(gc):
    return 1.66 + gc/100.0*0.098 

gc = np.arange(1,100)
bd = [gc2bd(x) for x in gc]
data = pd.DataFrame({'gc' : gc, 'bd' : bd})

ggplot(data, aes(x='gc', y='bd')) +\
    geom_point()


Out[21]:
<ggplot: (8765531047941)>

In [ ]: