# Plotting how a diffusion function will alter the GC distribution of fragments

• Buoyant density profile of heterogeneous DNA:
• Gaussian distribution
• mean = 0.098[G+C] + 1.66
• sigma = sigma[GC] + sigma[diffusion]
• sigma[diffusion] = 44.5 (kb) / fragment_length
``````

In [188]:

from ggplot import *
import pandas as pd
import itertools
import scipy
import matplotlib.cm as cm

``````
``````

In [189]:

%pylab inline

``````
``````

Populating the interactive namespace from numpy and matplotlib

WARNING: pylab import has clobbered these variables: ['xlim', 'ylim']
`%matplotlib` prevents importing * from pylab and numpy

``````
``````

In [190]:

frag_lens = pd.DataFrame({'frag_lens' : range(100, 50000, 100)})

``````
``````

In [199]:

ggplot(frag_lens, aes(x='frag_lens')) +\
stat_function(fun=lambda x: sqrt(44000.0 / x), color='blue') +\
stat_function(fun=lambda x: sqrt(44500.0 / x), color='red') +\
stat_function(fun=lambda x: sqrt(45000.0 / x), color='green') +\
geom_vline(xintercept=4000, linetype='dashed', alpha=0.5) +\
labs(x='fragment length', y='sigma of G+C distribution')

``````
``````

Out[199]:

<ggplot: (8765522224041)>

``````

Notes:

• The slight GC bias (44 for G+C of < 30 & 45 for G+C of > 70) doesn't make much of a difference.

# Simulating normal distributions for binned fragment lengths

``````

In [203]:

frag_lens = range(4000, 50000, 1000)
sigma = [sqrt(44500.0 / x) for x in frag_lens]

data = pd.DataFrame({'x':np.arange(-10, 11)})
cols = cm.rainbow(np.linspace(0,1,len(frag_lens)))

p = ggplot(data, aes(x='x')) +\
labs(x='Variance caused by diffusion (%G+C)', y='density')

for sig,col in zip(sigma,cols):
p = p + stat_function(fun=dnorm, args={'mean':0.0, 'var':sig}, color=col)

p

``````
``````

Out[203]:

<ggplot: (8765522493597)>

``````
``````

In [186]:

frag_lens = range(4000, 50000, 1000)
sigma = [sqrt(44500.0 / x) for x in frag_lens]
sigma = [x/100.0 * 0.098 for x in sigma]

data = pd.DataFrame({'x':np.arange(-0.015,0.016,0.001)})
cols = cm.rainbow(np.linspace(0,1,len(frag_lens)))

p = ggplot(data, aes(x='x')) +\
labs(x='Variance caused by diffusion: [BD]', y='total')

for sig,col in zip(sigma,cols):
p = p + stat_function(fun=dnorm, args={'mean':0.0, 'var':sig}, color=col)

p

``````
``````

Out[186]:

<ggplot: (8765529357833)>

``````

### lengths of 4-14 kb

``````

In [185]:

frag_lens = range(1000, 14000, 1000)
sigma = [sqrt(44500.0 / x) for x in frag_lens]

data = pd.DataFrame({'x':np.arange(-20,21)})
cols = cm.rainbow(np.linspace(0,1,len(frag_lens)))

p = ggplot(data, aes(x='x')) +\
labs(x='Variance caused by diffusion (%G+C)', y='density')

for sig,col in zip(sigma,cols):
p = p + stat_function(fun=dnorm, args={'mean':0.0, 'var':sig}, color=col)

p

``````
``````

Out[185]:

<ggplot: (8765522860309)>

``````

Notes:

• even at frag size of 4kb, approx. swing of %G+C of +/- 3.35% (BD range of ~0.003)
• At frag size of 1kb, approx. swing of %G+C of +/- 6.7% (BD range of ~0.006)

## Plotting example distributions of fragments of same G+C, but different lengths

``````

In [144]:

frag_lens = [500, 1000, 4000, 10000, 50000]
sigma = [sqrt(44500.0 / x) for x in frag_lens]

GC = [50] * len(frag_lens)

data = pd.DataFrame({'x':np.arange(0,101)})
cols = cm.rainbow(np.linspace(0,1,len(frag_lens)))

p = ggplot(data, aes(x='x')) +\
labs(x='%GC', y='total') +\
theme_bw()

for i,BD in enumerate(GC):
p = p + stat_function(fun=dnorm, args={'mean':GC, 'var':sigma[i]}, color=cols[i])

p

``````
``````

Out[144]:

<ggplot: (8765529394821)>

``````
``````

In [143]:

frag_lens = [500, 1000, 4000, 10000, 50000]
sigma = [sqrt(44500.0 / x) for x in frag_lens]
sigma = [x/100.0 * 0.098 for x in sigma]

GC = [0.5] * len(frag_lens)
BD = [x * 0.098 + 1.66 for x in GC]

data = pd.DataFrame({'x':np.arange(1.66,1.76,0.01)})
cols = cm.rainbow(np.linspace(0,1,len(frag_lens)))

p = ggplot(data, aes(x='x')) +\
labs(x='Buoyant density', y='total') +\
theme_bw()

for i,BD in enumerate(BD):
p = p + stat_function(fun=dnorm, args={'mean':BD, 'var':sigma[i]}, color=cols[i])

p

``````
``````

Out[143]:

<ggplot: (8765530724689)>

``````

# Simulating convolution of G+C heterogeneous fragments & diffusion (lengths: 1-50 kb)

``````

In [178]:

GC_mean = 50
GC_sigma = 5
frag_lens = [1000, 4000, 10000, 50000, 2000000]
frag_len_sigma = [44500.0/x for x in frag_lens]

data = pd.DataFrame({'x':np.arange(0,101)})
cols = cm.rainbow(np.linspace(0,1,len(frag_lens)))

p = ggplot(data, aes(x='x')) +\
labs(x='%G+C', y='density') +\
theme_bw()

for i in xrange(len(frag_lens)):
sigma_total = GC_sigma + frag_len_sigma[i]
p = p + stat_function(fun=dnorm, args={'mean':GC_mean, 'var':sigma_total}, color=cols[i])

p

``````
``````

Out[178]:

<ggplot: (8765522462153)>

``````
``````

In [182]:

# just 4kb vs 2000kb
GC_mean = 50
GC_sigma = 5
frag_lens = [4000,20000000]
frag_len_sigma = [44500.0/x for x in frag_lens]

data = pd.DataFrame({'x':np.arange(0,101)})
cols = cm.rainbow(np.linspace(0,1,len(frag_lens)))

p = ggplot(data, aes(x='x')) +\
labs(x='%G+C', y='density') +\
theme_bw()

for i in xrange(len(frag_lens)):
sigma_total = GC_sigma + frag_len_sigma[i]
p = p + stat_function(fun=dnorm, args={'mean':GC_mean, 'var':sigma_total}, color=cols[i])

p

``````
``````

Out[182]:

<ggplot: (8765529357841)>

``````

# Simulating G+C of fragments while accounting for diffusion

``````

In [14]:

def diffusion(GC, frag_len):
if GC < 30:
return 44 / frag_len
elif GC > 70:
return 45 / frag_len
else:
return 44.5 / frag_len

``````
``````

In [27]:

# fragment GC as a normal distribution
n = 10000
frag_mean = 5000
frag_stdev = 1000

gc_vals = np.random.normal(loc=50, scale=10, size=n)
frag_lens = np.random.normal(loc=frag_mean, scale=frag_stdev, size=n)

``````
``````

In [28]:

# getting sigmas for each fragment
sigmas = itertools.starmap(diffusion, zip(gc_vals, frag_lens))
sigmas = [x for x in sigmas]

``````
``````

In [29]:

# sampling a GC value for each normal distribution with fragment
def GC_wDif(GC, sigma):
return np.random.normal(loc=GC, scale=sigma, size=1)

GC_wDiff = itertools.starmap(GC_wDif, zip(gc_vals, sigmas))
GC_wDiff = [x for x in GC_wDiff]

``````
``````

In [30]:

df = pd.DataFrame({'GC' : gc_vals, 'frag_lens' : frag_lens, 'GC_wDiff' : GC_wDiff})

``````
``````

In [31]:

# plotting
ggplot(df, aes(x='GC', y='GC_wDiff')) +\
stat_smooth()

``````
``````

Out[31]:

<ggplot: (8765531631653)>

``````
``````

In [20]:

#x_vals = pd.DataFrame({'x' : np.arange(100,10000, 100)})

sigma_vals = [44.5 / x for x in [100, 1000, 10000, 50000, 100000]]

print sigma_vals

data = pd.DataFrame({'x':np.arange(-5,6)})

def dnorm(x, mean, var):
return scipy.stats.norm(mean,var).pdf(x)

ggplot(data, aes(x='x')) +\
stat_function(fun=dnorm, args={'mean':0.0, 'var':0.445}, color='blue') +\
stat_function(fun=dnorm, args={'mean':0.0, 'var':0.0445}, color='red') +\
stat_function(fun=dnorm, args={'mean':0.0, 'var':0.00445}, color='green')

``````
``````

[0.445, 0.0445, 0.00445, 0.00089, 0.000445]

Out[20]:

<ggplot: (8765531222113)>

``````
``````

In [21]:

def gc2bd(gc):
return 1.66 + gc/100.0*0.098

gc = np.arange(1,100)
bd = [gc2bd(x) for x in gc]
data = pd.DataFrame({'gc' : gc, 'bd' : bd})

ggplot(data, aes(x='gc', y='bd')) +\
geom_point()

``````
``````

Out[21]:

<ggplot: (8765531047941)>

``````
``````

In [ ]:

``````