# Description:

• Playing around with convolution
``````

In [74]:

import numpy as np
from ggplot import *
import matplotlib as plt
import pandas as pd
import scipy

``````
``````

``````
``````

In [75]:

%pylab inline

``````
``````

Populating the interactive namespace from numpy and matplotlib

WARNING: pylab import has clobbered these variables: ['plt', 'xlim', 'ylim']
`%matplotlib` prevents importing * from pylab and numpy

``````

## convolution of 2 normal distributions

``````

In [76]:

n = 10000
d1 = np.random.normal(loc=50, scale=1, size=n)
d2 = np.random.normal(loc=0, scale=0.01, size=n)  # err

df = pd.DataFrame({'d1':d1, 'd2':d2})

``````
``````

In [77]:

p1 = ggplot(df, aes(d1)) +\
geom_density() +\
labs(x='value', y='density') +\
theme_matplotlib(rc={"figure.figsize": "5, "})

p2 = ggplot(df, aes(d2)) +\
geom_density() +\
labs(x='value', y='density') +\
theme_matplotlib(rc={"figure.figsize": "5, 3"})

print p1
print p2

``````
``````

<ggplot: (8744866464809)>

<ggplot: (8744866877765)>

``````
``````

In [78]:

ggplot(df, aes(d1,d2)) +\
geom_point() +\
labs(x='d1', y='d2')

``````
``````

Out[78]:

<ggplot: (8744866504609)>

``````
``````

In [79]:

d12 = np.convolve(d1, d2)
df12 = pd.DataFrame(d12)
df12.columns = ['d12']

p1.draw()
p2.draw()

ggplot(df12, aes(d12)) +\
geom_density() +\
labs(x='value', y='density') +\
theme_matplotlib(rc={"figure.figsize": "5, 3"})

``````
``````

Out[79]:

<ggplot: (8744866781749)>

``````

# Trying convolution of GC values with diffusion

``````

In [80]:

def diffusion(GC, frag_len):
if GC < 30:
return 44 / frag_len
elif GC > 70:
return 45 / frag_len
else:
return 44.5 / frag_len

``````
``````

In [81]:

n = 100000
gc_vals = np.random.normal(loc=50, scale=10, size=n)
frag_lens = np.random.normal(loc=10000, scale=500, size=n)
err = np.random.normal(loc=0, scale=44.5 / mean(frag_lens), size=n)

df = pd.DataFrame({'gc':gc_vals, 'frag_len':frag_lens, 'err':err})

``````
``````

In [82]:

ggplot(df, aes(x='gc', y='err')) +\
geom_point()

``````
``````

Out[82]:

<ggplot: (8744866841465)>

``````
``````

In [83]:

# convolving
gc_wDiff = np.convolve(gc_vals, err, mode='same')

data = pd.DataFrame({'x':gc_wDiff})

ggplot(data, aes(x='x')) +\
geom_density()

``````
``````

Out[83]:

<ggplot: (8744866744357)>

``````

# Tutorial: http://nbviewer.ipython.org/github/timstaley/ipython-notebooks/blob/compiled/probabilistic_programming/convolving_distributions_illustration.ipynb

``````

In [119]:

gc_vals = scipy.stats.norm(loc=50, scale=10)
errscale = 6
err = scipy.stats.norm(loc=0, scale=errscale)

#NB step-size determines precision of approximation
delta = 1e-4
big_grid = np.arange(-100,100,delta)

pmf1 = gc_vals.pdf(big_grid)*delta
pmf2 = err.pdf(big_grid)*delta

``````
``````

In [120]:

conv_pmf = scipy.signal.fftconvolve(pmf1, pmf2, 'same')

print "Grid length, sum(gauss_pmf), sum(uni_pmf),sum(conv_pmf):"
print len(big_grid), sum(err.pdf(big_grid)*delta), sum(gc_vals.pdf(big_grid)*delta), sum(conv_pmf)

conv_pmf = conv_pmf/sum(conv_pmf)

``````
``````

Grid length, sum(gauss_pmf), sum(uni_pmf),sum(conv_pmf):
2000000 0.999999999967 0.999999713308 0.999990852849

``````
``````

In [121]:

matplotlib.rcParams['figure.figsize'] = [8,5]
plt.plot(big_grid,pmf1, label='GC values')
plt.plot(big_grid,pmf2, label='Gaussian error')
plt.plot(big_grid,conv_pmf, label='Sum')
plt.xlim(-3,max(big_grid))
plt.legend(loc='best'), plt.suptitle('PMFs')

``````
``````

Out[121]:

(<matplotlib.legend.Legend at 0x7f41287c9d90>,
<matplotlib.text.Text at 0x7f41287defd0>)

``````
``````

In [122]:

matplotlib.rcParams['figure.figsize'] = [8,5]
plt.plot(big_grid,pmf1, label='GC values')
#plt.plot(big_grid,pmf2, label='Gaussian error')
plt.plot(big_grid,conv_pmf, label='Sum')
plt.xlim(-3,max(big_grid))
plt.legend(loc='best'), plt.suptitle('PMFs')

``````
``````

Out[122]:

(<matplotlib.legend.Legend at 0x7f4128719c50>,
<matplotlib.text.Text at 0x7f41287314d0>)

``````
``````

In [ ]:

gc_vals = scipy.stats.norm(loc=50, scale=10)
errscale = 6
err = scipy.stats.norm(loc=0, scale=errscale)

``````
``````

In [ ]:

``````
``````

In [106]:

import scipy.stats as stats

``````
``````

In [110]:

simple = stats.uniform(loc=50,scale=3)
errscale = 0.25
err = stats.norm(loc=0,scale=errscale)

# NB Kernel support array **MUST** be symmetric about centre of the kernel (error PDF) for this to work right.
# Support also needs to extend about any significant areas of the component PDFs.
# Here, we just define one massive support for both input PDF, and error PDF (kernel)
# But we can do much better (see later)

#NB step-size determines precision of approximation
delta = 1e-4
big_grid = np.arange(-100,100,delta)

# Cannot analytically convolve continuous PDFs, in general.
# So we now make a probability mass function on a fine grid
# - a discrete approximation to the PDF, amenable to FFT...
pmf1 = simple.pdf(big_grid)*delta
pmf2 = err.pdf(big_grid)*delta
conv_pmf = scipy.signal.fftconvolve(pmf1,pmf2,'same') # Convolved probability mass function
print "Grid length, sum(gauss_pmf), sum(uni_pmf),sum(conv_pmf):"
print len(big_grid), sum(err.pdf(big_grid)*delta), sum(simple.pdf(big_grid)*delta), sum(conv_pmf)
conv_pmf = conv_pmf/sum(conv_pmf)

plt.plot(big_grid,pmf1, label='Tophat')
plt.plot(big_grid,pmf2, label='Gaussian error')
plt.plot(big_grid,conv_pmf, label='Sum')
plt.xlim(-3,max(big_grid))
plt.legend(loc='best'), plt.suptitle('PMFs')

``````
``````

Grid length, sum(gauss_pmf), sum(uni_pmf),sum(conv_pmf):
2000000 0.999999999967 1.0 0.999999999967

Out[110]:

(<matplotlib.legend.Legend at 0x7f4128db89d0>,
<matplotlib.text.Text at 0x7f4128d17710>)

``````
``````

In [ ]:

``````