Description:

  • Playing around with convolution

In [74]:
import numpy as np
from ggplot import *
import matplotlib as plt
import pandas as pd
import scipy




In [75]:
%load_ext rpy2.ipython
%pylab inline


The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython
Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['plt', 'xlim', 'ylim']
`%matplotlib` prevents importing * from pylab and numpy

convolution of 2 normal distributions


In [76]:
n = 10000
d1 = np.random.normal(loc=50, scale=1, size=n)   
d2 = np.random.normal(loc=0, scale=0.01, size=n)  # err

df = pd.DataFrame({'d1':d1, 'd2':d2})

In [77]:
p1 = ggplot(df, aes(d1)) +\
    geom_density() +\
    labs(x='value', y='density') +\
    theme_matplotlib(rc={"figure.figsize": "5, "})

p2 = ggplot(df, aes(d2)) +\
    geom_density() +\
    labs(x='value', y='density') +\
    theme_matplotlib(rc={"figure.figsize": "5, 3"})

print p1
print p2


<ggplot: (8744866464809)>
<ggplot: (8744866877765)>

In [78]:
ggplot(df, aes(d1,d2)) +\
    geom_point() +\
    labs(x='d1', y='d2')


Out[78]:
<ggplot: (8744866504609)>

In [79]:
d12 = np.convolve(d1, d2)
df12 = pd.DataFrame(d12)
df12.columns = ['d12']

p1.draw()
p2.draw()

ggplot(df12, aes(d12)) +\
    geom_density() +\
    labs(x='value', y='density') +\
    theme_matplotlib(rc={"figure.figsize": "5, 3"})


Out[79]:
<ggplot: (8744866781749)>

Trying convolution of GC values with diffusion


In [80]:
def diffusion(GC, frag_len):
    if GC < 30:
        return 44 / frag_len
    elif GC > 70:
        return 45 / frag_len
    else:
        return 44.5 / frag_len

In [81]:
n = 100000
gc_vals = np.random.normal(loc=50, scale=10, size=n)   
frag_lens = np.random.normal(loc=10000, scale=500, size=n)   
err = np.random.normal(loc=0, scale=44.5 / mean(frag_lens), size=n)


df = pd.DataFrame({'gc':gc_vals, 'frag_len':frag_lens, 'err':err})

In [82]:
ggplot(df, aes(x='gc', y='err')) +\
    geom_point()


Out[82]:
<ggplot: (8744866841465)>

In [83]:
# convolving
gc_wDiff = np.convolve(gc_vals, err, mode='same')

data = pd.DataFrame({'x':gc_wDiff})

ggplot(data, aes(x='x')) +\
    geom_density()


Out[83]:
<ggplot: (8744866744357)>

In [119]:
gc_vals = scipy.stats.norm(loc=50, scale=10)
errscale = 6
err = scipy.stats.norm(loc=0, scale=errscale)

#NB step-size determines precision of approximation
delta = 1e-4
big_grid = np.arange(-100,100,delta)

pmf1 = gc_vals.pdf(big_grid)*delta
pmf2 = err.pdf(big_grid)*delta

In [120]:
conv_pmf = scipy.signal.fftconvolve(pmf1, pmf2, 'same')

print "Grid length, sum(gauss_pmf), sum(uni_pmf),sum(conv_pmf):"
print len(big_grid), sum(err.pdf(big_grid)*delta), sum(gc_vals.pdf(big_grid)*delta), sum(conv_pmf)

conv_pmf = conv_pmf/sum(conv_pmf)


Grid length, sum(gauss_pmf), sum(uni_pmf),sum(conv_pmf):
2000000 0.999999999967 0.999999713308 0.999990852849

In [121]:
matplotlib.rcParams['figure.figsize'] = [8,5]
plt.plot(big_grid,pmf1, label='GC values')
plt.plot(big_grid,pmf2, label='Gaussian error')
plt.plot(big_grid,conv_pmf, label='Sum')
plt.xlim(-3,max(big_grid))
plt.legend(loc='best'), plt.suptitle('PMFs')


Out[121]:
(<matplotlib.legend.Legend at 0x7f41287c9d90>,
 <matplotlib.text.Text at 0x7f41287defd0>)

In [122]:
matplotlib.rcParams['figure.figsize'] = [8,5]
plt.plot(big_grid,pmf1, label='GC values')
#plt.plot(big_grid,pmf2, label='Gaussian error')
plt.plot(big_grid,conv_pmf, label='Sum')
plt.xlim(-3,max(big_grid))
plt.legend(loc='best'), plt.suptitle('PMFs')


Out[122]:
(<matplotlib.legend.Legend at 0x7f4128719c50>,
 <matplotlib.text.Text at 0x7f41287314d0>)

In [ ]:
gc_vals = scipy.stats.norm(loc=50, scale=10)
errscale = 6
err = scipy.stats.norm(loc=0, scale=errscale)

In [ ]:


In [106]:
import scipy.stats as stats

In [110]:
simple = stats.uniform(loc=50,scale=3)
errscale = 0.25
err = stats.norm(loc=0,scale=errscale)

# NB Kernel support array **MUST** be symmetric about centre of the kernel (error PDF) for this to work right. 
# Support also needs to extend about any significant areas of the component PDFs.
# Here, we just define one massive support for both input PDF, and error PDF (kernel)
# But we can do much better (see later)

#NB step-size determines precision of approximation
delta = 1e-4
big_grid = np.arange(-100,100,delta)

# Cannot analytically convolve continuous PDFs, in general.
# So we now make a probability mass function on a fine grid 
# - a discrete approximation to the PDF, amenable to FFT...
pmf1 = simple.pdf(big_grid)*delta
pmf2 = err.pdf(big_grid)*delta
conv_pmf = scipy.signal.fftconvolve(pmf1,pmf2,'same') # Convolved probability mass function
print "Grid length, sum(gauss_pmf), sum(uni_pmf),sum(conv_pmf):"
print len(big_grid), sum(err.pdf(big_grid)*delta), sum(simple.pdf(big_grid)*delta), sum(conv_pmf)
conv_pmf = conv_pmf/sum(conv_pmf)

plt.plot(big_grid,pmf1, label='Tophat')
plt.plot(big_grid,pmf2, label='Gaussian error')
plt.plot(big_grid,conv_pmf, label='Sum')
plt.xlim(-3,max(big_grid))
plt.legend(loc='best'), plt.suptitle('PMFs')


Grid length, sum(gauss_pmf), sum(uni_pmf),sum(conv_pmf):
2000000 0.999999999967 1.0 0.999999999967
Out[110]:
(<matplotlib.legend.Legend at 0x7f4128db89d0>,
 <matplotlib.text.Text at 0x7f4128d17710>)

In [ ]: