Description:

  • Determine which is faster:
    • creating a new distribution for each point in order to blur it
    • initializing one distribution per bin of values up front and selecting the right bin for each point via an interval tree (a rough sketch of this second approach follows below)
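Only the first approach (a fresh distribution per point) is implemented in the cells below. For reference, here is a minimal sketch of the second approach under stated assumptions: it pre-computes one Gaussian scale per fragment-length bin and, because the bins form a sorted non-overlapping partition, uses np.searchsorted as a stand-in for a full interval tree. The helper names (make_binned_samplers, blur_binned) and the bin width are illustrative, not part of the notebook.

import numpy as np

def make_binned_samplers(frag_min, frag_max, bin_width):
    # one Gaussian scale per fragment-length bin (hypothetical helper)
    edges = np.arange(frag_min, frag_max + bin_width, bin_width)
    centers = (edges[:-1] + edges[1:]) / 2.0
    scales = 44500.0 / centers  # same spread rule as gc_wDiff below
    return edges, scales

def blur_binned(gc_vals, frag_lens, edges, scales):
    # binary search over the sorted bin edges plays the role of the interval tree lookup
    idx = np.searchsorted(edges, frag_lens, side='right') - 1
    idx = np.clip(idx, 0, len(scales) - 1)
    return gc_vals + np.random.normal(loc=0.0, scale=scales[idx])

# usage sketch (bin width is an arbitrary choice):
# edges, scales = make_binned_samplers(frag_lens.min(), frag_lens.max(), bin_width=100)
# gc_binned = blur_binned(gc_vals, frag_lens, edges, scales)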

In [30]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['xlim', 'size', 'ylim']
`%matplotlib` prevents importing * from pylab and numpy

In [31]:
from ggplot import *
import numpy as np
import pandas as pd
import scipy.stats  # scipy.stats.cauchy is used below; importing scipy alone may not load the stats submodule

Data set


In [164]:
size = 100000
# simulated GC values and fragment lengths for `size` fragments
gc_vals = np.random.normal(loc=50, scale=10, size=size)
frag_lens = np.random.normal(loc=10000, scale=500, size=size)

Creating a distribution for each point


In [165]:
def gc_wDiff(gc, frag_len):
    # blur a single GC value: the Gaussian spread narrows as the fragment gets longer
    return gc + np.random.normal(loc=0, scale=44500.0/frag_len)


# one new distribution (and one draw) per point, i.e. the first approach from the description
gc_wDiff_vals = [gc_wDiff(gc, frag_len) for gc, frag_len in zip(gc_vals, frag_lens)]
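The list comprehension above builds one normal draw per point in pure Python. A vectorized sketch of the same blur is shown below; it relies on np.random.normal broadcasting loc and scale element-wise, so it samples from the same per-point distributions (with different random draws). The variable name with the _vec suffix is just illustrative.

# vectorized form of the per-point blur (assumed equivalent via broadcasting)
gc_wDiff_vals_vec = np.random.normal(loc=gc_vals, scale=44500.0 / frag_lens)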

Adding an error term drawn from a Cauchy distribution


In [166]:
errScale = 0.001

# add heavy-tailed noise drawn from a Cauchy distribution
gc_wDiff_wNoise_vals = np.array(gc_wDiff_vals) + \
    scipy.stats.cauchy(loc=0, scale=errScale).rvs(len(gc_wDiff_vals))

In [167]:
data = pd.DataFrame({'gc': gc_vals, 'gc_wDiff': gc_wDiff_vals, 'gc_wDiff_wNoise': gc_wDiff_wNoise_vals})

mpl.rcParams['figure.figsize'] = [0.5, 1]

p1 = ggplot(data, aes()) +\
    geom_density(aes(x='gc', color='blue'))

p2 = ggplot(data, aes()) +\
    geom_density(aes(x='gc_wDiff', color='red'))

p3 = ggplot(data, aes()) +\
    geom_density(aes(x='gc_wDiff_wNoise', color='green'))

print p1
print p2
print p3

# alternative: overlay all three densities in a single plot
#    geom_density(aes(x='gc_wDiff', color='red'), ) +\
#    geom_density(aes(x='gc_wDiff_wNoise', color='green'), )
#    theme_matplotlib(rc={"figure.figsize": "5, 3"})


<ggplot: (8739959097945)>
<ggplot: (8739964223133)>
<ggplot: (8739959098045)>

In [168]:
print np.std(gc_vals)
print np.std(gc_wDiff_vals)
print np.std(gc_wDiff_wNoise_vals)


10.0210860831
10.9510003974
10.9829727914
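One caveat on these numbers: the Cauchy distribution has no finite variance, so np.std of the noisy values is driven by occasional extreme draws and can fluctuate between runs (with errScale = 0.001 the effect is small here). The interquartile range below is a more stable, rank-based cross-check; it is only a sketch, not part of the original analysis.

# the IQR is insensitive to the heavy Cauchy tails
for name, vals in [('gc', gc_vals),
                   ('gc_wDiff', gc_wDiff_vals),
                   ('gc_wDiff_wNoise', gc_wDiff_wNoise_vals)]:
    q75, q25 = np.percentile(vals, [75, 25])
    print name, q75 - q25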

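The stated goal is a speed comparison, but nothing is timed in this section. Below is a sketch of how the two approaches could be compared with the standard timeit module; it assumes gc_wDiff and the arrays defined above, plus the hypothetical make_binned_samplers / blur_binned helpers sketched under the description, and the bin width and repeat count are arbitrary choices.

import timeit

# set-up for the binned approach happens once, outside the timed calls
edges, scales = make_binned_samplers(frag_lens.min(), frag_lens.max(), bin_width=100)

def run_per_point():
    return [gc_wDiff(gc, frag_len) for gc, frag_len in zip(gc_vals, frag_lens)]

def run_binned():
    return blur_binned(gc_vals, frag_lens, edges, scales)

print 'per-point:', timeit.timeit(run_per_point, number=3)
print 'binned:   ', timeit.timeit(run_binned, number=3)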