Window utilities: performance and memory profiling

Setup


In [1]:
import itertools
import numpy as np
import sys
import scipy.stats
import matplotlib.pyplot as plt
%matplotlib inline
sys.path.insert(0, '../..')
%load_ext memory_profiler
%load_ext autoreload
%autoreload 1
%aimport allel.model
%aimport allel.stats

In [2]:
pos = np.unique(np.random.randint(0, 100000000, size=10000000))
pos.size, pos.dtype


Out[2]:
(9516285, dtype('int64'))

In [3]:
values = np.random.randint(0, 60, size=pos.size)
values


Out[3]:
array([30,  8,  8, ...,  7,  0, 39])

Moving statistic


In [4]:
%timeit allel.stats.moving_statistic(values, statistic=np.sum, size=10000)


100 loops, best of 3: 12.9 ms per loop

In [5]:
%timeit scipy.stats.binned_statistic(np.arange(values.size), values, statistic='sum', bins=np.arange(0, values.size, 10000))


1 loops, best of 3: 3.2 s per loop

In [6]:
%memit allel.stats.moving_statistic(values, statistic=np.sum, size=10000)


peak memory: 247.72 MiB, increment: 0.03 MiB

In [7]:
%memit scipy.stats.binned_statistic(np.arange(values.size), values, statistic='sum', bins=np.arange(0, values.size, 10000))


peak memory: 537.11 MiB, increment: 289.14 MiB

In [8]:
y1 = allel.stats.moving_statistic(values, statistic=np.sum, size=10000)

In [9]:
y2, _, _ = scipy.stats.binned_statistic(np.arange(values.size), values, statistic='sum', bins=np.arange(0, values.size+10000, 10000))

In [10]:
np.array_equal(y1, y2)


Out[10]:
True

In [11]:
x = allel.stats.moving_statistic(pos, statistic=lambda b: (b[0] + b[-1])/2, size=10000)

In [12]:
plt.plot(x, y1);


Windowed statistic


In [4]:
h1, w, c = allel.stats.windowed_statistic(pos, values, np.sum, 10000, start=0, stop=10000000, step=10000)

In [5]:
h2, b, _ = scipy.stats.binned_statistic(pos, values, statistic='sum', bins=np.arange(0, 10000001, 10000))

In [15]:
h1.shape


Out[15]:
(1000,)

In [16]:
h2.shape


Out[16]:
(1000,)

In [17]:
np.array_equal(h1, h2)


Out[17]:
True

In [18]:
b


Out[18]:
array([        0.,     10000.,     20000., ...,   9980000.,   9990000.,
        10000000.])

In [19]:
w


Out[19]:
array([[       0,     9999],
       [   10000,    19999],
       [   20000,    29999],
       ..., 
       [ 9970000,  9979999],
       [ 9980000,  9989999],
       [ 9990000, 10000000]])

In [6]:
%timeit allel.stats.windowed_statistic(pos, values, np.sum, 100000, start=0, stop=100000000)


10 loops, best of 3: 37.2 ms per loop

In [7]:
%timeit scipy.stats.binned_statistic(pos, values, statistic='sum', bins=np.arange(0, 100000000, 100000))


1 loops, best of 3: 3.46 s per loop

In [8]:
%memit allel.stats.windowed_statistic(pos, values, np.sum, 100000, start=0, stop=100000000)


peak memory: 320.50 MiB, increment: 0.02 MiB

In [9]:
%memit scipy.stats.binned_statistic(pos, values, statistic='sum', bins=np.arange(0, 100000000, 100000))


peak memory: 574.59 MiB, increment: 253.83 MiB

Windowed statistic with accessibility


In [10]:
is_accessible = np.random.randint(0, 2, size=pos.max()).astype('b1')
is_accessible


Out[10]:
array([False, False,  True, ...,  True,  True,  True], dtype=bool)

In [11]:
h1, w, c = allel.stats.windowed_statistic(pos, values, np.sum, 10000, start=0, stop=10000000, step=10000)

In [16]:
%timeit allel.stats.per_base(h1, w, is_accessible=is_accessible)


100 loops, best of 3: 3.91 ms per loop

In [ ]: