In [1]:
import numpy as np
np.random.seed(42)
import sys
import cProfile
import humanize
def binarysize(n):
    return humanize.naturalsize(n, binary=True)
import matplotlib.pyplot as plt
%matplotlib inline
sys.path.insert(0, '../..')
%reload_ext memory_profiler
%reload_ext autoreload
%autoreload 1
%aimport allel
%aimport allel.model
%aimport allel.bcolz
%aimport allel.stats
%aimport allel.plot
allel.__version__


Out[1]:
'0.8.0.dev4'

In [2]:
# setup an array of genotype calls
shape = n_variants, n_samples, ploidy = 5000000, 100, 2
data = np.zeros(shape, dtype='i1')
# simulate some mutations
n_alleles = n_variants * n_samples * ploidy
idx = np.random.randint(0, (n_alleles//2)-1, size=n_alleles//20)
data[:, :, 1].reshape((-1))[idx] = 1
data[:, :, 0].reshape((-1))[idx[:n_alleles//200]] = 1
idx = np.random.randint(0, (n_alleles//2)-1, size=n_alleles//20)
data[:, :, 1].reshape((-1))[idx] = 2
data[:, :, 0].reshape((-1))[idx[:n_alleles//200]] = 2
idx = np.random.randint(0, (n_alleles//2)-1, size=n_alleles//20)
data[:, :, 1].reshape((-1))[idx] = 3
data[:, :, 0].reshape((-1))[idx[:n_alleles//200]] = 3

In [3]:
g = allel.bcolz.GenotypeCArray(data, copy=False)
print(binarysize(g.nbytes))


953.7 MiB

In [4]:
ac = g.count_alleles()[:]
ac.shape


Out[4]:
(5000000, 4)

In [5]:
ac


Out[5]:
AlleleCountsArray((5000000, 4), dtype=int32)
[[161  11  15  13]
 [177   7   8   8]
 [170  10   7  13]
 ..., 
 [170   6  13  11]
 [168   9  12  11]
 [171   8  11  10]]

In [6]:
%timeit mpd = allel.stats.mean_pairwise_diversity(ac)


1 loops, best of 3: 335 ms per loop

In [7]:
import cProfile
cProfile.run('allel.stats.mean_pairwise_diversity(ac)', sort='time')


         36 function calls in 0.341 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.171    0.171    0.340    0.340 stats.py:401(mean_pairwise_diversity)
        2    0.160    0.080    0.160    0.080 {method 'reduce' of 'numpy.ufunc' objects}
        1    0.008    0.008    0.008    0.008 {built-in method where}
        1    0.001    0.001    0.341    0.341 <string>:1(<module>)
        1    0.000    0.000    0.341    0.341 {built-in method exec}
        2    0.000    0.000    0.000    0.000 numeric.py:2428(seterr)
        2    0.000    0.000    0.160    0.080 fromnumeric.py:1623(sum)
        1    0.000    0.000    0.000    0.000 contextlib.py:37(__init__)
        2    0.000    0.000    0.000    0.000 {built-in method isinstance}
        2    0.000    0.000    0.000    0.000 numeric.py:2524(geterr)
        2    0.000    0.000    0.000    0.000 util.py:11(ignore_invalid)
        1    0.000    0.000    0.000    0.000 contextlib.py:63(__exit__)
        1    0.000    0.000    0.000    0.000 {built-in method array}
        2    0.000    0.000    0.160    0.080 _methods.py:31(_sum)
        1    0.000    0.000    0.000    0.000 contextlib.py:124(helper)
        1    0.000    0.000    0.000    0.000 util.py:20(asarray_ndim)
        2    0.000    0.000    0.000    0.000 {built-in method seterrobj}
        1    0.000    0.000    0.000    0.000 numeric.py:394(asarray)
        1    0.000    0.000    0.000    0.000 contextlib.py:57(__enter__)
        4    0.000    0.000    0.000    0.000 {built-in method geterrobj}
        2    0.000    0.000    0.000    0.000 {built-in method next}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
        1    0.000    0.000    0.000    0.000 {built-in method getattr}
        1    0.000    0.000    0.000    0.000 {method 'pop' of 'dict' objects}



In [8]:
pos = np.unique(np.random.randint(1, 100000000, size=n_variants*2))[:n_variants]
pos.shape, pos.min(), pos.max(), pos.dtype


Out[8]:
((5000000,), 3, 52528553, dtype('int64'))

In [11]:
pi, windows, n_bases, counts = allel.stats.windowed_diversity(pos, ac, size=20000, start=1)

In [12]:
plt.hist(pi, bins=20);



In [9]:
%timeit pi, windows, n_bases, counts = allel.stats.windowed_diversity(pos, ac, size=20000, start=1)


1 loops, best of 3: 396 ms per loop

In [16]:
cProfile.run('allel.stats.windowed_diversity(pos, ac, size=20000, start=1)', sort='time')


         31639 function calls in 0.453 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
     2633    0.204    0.000    0.204    0.000 {method 'reduce' of 'numpy.ufunc' objects}
        1    0.175    0.175    0.371    0.371 stats.py:401(mean_pairwise_diversity)
        4    0.032    0.008    0.035    0.009 model.py:2049(_check_input_data)
     5254    0.010    0.000    0.010    0.000 {method 'searchsorted' of 'numpy.ndarray' objects}
        2    0.008    0.004    0.008    0.004 {built-in method where}
        1    0.007    0.007    0.054    0.054 stats.py:219(windowed_statistic)
     2627    0.005    0.000    0.017    0.000 model.py:2280(locate_range)
     2629    0.003    0.000    0.206    0.000 fromnumeric.py:1623(sum)
     5254    0.002    0.000    0.012    0.000 fromnumeric.py:955(searchsorted)
       11    0.002    0.000    0.002    0.000 {built-in method array}
        1    0.001    0.001    0.001    0.001 stats.py:90(_make_position_windows)
        1    0.001    0.001    0.453    0.453 stats.py:741(windowed_diversity)
     2631    0.001    0.000    0.001    0.000 {built-in method isinstance}
     2629    0.001    0.000    0.203    0.000 _methods.py:31(_sum)
     7881    0.001    0.000    0.001    0.000 {method 'append' of 'list' objects}
        1    0.000    0.000    0.453    0.453 <string>:1(<module>)
        1    0.000    0.000    0.453    0.453 {built-in method exec}
        4    0.000    0.000    0.003    0.001 fromnumeric.py:1764(any)
        1    0.000    0.000    0.000    0.000 stats.py:346(per_base)
        2    0.000    0.000    0.035    0.018 model.py:2060(__new__)
        5    0.000    0.000    0.000    0.000 numeric.py:464(asanyarray)
        2    0.000    0.000    0.000    0.000 util.py:20(asarray_ndim)
        2    0.000    0.000    0.015    0.008 {method 'view' of 'numpy.ndarray' objects}
        1    0.000    0.000    0.000    0.000 function_base.py:1055(diff)
        2    0.000    0.000    0.000    0.000 contextlib.py:37(__init__)
        4    0.000    0.000    0.003    0.001 {method 'any' of 'numpy.ndarray' objects}
        4    0.000    0.000    0.000    0.000 numeric.py:2428(seterr)
        4    0.000    0.000    0.000    0.000 numeric.py:2524(geterr)
        2    0.000    0.000    0.015    0.008 model.py:2067(__array_finalize__)
        4    0.000    0.000    0.003    0.001 _methods.py:37(_any)
        4    0.000    0.000    0.000    0.000 util.py:11(ignore_invalid)
        2    0.000    0.000    0.000    0.000 contextlib.py:124(helper)
        2    0.000    0.000    0.000    0.000 contextlib.py:63(__exit__)
        4    0.000    0.000    0.002    0.000 numeric.py:394(asarray)
        1    0.000    0.000    0.000    0.000 model.py:2094(__getitem__)
        4    0.000    0.000    0.000    0.000 {built-in method seterrobj}
        4    0.000    0.000    0.000    0.000 {built-in method next}
        8    0.000    0.000    0.000    0.000 {built-in method geterrobj}
        2    0.000    0.000    0.000    0.000 contextlib.py:57(__enter__)
        1    0.000    0.000    0.000    0.000 {method 'reshape' of 'numpy.ndarray' objects}
        2    0.000    0.000    0.000    0.000 {built-in method getattr}
        1    0.000    0.000    0.000    0.000 {built-in method hasattr}
        3    0.000    0.000    0.000    0.000 {built-in method len}
        2    0.000    0.000    0.000    0.000 {method 'pop' of 'dict' objects}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}



In [ ]: