Intersect positions


In [1]:
import numpy as np
%reload_ext cythonmagic
%reload_ext memory_profiler

In [2]:
pos1 = np.random.randint(0, 10000000, size=10000000)
pos1 = np.unique(pos1)
pos1.shape


Out[2]:
(6320937,)

In [3]:
pos2 = np.random.randint(0, 10000000, size=10000000)
pos2 = np.unique(pos2)
pos2.shape


Out[3]:
(6322189,)

In [4]:
pos1


Out[4]:
array([      0,       1,       2, ..., 9999996, 9999998, 9999999])

In [5]:
pos2


Out[5]:
array([      2,       4,       6, ..., 9999994, 9999997, 9999999])

In [6]:
def intersect_positions_impl1(pos1, pos2):
    cond1 = np.in1d(pos1, pos2)
    cond2 = np.in1d(pos2, pos1)
    return cond1, cond2

In [7]:
cond1, cond2 = intersect_positions_impl1(pos1, pos2)

In [8]:
pos1[cond1]


Out[8]:
array([      2,       4,       6, ..., 9999993, 9999994, 9999999])

In [9]:
pos2[cond2]


Out[9]:
array([      2,       4,       6, ..., 9999993, 9999994, 9999999])

In [10]:
np.count_nonzero(cond1)


Out[10]:
3997238

In [11]:
%timeit intersect_positions_impl1(pos1, pos2)


1 loops, best of 3: 3.35 s per loop

In [12]:
%memit intersect_positions_impl1(pos1, pos2)


peak memory: 835.43 MiB, increment: 578.62 MiB

In [13]:
%%cython

import numpy as np
cimport numpy as np

def intersect_positions_impl2(np.ndarray[np.int64_t, ndim=1] pos1, 
                              np.ndarray[np.int64_t, ndim=1] pos2):
    cdef int i = 0
    cdef int j = 0
    cdef np.ndarray[np.uint8_t, ndim=1] cond1
    cdef np.ndarray[np.uint8_t, ndim=1] cond2
    cond1 = np.zeros((pos1.size, ), dtype=np.uint8)
    cond2 = np.zeros((pos2.size, ), dtype=np.uint8)
    cdef int m = pos1.size
    cdef int n = pos2.size
    
    while i < m and j < n:
        if pos1[i] < pos2[j]:
            # advance left
            i += 1
        elif pos1[i] > pos2[j]:
            # advance right
            j += 1
        else:
            # match
            cond1[i] = 1
            cond2[j] = 1
            # advance both
            i += 1
            j += 1
            
    return cond1.astype('b1'), cond2.astype('b1')

In [14]:
cond1, cond2 = intersect_positions_impl2(pos1, pos2)

In [15]:
pos1[cond1]


Out[15]:
array([      2,       4,       6, ..., 9999993, 9999994, 9999999])

In [16]:
pos2[cond2]


Out[16]:
array([      2,       4,       6, ..., 9999993, 9999994, 9999999])

In [17]:
np.count_nonzero(cond1)


Out[17]:
3997238

In [18]:
%timeit intersect_positions_impl2(pos1, pos2)


10 loops, best of 3: 95.5 ms per loop

In [19]:
%memit intersect_positions_impl2(pos1, pos2)


peak memory: 300.25 MiB, increment: 0.00 MiB

In [20]:
pos1.nbytes / 1.e6


Out[20]:
50.567496

Block apply


In [21]:
# dev imports
import sys
import numpy as np
%reload_ext memory_profiler
sys.path.insert(0, '../src')
%reload_ext autoreload
%autoreload 1
%aimport anhima.gt
%aimport anhima.af
%aimport anhima.util

In [22]:
genotypes = np.random.randint(0, 2, size=(100000, 1000, 2)).astype('i1')
genotypes.shape, genotypes.nbytes / 1e6


Out[22]:
((100000, 1000, 2), 200.0)

In [23]:
gn = anhima.gt.as_n_alt(genotypes)

In [24]:
gn.shape, gn.nbytes / 1e6


Out[24]:
((100000, 1000), 100.0)

In [25]:
gn2 = anhima.util.block_apply(anhima.gt.as_n_alt, genotypes, axis=0, block_size=1001)

In [26]:
gn2.shape, gn2.nbytes / 1e6


Out[26]:
((100000, 1000), 100.0)

In [27]:
np.array_equal(gn, gn2)


Out[27]:
True

In [28]:
ac = anhima.af.allele_counts(genotypes)

In [29]:
ac.shape


Out[29]:
(100000, 2)

In [30]:
ac2 = anhima.util.block_apply(anhima.af.allele_counts, genotypes, axis=0, block_size=1001)

In [31]:
ac2.shape


Out[31]:
(100000, 2)

In [32]:
np.array_equal(ac, ac2)


Out[32]:
True

In [33]:
%timeit anhima.gt.as_n_alt(genotypes)


1 loops, best of 3: 1.46 s per loop

In [34]:
%timeit anhima.util.block_apply(anhima.gt.as_n_alt, genotypes, axis=0, block_size=1001)


1 loops, best of 3: 1.47 s per loop

In [35]:
%timeit anhima.af.allele_counts(genotypes)


1 loops, best of 3: 1.06 s per loop

In [36]:
%timeit anhima.util.block_apply(anhima.af.allele_counts, genotypes, axis=0, block_size=1001)


1 loops, best of 3: 1.1 s per loop

In [37]:
%memit anhima.gt.as_n_alt(genotypes)


peak memory: 989.54 MiB, increment: 285.96 MiB

In [38]:
%memit anhima.util.block_apply(anhima.gt.as_n_alt, genotypes, axis=0, block_size=1001)


peak memory: 797.34 MiB, increment: 93.76 MiB

In [39]:
packed = anhima.gt.pack_diploid(genotypes)

In [40]:
packed2 = anhima.util.block_apply(anhima.gt.pack_diploid, genotypes, axis=0, block_size=1001)

In [41]:
np.array_equal(packed, packed2)


Out[41]:
True

In [42]:
%memit anhima.gt.pack_diploid(genotypes)


peak memory: 1371.17 MiB, increment: 476.85 MiB

In [43]:
%memit anhima.util.block_apply(anhima.gt.pack_diploid, genotypes, axis=0, block_size=1001)


peak memory: 987.47 MiB, increment: 93.15 MiB

In [44]:
packed.nbytes / 1e6


Out[44]:
100.0

In [45]:
unpacked = anhima.gt.unpack_diploid(packed)

In [46]:
unpacked2 = anhima.util.block_apply(anhima.gt.unpack_diploid, packed2, axis=0, block_size=1001)

In [47]:
np.array_equal(unpacked, unpacked2)


Out[47]:
True

In [48]:
%memit anhima.gt.unpack_diploid(packed)


peak memory: 1632.62 MiB, increment: 356.82 MiB

In [49]:
%memit anhima.util.block_apply(anhima.gt.unpack_diploid, packed2, axis=0, block_size=1001)


peak memory: 1463.05 MiB, increment: 187.25 MiB

In [ ]: