Intersect positions



In [1]:

    
import numpy as np
%reload_ext cythonmagic
%reload_ext memory_profiler



In [2]:

    
pos1 = np.random.randint(0, 10000000, size=10000000)
pos1 = np.unique(pos1)
pos1.shape









    Out[2]:





(6320937,)



In [3]:

    
pos2 = np.random.randint(0, 10000000, size=10000000)
pos2 = np.unique(pos2)
pos2.shape









    Out[3]:





(6322189,)



In [4]:

    
pos1









    Out[4]:





array([      0,       1,       2, ..., 9999996, 9999998, 9999999])



In [5]:

    
pos2









    Out[5]:





array([      2,       4,       6, ..., 9999994, 9999997, 9999999])



In [6]:

    
def intersect_positions_impl1(pos1, pos2):
    cond1 = np.in1d(pos1, pos2)
    cond2 = np.in1d(pos2, pos1)
    return cond1, cond2



In [7]:

    
cond1, cond2 = intersect_positions_impl1(pos1, pos2)



In [8]:

    
pos1[cond1]









    Out[8]:





array([      2,       4,       6, ..., 9999993, 9999994, 9999999])



In [9]:

    
pos2[cond2]









    Out[9]:





array([      2,       4,       6, ..., 9999993, 9999994, 9999999])



In [10]:

    
np.count_nonzero(cond1)









    Out[10]:





3997238



In [11]:

    
%timeit intersect_positions_impl1(pos1, pos2)









    



1 loops, best of 3: 3.35 s per loop



In [12]:

    
%memit intersect_positions_impl1(pos1, pos2)









    



peak memory: 835.43 MiB, increment: 578.62 MiB



In [13]:

    
%%cython

import numpy as np
cimport numpy as np

def intersect_positions_impl2(np.ndarray[np.int64_t, ndim=1] pos1, 
                              np.ndarray[np.int64_t, ndim=1] pos2):
    cdef int i = 0
    cdef int j = 0
    cdef np.ndarray[np.uint8_t, ndim=1] cond1
    cdef np.ndarray[np.uint8_t, ndim=1] cond2
    cond1 = np.zeros((pos1.size, ), dtype=np.uint8)
    cond2 = np.zeros((pos2.size, ), dtype=np.uint8)
    cdef int m = pos1.size
    cdef int n = pos2.size
    
    while i < m and j < n:
        if pos1[i] < pos2[j]:
            # advance left
            i += 1
        elif pos1[i] > pos2[j]:
            # advance right
            j += 1
        else:
            # match
            cond1[i] = 1
            cond2[j] = 1
            # advance both
            i += 1
            j += 1
            
    return cond1.astype('b1'), cond2.astype('b1')



In [14]:

    
cond1, cond2 = intersect_positions_impl2(pos1, pos2)



In [15]:

    
pos1[cond1]









    Out[15]:





array([      2,       4,       6, ..., 9999993, 9999994, 9999999])



In [16]:

    
pos2[cond2]









    Out[16]:





array([      2,       4,       6, ..., 9999993, 9999994, 9999999])



In [17]:

    
np.count_nonzero(cond1)









    Out[17]:





3997238



In [18]:

    
%timeit intersect_positions_impl2(pos1, pos2)









    



10 loops, best of 3: 95.5 ms per loop



In [19]:

    
%memit intersect_positions_impl2(pos1, pos2)









    



peak memory: 300.25 MiB, increment: 0.00 MiB



In [20]:

    
pos1.nbytes / 1.e6









    Out[20]:





50.567496

Block apply



In [21]:

    
# dev imports
import sys
import numpy as np
%reload_ext memory_profiler
sys.path.insert(0, '../src')
%reload_ext autoreload
%autoreload 1
%aimport anhima.gt
%aimport anhima.af
%aimport anhima.util



In [22]:

    
genotypes = np.random.randint(0, 2, size=(100000, 1000, 2)).astype('i1')
genotypes.shape, genotypes.nbytes / 1e6









    Out[22]:





((100000, 1000, 2), 200.0)



In [23]:

    
gn = anhima.gt.as_n_alt(genotypes)



In [24]:

    
gn.shape, gn.nbytes / 1e6









    Out[24]:





((100000, 1000), 100.0)



In [25]:

    
gn2 = anhima.util.block_apply(anhima.gt.as_n_alt, genotypes, axis=0, block_size=1001)



In [26]:

    
gn2.shape, gn2.nbytes / 1e6









    Out[26]:





((100000, 1000), 100.0)



In [27]:

    
np.array_equal(gn, gn2)









    Out[27]:





True



In [28]:

    
ac = anhima.af.allele_counts(genotypes)



In [29]:

    
ac.shape









    Out[29]:





(100000, 2)



In [30]:

    
ac2 = anhima.util.block_apply(anhima.af.allele_counts, genotypes, axis=0, block_size=1001)



In [31]:

    
ac2.shape









    Out[31]:





(100000, 2)



In [32]:

    
np.array_equal(ac, ac2)









    Out[32]:





True



In [33]:

    
%timeit anhima.gt.as_n_alt(genotypes)









    



1 loops, best of 3: 1.46 s per loop



In [34]:

    
%timeit anhima.util.block_apply(anhima.gt.as_n_alt, genotypes, axis=0, block_size=1001)









    



1 loops, best of 3: 1.47 s per loop



In [35]:

    
%timeit anhima.af.allele_counts(genotypes)









    



1 loops, best of 3: 1.06 s per loop



In [36]:

    
%timeit anhima.util.block_apply(anhima.af.allele_counts, genotypes, axis=0, block_size=1001)









    



1 loops, best of 3: 1.1 s per loop



In [37]:

    
%memit anhima.gt.as_n_alt(genotypes)









    



peak memory: 989.54 MiB, increment: 285.96 MiB



In [38]:

    
%memit anhima.util.block_apply(anhima.gt.as_n_alt, genotypes, axis=0, block_size=1001)









    



peak memory: 797.34 MiB, increment: 93.76 MiB



In [39]:

    
packed = anhima.gt.pack_diploid(genotypes)



In [40]:

    
packed2 = anhima.util.block_apply(anhima.gt.pack_diploid, genotypes, axis=0, block_size=1001)



In [41]:

    
np.array_equal(packed, packed2)









    Out[41]:





True



In [42]:

    
%memit anhima.gt.pack_diploid(genotypes)









    



peak memory: 1371.17 MiB, increment: 476.85 MiB



In [43]:

    
%memit anhima.util.block_apply(anhima.gt.pack_diploid, genotypes, axis=0, block_size=1001)









    



peak memory: 987.47 MiB, increment: 93.15 MiB



In [44]:

    
packed.nbytes / 1e6









    Out[44]:





100.0



In [45]:

    
unpacked = anhima.gt.unpack_diploid(packed)



In [46]:

    
unpacked2 = anhima.util.block_apply(anhima.gt.unpack_diploid, packed2, axis=0, block_size=1001)



In [47]:

    
np.array_equal(unpacked, unpacked2)









    Out[47]:





True



In [48]:

    
%memit anhima.gt.unpack_diploid(packed)









    



peak memory: 1632.62 MiB, increment: 356.82 MiB



In [49]:

    
%memit anhima.util.block_apply(anhima.gt.unpack_diploid, packed2, axis=0, block_size=1001)









    



peak memory: 1463.05 MiB, increment: 187.25 MiB



In [ ]: