Pairwise distance


In [1]:
import numpy as np
np.random.seed(42)
import sys
import cProfile
import humanize
import matplotlib.pyplot as plt
%matplotlib inline
sys.path.insert(0, '../..')
%reload_ext memory_profiler
%reload_ext autoreload
%autoreload 1
%aimport allel
%aimport allel.model
%aimport allel.bcolz
%aimport allel.stats
%aimport allel.plot
allel.__version__


Out[1]:
'0.8.0.dev1'

In [2]:
def binarysize(n):
    return humanize.naturalsize(n, binary=True)

In [3]:
# setup an array of genotype calls
shape = n_variants, n_samples, ploidy = 500000, 100, 2
data = np.zeros(shape, dtype='i1')
# simulate some mutations
n_alleles = n_variants * n_samples * ploidy
idx = np.random.randint(0, (n_alleles//2)-1, size=n_alleles//20)
data[:, :, 1].reshape((-1))[idx] = 1
data[:, :, 0].reshape((-1))[idx[:n_alleles//200]] = 1

In [4]:
g = allel.model.GenotypeArray(data, copy=False)
print(binarysize(g.nbytes))


95.4 MiB

In [5]:
gc = allel.bcolz.GenotypeCArray(data)
print(binarysize(gc.cbytes))


16.0 MiB

In [6]:
gn = g.to_n_alt()
print(binarysize(gn.nbytes))


47.7 MiB

In [7]:
gnc = gc.to_n_alt()
print(binarysize(gnc.cbytes))


13.0 MiB

In [8]:
d1 = allel.stats.pairwise_distance(gn, metric='cityblock')

In [9]:
plt.hist(d1);



In [10]:
allel.plot.pairwise_distance(d1);



In [11]:
d2 = allel.stats.pairwise_distance(gnc, metric='cityblock')

In [12]:
np.array_equal(d1, d2)


Out[12]:
True

In [13]:
%timeit allel.stats.pairwise_distance(gn, metric='cityblock')


1 loops, best of 3: 2.93 s per loop

In [14]:
%timeit allel.stats.pairwise_distance(gnc, metric='cityblock')


1 loops, best of 3: 2.83 s per loop

In [15]:
%memit allel.stats.pairwise_distance(gn, metric='cityblock')


peak memory: 686.46 MiB, increment: 381.48 MiB

In [16]:
%memit allel.stats.pairwise_distance(gnc, metric='cityblock')


peak memory: 305.45 MiB, increment: 0.16 MiB

Multidimensional arrays


In [13]:
import scipy.spatial

In [14]:
d1 = scipy.spatial.distance.pdist(gn[:100].T, 'cityblock')

In [15]:
d2 = allel.stats.pdist(gn[:100], 'cityblock')

In [16]:
d1


Out[16]:
array([ 19.,  18.,  16., ...,  17.,  13.,  18.])

In [17]:
d2


Out[17]:
array([19, 18, 16, ..., 17, 13, 18])

In [18]:
np.array_equal(d1, d2)


Out[18]:
True

In [19]:
%timeit scipy.spatial.distance.pdist(gn[:1000].T, 'cityblock')


100 loops, best of 3: 4.95 ms per loop

In [20]:
%timeit scipy.spatial.distance.pdist(gn[:1000].T, scipy.spatial.distance.cityblock)


10 loops, best of 3: 97.3 ms per loop

In [21]:
%timeit allel.stats.pdist(gn[:1000], scipy.spatial.distance.cityblock)


10 loops, best of 3: 132 ms per loop

In [22]:
gac = g[:1000].to_allele_counts()
gac.shape


Out[22]:
(1000, 100, 2)

In [23]:
pos = np.unique(np.random.randint(0, 1000000, size=10000))[:1000]
pos.shape, pos.min(), pos.max()


Out[23]:
((1000,), 114, 102856)

In [24]:
is_accessible = np.random.randint(0, 2, size=pos.max()).astype('b1')
is_accessible.shape, np.count_nonzero(is_accessible)


Out[24]:
((102856,), 51422)

In [54]:
dxy = lambda ac1, ac2: allel.stats.sequence_divergence(pos, ac1, ac2, is_accessible=is_accessible)

In [55]:
dxy(gac[:, 0], gac[:, 1])


Out[55]:
0.0019763230655399353

In [56]:
d3 = allel.stats.pairwise_distance(gac, dxy)

In [57]:
d3


Out[57]:
array([ 0.00197632,  0.00192765,  0.00215156, ...,  0.00211262,
        0.00198606,  0.00190817])

In [29]:
allel.plot.pairwise_distance(d3);



In [58]:
%timeit allel.stats.pairwise_distance(gac, dxy)


1 loops, best of 3: 891 ms per loop

In [59]:
d4 = allel.stats.pairwise_dxy(pos, gac, is_accessible=is_accessible)
d4


Out[59]:
array([ 0.00197632,  0.00192765,  0.00215156, ...,  0.00211262,
        0.00198606,  0.00190817])

In [62]:
np.array_equal(d3, d4)


Out[62]:
True

In [60]:
%timeit allel.stats.pairwise_dxy(pos, gac)


1 loops, best of 3: 509 ms per loop

In [61]:
%timeit allel.stats.pairwise_dxy(pos, gac, is_accessible=is_accessible)


1 loops, best of 3: 569 ms per loop

In [64]:
%memit allel.stats.pairwise_dxy(pos, gac)


peak memory: 309.01 MiB, increment: 0.00 MiB

In [63]:
%memit allel.stats.pairwise_dxy(pos, gac, is_accessible=is_accessible)


peak memory: 308.82 MiB, increment: 0.13 MiB

In [ ]: