In [2]:
import numpy as np
np.random.seed(42)
import sys
import cProfile
import humanize
# import imp
# import logging
# imp.reload(logging)
# logging.basicConfig(level=logging.DEBUG)
sys.path.insert(0, '../..')
%reload_ext memory_profiler
%reload_ext autoreload
%autoreload 1
%aimport allel.model
%aimport allel.stats.fst
%aimport allel.stats
%aimport allel.plot
%aimport allel.opt.stats

In [4]:
g = [[[0, 0], [0, 0], [1, 1], [1, 1]],
     [[0, 1], [0, 1], [0, 1], [0, 1]],
     [[0, 0], [0, 0], [0, 0], [0, 0]],
     [[0, 1], [1, 2], [1, 1], [2, 2]],
     [[0, 0], [1, 1], [0, 1], [-1, -1]]]
subpops = [[0, 1], [2, 3]]
g = allel.GenotypeArray(g)
ac1 = g.count_alleles(subpop=subpops[0])
ac2 = g.count_alleles(subpop=subpops[1])

In [5]:
a, b, c = allel.stats.weir_cockerham_fst(g, subpops)
a[:, 1], b[:, 1], c[:, 1]


Out[5]:
(array([ 0.5  ,  0.   ,  0.   , -0.125, -0.375]),
 array([ 0.        , -0.25      ,  0.        ,  0.125     ,  0.41666667]),
 array([ 0.        ,  0.5       ,  0.        ,  0.25      ,  0.16666667]))

In [6]:
a[:, 1] / (a[:, 1] + b[:, 1] + c[:, 1])


Out[6]:
array([ 1. ,  0. ,  nan, -0.5, -1.8])

In [7]:
a[:, 2] / (a[:, 2] + b[:, 2] + c[:, 2])


Out[7]:
array([ nan,  nan,  nan, -0.5,  nan])

In [8]:
np.sum(a, axis=1) / (np.sum(a, axis=1) + np.sum(b, axis=1) + np.sum(c, axis=1))


Out[8]:
array([ 1. ,  0. ,  nan, -0.4, -1.8])

In [9]:
num, den = allel.stats.hudson_fst(ac1, ac2)
num, den


Out[9]:
(array([ 1.        , -0.16666667,  0.        , -0.125     , -0.33333333]),
 array([ 1.   ,  0.5  ,  0.   ,  0.625,  0.5  ]))

In [10]:
num / den


Out[10]:
array([ 1.        , -0.33333333,         nan, -0.2       , -0.66666667])

In [11]:
g2 = [[[0, 0], [0, 0], [0, 0], [0, 0], [1, 1], [1, 1], [1, 1], [1, 1]],
     [[0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1]],
     [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]],
     [[0, 1], [1, 2], [0, 1], [1, 2], [1, 1], [2, 2], [1, 1], [2, 2]],
     [[0, 0], [1, 1], [0, 0], [1, 1], [0, 1], [-1, -1], [0, 1], [-1, -1]]]
subpops2 = [[0, 1, 2, 3], [4, 5, 6, 7]]
g2 = allel.GenotypeArray(g2)
ac21 = g2.count_alleles(subpop=subpops2[0])
ac22 = g2.count_alleles(subpop=subpops2[1])

In [13]:
a, b, c = allel.stats.weir_cockerham_fst(g2, subpops2)
a[:, 1], b[:, 1], c[:, 1]


Out[13]:
(array([ 0.5       ,  0.        ,  0.        , -0.04166667, -0.09375   ]),
 array([ 0.        , -0.25      ,  0.        ,  0.04166667,  0.16666667]),
 array([ 0.        ,  0.5       ,  0.        ,  0.25      ,  0.16666667]))

In [14]:
a[:, 1] / (a[:, 1] + b[:, 1] + c[:, 1])


Out[14]:
array([ 1.        ,  0.        ,         nan, -0.16666667, -0.39130435])

In [15]:
num, den = allel.stats.hudson_fst(ac21, ac22)
num, den


Out[15]:
(array([ 1.        , -0.07142857,  0.        , -0.01785714, -0.11904762]),
 array([ 1.   ,  0.5  ,  0.   ,  0.625,  0.5  ]))

In [16]:
num / den


Out[16]:
array([ 1.        , -0.14285714,         nan, -0.02857143, -0.23809524])

In [17]:
g3 = [[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [1, 1], [1, 1], [1, 1], [1, 1]],
     [[0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1]],
     [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]],
     [[0, 1], [1, 2], [0, 1], [1, 2], [0, 1], [1, 2], [0, 1], [1, 2], [1, 1], [2, 2], [1, 1], [2, 2]],
     [[0, 0], [1, 1], [0, 0], [1, 1], [0, 0], [1, 1], [0, 0], [1, 1], [0, 1], [-1, -1], [0, 1], [-1, -1]]]
subpops3 = [list(range(8)), list(range(8, 12))]
g3 = allel.GenotypeArray(g3)
ac31 = g3.count_alleles(subpop=subpops3[0])
ac32 = g3.count_alleles(subpop=subpops3[1])

In [18]:
a, b, c = allel.stats.weir_cockerham_fst(g3, subpops3)
a[:, 1], b[:, 1], c[:, 1]


Out[18]:
(array([ 0.5     ,  0.      ,  0.      , -0.01875 , -0.078125]),
 array([ 0.        , -0.25      ,  0.        , -0.06666667,  0.2       ]),
 array([ 0.        ,  0.5       ,  0.        ,  0.33333333,  0.1       ]))

In [19]:
a[:, 1] / (a[:, 1] + b[:, 1] + c[:, 1])


Out[19]:
array([ 1.        ,  0.        ,         nan, -0.07563025, -0.35211268])

In [20]:
num, den = allel.stats.hudson_fst(ac31, ac32)
num, den


Out[20]:
(array([ 1.        , -0.05238095,  0.        ,  0.00595238, -0.1       ]),
 array([ 1.   ,  0.5  ,  0.   ,  0.625,  0.5  ]))

In [21]:
num / den


Out[21]:
array([ 1.        , -0.1047619 ,         nan,  0.00952381, -0.2       ])

In [22]:
# setup an array of genotype calls
shape = n_variants, n_samples, ploidy = 50000, 1000, 2
data = np.zeros(shape, dtype='i1')
# simulate some mutations
n_alleles = n_variants * n_samples * ploidy
idx = np.random.randint(0, (n_alleles//2)-1, size=n_alleles//20)
data[:, :, 1].reshape((-1))[idx] = 1
data[:, :, 0].reshape((-1))[idx[:n_alleles//200]] = 1

In [23]:
g = allel.GenotypeArray(data, copy=False)
g


Out[23]:
GenotypeArray((50000, 1000, 2), dtype=int8)
0 1 2 3 4 ... 995 996 997 998 999
0 0/0 0/0 0/0 0/0 1/1 ... 0/0 0/0 0/0 0/0 0/0
1 0/0 0/1 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
2 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
3 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/1 0/0
4 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0

...


In [24]:
subpops = [np.arange(0, 500), np.arange(500, 1000)]

In [25]:
a, b, c = allel.stats.weir_cockerham_fst(g, subpops)

In [26]:
fst = a[:, 0] / (a[:, 0] + b[:, 0] + c[:, 0])
fst


Out[26]:
array([ -8.83881584e-04,   8.18538485e-04,  -1.09694980e-03, ...,
        -5.14830085e-04,   4.86184077e-05,  -2.62540045e-04])

In [27]:
fst = a[:, 1] / (a[:, 1] + b[:, 1] + c[:, 1])
fst


Out[27]:
array([ -8.83881584e-04,   8.18538485e-04,  -1.09694980e-03, ...,
        -5.14830085e-04,   4.86184077e-05,  -2.62540045e-04])

In [28]:
ac1 = g.count_alleles(subpop=subpops[0])
ac2 = g.count_alleles(subpop=subpops[1])
num, den = allel.stats.hudson_fst(ac1, ac2)
num / den


Out[28]:
array([ -6.95817769e-04,   9.87003824e-04,  -9.58854478e-04, ...,
        -3.38898428e-04,   2.19173920e-04,  -8.53563906e-05])

In [29]:
%timeit allel.stats.weir_cockerham_fst(g, subpops)
%memit allel.stats.weir_cockerham_fst(g, subpops)


1 loop, best of 3: 1.18 s per loop
peak memory: 235.33 MiB, increment: 2.17 MiB

In [30]:
%timeit allel.stats.hudson_fst(ac1, ac2)
%memit allel.stats.hudson_fst(ac1, ac2)


100 loops, best of 3: 8.95 ms per loop
peak memory: 235.04 MiB, increment: 0.00 MiB

In [31]:
gc = allel.GenotypeCArray(data)
gc


Out[31]:
GenotypeCArray((50000, 1000, 2), int8) nbytes := 95.37 MB; cbytes := 18.19 MB; ratio: 5.24 cparams := cparams(clevel=5, shuffle=1, cname='lz4', quantize=0)
0 1 2 3 4 ... 995 996 997 998 999
0 0/0 0/0 0/0 0/0 1/1 ... 0/0 0/0 0/0 0/0 0/0
1 0/0 0/1 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
2 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
3 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/1 0/0
4 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0

...


In [32]:
%timeit allel.stats.weir_cockerham_fst(gc, subpops)


1 loop, best of 3: 1.56 s per loop

In [33]:
%memit allel.stats.weir_cockerham_fst(gc, subpops)


peak memory: 253.99 MiB, increment: 2.57 MiB

In [34]:
ac, bc, cc = allel.stats.weir_cockerham_fst(gc, subpops)

In [35]:
np.array_equal(a, ac)


Out[35]:
True

In [36]:
np.array_equal(b, bc)


Out[36]:
True

In [37]:
np.array_equal(c, cc)


Out[37]:
True

In [ ]: