In [1]:
import numpy as np
np.random.seed(42)
import sys
import cProfile
import h5py
sys.path.insert(0, '../..')
%reload_ext memory_profiler
%reload_ext autoreload
%autoreload 1
import allel; print(allel.__version__)
%aimport allel.stats.selection


0.21.0.dev0

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt

In [3]:
callset = h5py.File('/data/coluzzi/ag1000g/data/phase1/release/AR3/haplotypes/main/hdf5/ag1000g.phase1.ar3.haplotypes.3R.h5',
                    mode='r')
callset


Out[3]:
<HDF5 file "ag1000g.phase1.ar3.haplotypes.3R.h5" (mode r)>

In [4]:
n_variants = 1000000
n_samples = 50

In [5]:
h1 = allel.GenotypeArray(callset['3R']['calldata/genotype'][:n_variants, :n_samples]).to_haplotypes()
h1


Out[5]:
HaplotypeArray((1000000, 100), dtype=int8)
0 1 2 3 4 ... 95 96 97 98 99
0 0 0 0 0 0 ... 0 0 0 0 0
1 0 0 0 0 0 ... 0 0 0 0 0
2 0 0 0 0 0 ... 0 0 0 0 0
3 0 0 0 0 0 ... 0 0 0 0 0
4 0 0 0 0 0 ... 0 0 0 0 0

...


In [6]:
h2 = allel.GenotypeArray(callset['3R']['calldata/genotype'][:n_variants, -n_samples:]).to_haplotypes()
h2


Out[6]:
HaplotypeArray((1000000, 100), dtype=int8)
0 1 2 3 4 ... 95 96 97 98 99
0 0 0 0 0 0 ... 0 0 0 0 0
1 0 0 0 0 0 ... 0 0 0 0 0
2 0 0 0 0 0 ... 0 0 0 0 0
3 0 0 0 0 0 ... 0 0 0 0 0
4 0 0 0 0 0 ... 0 0 0 0 0

...


In [7]:
pos = callset['3R']['variants/POS'][:n_variants]
pos


Out[7]:
array([   1252,    1262,    1271, ..., 3808745, 3808748, 3808749], dtype=int32)

In [8]:
ac1 = h1.count_alleles(max_allele=1)
ac2 = h2.count_alleles(max_allele=1)
ac = allel.AlleleCountsArray(ac1 + ac2)
is_seg = ac.is_segregating() & (ac.min(axis=1) > 1)
h1_seg = h1.compress(is_seg, axis=0)
h2_seg = h2.compress(is_seg, axis=0)
pos_seg = pos.compress(is_seg)
ac_seg = ac.compress(is_seg, axis=0)
ac1_seg = ac1.compress(is_seg, axis=0)
ac2_seg = ac2.compress(is_seg, axis=0)
np.count_nonzero(is_seg)


Out[8]:
206253

In [9]:
%%time
score = allel.stats.xpnsl(h1_seg, h2_seg, use_threads=False)


CPU times: user 5.57 s, sys: 4 ms, total: 5.58 s
Wall time: 5.57 s

In [10]:
%%time
score_threaded = allel.stats.xpnsl(h1_seg, h2_seg, use_threads=True)


CPU times: user 6.19 s, sys: 8 ms, total: 6.2 s
Wall time: 1.61 s

In [11]:
score


Out[11]:
array([-0.96187103, -0.9148735 , -0.94841577, ..., -0.16637026,
       -0.17471978, -0.1129918 ])

In [12]:
score_threaded


Out[12]:
array([-0.96187103, -0.9148735 , -0.94841577, ..., -0.16637026,
       -0.17471978, -0.1129918 ])

In [14]:
np.count_nonzero(np.isnan(score)), np.count_nonzero(~np.isnan(score))


Out[14]:
(0, 206253)

In [15]:
np.count_nonzero(np.isinf(score)), np.count_nonzero(~np.isinf(score))


Out[15]:
(0, 206253)

In [16]:
plt.figure(figsize=(16, 4))
plt.plot(pos_seg, score, linestyle=' ', marker='o', mfc='none')
plt.grid(axis='y')
plt.xlabel('position (bp)')
plt.ylabel('unstandardised XPNSL score');



In [17]:
x = ac1_seg[:, 1]
y = ac2_seg[:, 1]
C = score
plt.figure(figsize=(10, 8))
plt.hexbin(x, y, C, gridsize=20)
plt.colorbar();



In [18]:
import scipy as sp
import scipy.stats

In [19]:
bins = np.arange(0, n_samples*2 + 1, 10)
mean_by_aac, _, _, _ = sp.stats.binned_statistic_2d(ac1_seg[:, 1], ac2_seg[:, 1], score, 
                                                                  statistic=np.nanmean, bins=bins)
std_by_aac, _, _, _ = sp.stats.binned_statistic_2d(ac1_seg[:, 1], ac2_seg[:, 1], score, 
                                                                statistic=np.nanstd, bins=bins)

In [20]:
hst, _, _ = np.histogram2d(ac1_seg[:, 1], ac2_seg[:, 1], bins=bins)

In [21]:
hst[-1, -1]


Out[21]:
3178.0

In [22]:
hst[0, -1]


Out[22]:
0.0

In [23]:
mean_by_aac[2, -1]


Out[23]:
nan

In [24]:
bins


Out[24]:
array([  0,  10,  20,  30,  40,  50,  60,  70,  80,  90, 100])

In [25]:
mean_by_aac.shape


Out[25]:
(10, 10)

In [26]:
mean_by_aac.shape, std_by_aac.shape


Out[26]:
((10, 10), (10, 10))

In [27]:
plt.imshow(mean_by_aac.T, interpolation='none');



In [28]:
plt.imshow(std_by_aac.T, interpolation='none');



In [29]:
score_centred = np.zeros_like(score)
score_normed = np.zeros_like(score)

In [30]:
bins


Out[30]:
array([  0,  10,  20,  30,  40,  50,  60,  70,  80,  90, 100])

In [31]:
score.shape


Out[31]:
(206253,)

In [32]:
score_centred.shape


Out[32]:
(206253,)

In [33]:
x = ac1_seg[:, 1]
y = ac2_seg[:, 1]
for i in range(len(bins) - 1):
    for j in range(len(bins) - 1):
        x1 = bins[i]
        x2 = bins[i + 1]
        y1 = bins[j]
        y2 = bins[j + 1]
        if i+1 == len(bins) - 1:
            locx = (x >= x1) & (x <= x2)
        else:
            locx = (x >= x1) & (x < x2)
        if j+1 == len(bins) - 1:
            locy = (y >= y1) & (y <= y2)
        else:
            locy = (y >= y1) & (y < y2)
        loc = locx & locy
        if np.count_nonzero(loc):
            m = mean_by_aac[i, j]
            s = std_by_aac[i, j]
            print(i, j, x1, x2, y1, y2, np.count_nonzero(loc), m, s)
            score_centred[loc] = score[loc] - m
            score_normed[loc] = score_centred[loc] / s


0 0 0 10 0 10 152560 -0.710867465073 0.309673028295
0 1 0 10 10 20 7120 -0.718605130945 0.329675768725
0 2 0 10 20 30 450 -0.751805457778 0.339006496755
0 3 0 10 30 40 40 -0.556818136318 0.328259728803
1 0 10 20 0 10 5915 -0.893925281745 0.311762662722
1 1 10 20 10 20 7955 -0.819150275846 0.331287171359
1 2 10 20 20 30 2554 -0.802518189524 0.346691492093
1 3 10 20 30 40 463 -0.832737127644 0.369577475782
1 4 10 20 40 50 70 -0.795534601552 0.317084923747
1 5 10 20 50 60 7 -0.432224377379 0.422484076502
2 0 20 30 0 10 433 -1.01468579804 0.297975955611
2 1 20 30 10 20 2212 -0.952383018572 0.335154571504
2 2 20 30 20 30 3018 -0.913216743257 0.342688320713
2 3 20 30 30 40 1408 -0.903787166357 0.362255658769
2 4 20 30 40 50 361 -0.885548818357 0.33957084487
2 5 20 30 50 60 97 -0.90357687258 0.302364742867
2 6 20 30 60 70 23 -0.665908031357 0.419681110405
3 0 30 40 0 10 83 -0.898723894104 0.285920054533
3 1 30 40 10 20 333 -1.04309992133 0.309917232448
3 2 30 40 20 30 1193 -1.03613495902 0.35137067762
3 3 30 40 30 40 1634 -0.976245650263 0.366557562429
3 4 30 40 40 50 939 -0.976881331272 0.360776706043
3 5 30 40 50 60 332 -0.975053968664 0.333843660119
3 6 30 40 60 70 135 -0.926525925044 0.275929934081
3 7 30 40 70 80 57 -0.705999285599 0.289989023543
3 8 30 40 80 90 9 -0.973205451315 0.284998138722
4 0 40 50 0 10 19 -1.05238159649 0.235162943342
4 1 40 50 10 20 106 -0.982799817876 0.294722468929
4 2 40 50 20 30 221 -1.07924928695 0.324960696482
4 3 40 50 30 40 864 -1.02883016677 0.359982936579
4 4 40 50 40 50 1124 -1.01696120897 0.360560736201
4 5 40 50 50 60 701 -1.02056039386 0.360275751447
4 6 40 50 60 70 287 -1.04801000631 0.337496492225
4 7 40 50 70 80 122 -0.959166491259 0.2942767879
4 8 40 50 80 90 497 -0.714105837485 0.197841416638
4 9 40 50 90 100 10 -0.74402060689 0.188217234668
5 0 50 60 0 10 1 -1.18573404732 0.0
5 1 50 60 10 20 11 -0.962431946376 0.274885433387
5 2 50 60 20 30 52 -1.00690107469 0.373406633994
5 3 50 60 30 40 166 -1.03874030583 0.390460217259
5 4 50 60 40 50 578 -1.00838279659 0.359895060511
5 5 50 60 50 60 868 -1.03105812679 0.36047816259
5 6 50 60 60 70 574 -1.04005790969 0.344107546182
5 7 50 60 70 80 206 -1.16715052431 0.27465744449
5 8 50 60 80 90 103 -0.963943708754 0.271724118175
5 9 50 60 90 100 49 -0.805162805938 0.285941553397
6 2 60 70 20 30 3 -1.30386443524 0.149416489883
6 3 60 70 30 40 31 -0.990790856525 0.37213718851
6 4 60 70 40 50 133 -1.01533146369 0.367157117607
6 5 60 70 50 60 463 -0.948716122729 0.359271104271
6 6 60 70 60 70 834 -0.982660924454 0.363385289506
6 7 60 70 70 80 534 -1.05978157156 0.336189370652
6 8 60 70 80 90 211 -1.09904187094 0.272476608064
6 9 60 70 90 100 29 -1.16322610714 0.28660640748
7 3 70 80 30 40 5 -1.0025968605 0.322561527901
7 4 70 80 40 50 10 -0.804397458244 0.269435332346
7 5 70 80 50 60 97 -0.821706442763 0.36950287265
7 6 70 80 60 70 469 -0.869566428485 0.390126689866
7 7 70 80 70 80 861 -0.916462575221 0.337048911301
7 8 70 80 80 90 612 -1.00604331871 0.321837655679
7 9 70 80 90 100 171 -0.998510974189 0.274076394633
8 5 80 90 50 60 17 -0.689066766286 0.353723454686
8 6 80 90 60 70 65 -0.834663936187 0.389714566051
8 7 80 90 70 80 440 -0.803758767674 0.347116924352
8 8 80 90 80 90 971 -0.842726587334 0.347553837965
8 9 80 90 90 100 698 -0.918872101634 0.305591539584
9 7 90 100 70 80 41 -0.580702161694 0.362436801262
9 8 90 100 80 90 420 -0.691039752412 0.344122598996
9 9 90 100 90 100 3178 -0.758847071106 0.310946998388

In [34]:
np.count_nonzero(np.isnan(score_centred))


Out[34]:
0

In [35]:
np.count_nonzero(np.isnan(score_normed))


Out[35]:
1

In [36]:
plt.hist(score[~np.isnan(score)], bins=20);



In [37]:
plt.hist(score_centred[~np.isnan(score_centred)], bins=20);



In [38]:
plt.hist(score_normed[~np.isnan(score_normed)], bins=20);



In [39]:
x = ac1_seg[:, 1]
y = ac2_seg[:, 1]
C = score_centred
plt.figure(figsize=(10, 8))
plt.hexbin(x, y, C, gridsize=20, vmin=-2, vmax=2)
plt.colorbar();



In [40]:
x = ac1_seg[:, 1]
y = ac2_seg[:, 1]
C = score_normed
plt.figure(figsize=(10, 8))
plt.hexbin(x, y, C, gridsize=20, vmin=-2, vmax=2)
plt.colorbar();



In [41]:
plt.figure(figsize=(16, 4))
plt.plot(pos_seg, score_centred, linestyle=' ', marker='o', mfc='none')
plt.grid(axis='y')
plt.xlabel('position (bp)')
plt.ylabel('centred XPNSL score')
plt.ylim(-2, 2);



In [42]:
plt.figure(figsize=(16, 4))
plt.plot(pos_seg, score_normed, linestyle=' ', marker='o', mfc='none')
plt.grid(axis='y')
plt.xlabel('position (bp)')
plt.ylabel('standardized XPNSL score');



In [43]:
plt.figure(figsize=(16, 4))
plt.plot(pos_seg, score, linestyle=' ', marker='o', mfc='none')
plt.grid(axis='y')
plt.xlabel('position (bp)')
plt.ylabel('unstandardised XPNSL score');



In [13]:
cProfile.run('allel.stats.xpnsl(h1_seg, h2_seg, use_threads=False)', sort='time')


         122 function calls in 5.759 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        4    5.746    1.437    5.746    1.437 stats.pyx:576(nsl_scan_int8)
        1    0.012    0.012    5.759    5.759 selection.py:480(xpnsl)
        8    0.000    0.000    0.000    0.000 {built-in method numpy.core.multiarray.array}
        1    0.000    0.000    5.759    5.759 <string>:1(<module>)
        1    0.000    0.000    5.759    5.759 {built-in method builtins.exec}
       16    0.000    0.000    0.000    0.000 stringsource:341(__cinit__)
        4    0.000    0.000    0.000    0.000 stringsource:985(memoryview_fromslice)
        4    0.000    0.000    5.746    1.437 {allel.opt.stats.nsl_scan_int8}
        6    0.000    0.000    0.000    0.000 numeric.py:414(asarray)
        2    0.000    0.000    0.000    0.000 ndarray.py:1810(__getitem__)
        2    0.000    0.000    0.000    0.000 ndarray.py:1776(__new__)
        4    0.000    0.000    0.000    0.000 stringsource:962(__dealloc__)
       12    0.000    0.000    0.000    0.000 stringsource:643(memoryview_cwrapper)
        4    0.000    0.000    0.000    0.000 ndarray.py:1783(__array_finalize__)
        2    0.000    0.000    0.000    0.000 {method 'view' of 'numpy.ndarray' objects}
       12    0.000    0.000    0.000    0.000 stringsource:649(memoryview_check)
        3    0.000    0.000    0.000    0.000 {built-in method builtins.hasattr}
       16    0.000    0.000    0.000    0.000 stringsource:368(__dealloc__)
        4    0.000    0.000    0.000    0.000 {built-in method builtins.isinstance}
        4    0.000    0.000    0.000    0.000 ndarray.py:1765(_check_input_data)
        1    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:996(_handle_fromlist)
        4    0.000    0.000    0.000    0.000 stringsource:507(__getbuffer__)
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
        4    0.000    0.000    0.000    0.000 stringsource:545(__get__)
        2    0.000    0.000    0.000    0.000 {method 'setdefault' of 'dict' objects}



In [64]:
xpehh_min0 = allel.stats.xpehh(h1_seg, h2_seg, pos_seg, min_ehh=0.0, include_edges=True)

In [65]:
plt.figure(figsize=(8, 8))
plt.plot(xpehh_min0, score, marker='o', linestyle=' ', mfc='none', alpha=.5)
plt.xlabel('unstandardized XPEHH (min_ehh=0)')
plt.ylabel('unstandardized XPNSL')
plt.grid(axis='both');



In [66]:
plt.figure(figsize=(16, 4))
plt.plot(pos_seg, xpehh_min0, linestyle=' ', marker='o', mfc='none')
plt.grid(axis='y')
plt.xlabel('position (bp)')
plt.ylabel('unstandardised XPEHH score (min_ehh=0)');



In [67]:
xpehh = allel.stats.xpehh(h1_seg, h2_seg, pos_seg, min_ehh=0.05, include_edges=True)

In [68]:
plt.figure(figsize=(8, 8))
plt.plot(xpehh, score, marker='o', linestyle=' ', mfc='none', alpha=.5)
plt.xlabel('unstandardized XPEHH')
plt.ylabel('unstandardized XPNSL')
plt.grid(axis='both');



In [69]:
plt.figure(figsize=(16, 4))
plt.plot(pos_seg, xpehh, linestyle=' ', marker='o', mfc='none')
plt.grid(axis='y')
plt.xlabel('position (bp)')
plt.ylabel('unstandardised XPEHH score (min_ehh=0)');



In [ ]: