In [1]:
import numpy as np
np.random.seed(42)
import sys
import cProfile
import h5py
sys.path.insert(0, '../..')
%reload_ext memory_profiler
%reload_ext autoreload
%autoreload 1
import allel; print(allel.__version__)
%aimport allel.stats.selection
In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
In [3]:
callset = h5py.File('/data/coluzzi/ag1000g/data/phase1/release/AR3/haplotypes/main/hdf5/ag1000g.phase1.ar3.haplotypes.3R.h5',
mode='r')
callset
Out[3]:
In [4]:
n_variants = 1000000
n_samples = 50
In [5]:
h1 = allel.GenotypeArray(callset['3R']['calldata/genotype'][:n_variants, :n_samples]).to_haplotypes()
h1
Out[5]:
In [6]:
h2 = allel.GenotypeArray(callset['3R']['calldata/genotype'][:n_variants, -n_samples:]).to_haplotypes()
h2
Out[6]:
In [7]:
pos = callset['3R']['variants/POS'][:n_variants]
pos
Out[7]:
In [8]:
ac1 = h1.count_alleles(max_allele=1)
ac2 = h2.count_alleles(max_allele=1)
ac = allel.AlleleCountsArray(ac1 + ac2)
is_seg = ac.is_segregating() & (ac.min(axis=1) > 1)
h1_seg = h1.compress(is_seg, axis=0)
h2_seg = h2.compress(is_seg, axis=0)
pos_seg = pos.compress(is_seg)
ac_seg = ac.compress(is_seg, axis=0)
ac1_seg = ac1.compress(is_seg, axis=0)
ac2_seg = ac2.compress(is_seg, axis=0)
np.count_nonzero(is_seg)
Out[8]:
In [9]:
%%time
score = allel.stats.xpnsl(h1_seg, h2_seg, use_threads=False)
In [10]:
%%time
score_threaded = allel.stats.xpnsl(h1_seg, h2_seg, use_threads=True)
In [11]:
score
Out[11]:
In [12]:
score_threaded
Out[12]:
In [14]:
np.count_nonzero(np.isnan(score)), np.count_nonzero(~np.isnan(score))
Out[14]:
In [15]:
np.count_nonzero(np.isinf(score)), np.count_nonzero(~np.isinf(score))
Out[15]:
In [16]:
plt.figure(figsize=(16, 4))
plt.plot(pos_seg, score, linestyle=' ', marker='o', mfc='none')
plt.grid(axis='y')
plt.xlabel('position (bp)')
plt.ylabel('unstandardised XPNSL score');
In [17]:
x = ac1_seg[:, 1]
y = ac2_seg[:, 1]
C = score
plt.figure(figsize=(10, 8))
plt.hexbin(x, y, C, gridsize=20)
plt.colorbar();
In [18]:
import scipy as sp
import scipy.stats
In [19]:
bins = np.arange(0, n_samples*2 + 1, 10)
mean_by_aac, _, _, _ = sp.stats.binned_statistic_2d(ac1_seg[:, 1], ac2_seg[:, 1], score,
statistic=np.nanmean, bins=bins)
std_by_aac, _, _, _ = sp.stats.binned_statistic_2d(ac1_seg[:, 1], ac2_seg[:, 1], score,
statistic=np.nanstd, bins=bins)
In [20]:
hst, _, _ = np.histogram2d(ac1_seg[:, 1], ac2_seg[:, 1], bins=bins)
In [21]:
hst[-1, -1]
Out[21]:
In [22]:
hst[0, -1]
Out[22]:
In [23]:
mean_by_aac[2, -1]
Out[23]:
In [24]:
bins
Out[24]:
In [25]:
mean_by_aac.shape
Out[25]:
In [26]:
mean_by_aac.shape, std_by_aac.shape
Out[26]:
In [27]:
plt.imshow(mean_by_aac.T, interpolation='none');
In [28]:
plt.imshow(std_by_aac.T, interpolation='none');
In [29]:
score_centred = np.zeros_like(score)
score_normed = np.zeros_like(score)
In [30]:
bins
Out[30]:
In [31]:
score.shape
Out[31]:
In [32]:
score_centred.shape
Out[32]:
In [33]:
x = ac1_seg[:, 1]
y = ac2_seg[:, 1]
for i in range(len(bins) - 1):
for j in range(len(bins) - 1):
x1 = bins[i]
x2 = bins[i + 1]
y1 = bins[j]
y2 = bins[j + 1]
if i+1 == len(bins) - 1:
locx = (x >= x1) & (x <= x2)
else:
locx = (x >= x1) & (x < x2)
if j+1 == len(bins) - 1:
locy = (y >= y1) & (y <= y2)
else:
locy = (y >= y1) & (y < y2)
loc = locx & locy
if np.count_nonzero(loc):
m = mean_by_aac[i, j]
s = std_by_aac[i, j]
print(i, j, x1, x2, y1, y2, np.count_nonzero(loc), m, s)
score_centred[loc] = score[loc] - m
score_normed[loc] = score_centred[loc] / s
In [34]:
np.count_nonzero(np.isnan(score_centred))
Out[34]:
In [35]:
np.count_nonzero(np.isnan(score_normed))
Out[35]:
In [36]:
plt.hist(score[~np.isnan(score)], bins=20);
In [37]:
plt.hist(score_centred[~np.isnan(score_centred)], bins=20);
In [38]:
plt.hist(score_normed[~np.isnan(score_normed)], bins=20);
In [39]:
x = ac1_seg[:, 1]
y = ac2_seg[:, 1]
C = score_centred
plt.figure(figsize=(10, 8))
plt.hexbin(x, y, C, gridsize=20, vmin=-2, vmax=2)
plt.colorbar();
In [40]:
x = ac1_seg[:, 1]
y = ac2_seg[:, 1]
C = score_normed
plt.figure(figsize=(10, 8))
plt.hexbin(x, y, C, gridsize=20, vmin=-2, vmax=2)
plt.colorbar();
In [41]:
plt.figure(figsize=(16, 4))
plt.plot(pos_seg, score_centred, linestyle=' ', marker='o', mfc='none')
plt.grid(axis='y')
plt.xlabel('position (bp)')
plt.ylabel('centred XPNSL score')
plt.ylim(-2, 2);
In [42]:
plt.figure(figsize=(16, 4))
plt.plot(pos_seg, score_normed, linestyle=' ', marker='o', mfc='none')
plt.grid(axis='y')
plt.xlabel('position (bp)')
plt.ylabel('standardized XPNSL score');
In [43]:
plt.figure(figsize=(16, 4))
plt.plot(pos_seg, score, linestyle=' ', marker='o', mfc='none')
plt.grid(axis='y')
plt.xlabel('position (bp)')
plt.ylabel('unstandardised XPNSL score');
In [13]:
cProfile.run('allel.stats.xpnsl(h1_seg, h2_seg, use_threads=False)', sort='time')
In [64]:
xpehh_min0 = allel.stats.xpehh(h1_seg, h2_seg, pos_seg, min_ehh=0.0, include_edges=True)
In [65]:
plt.figure(figsize=(8, 8))
plt.plot(xpehh_min0, score, marker='o', linestyle=' ', mfc='none', alpha=.5)
plt.xlabel('unstandardized XPEHH (min_ehh=0)')
plt.ylabel('unstandardized XPNSL')
plt.grid(axis='both');
In [66]:
plt.figure(figsize=(16, 4))
plt.plot(pos_seg, xpehh_min0, linestyle=' ', marker='o', mfc='none')
plt.grid(axis='y')
plt.xlabel('position (bp)')
plt.ylabel('unstandardised XPEHH score (min_ehh=0)');
In [67]:
xpehh = allel.stats.xpehh(h1_seg, h2_seg, pos_seg, min_ehh=0.05, include_edges=True)
In [68]:
plt.figure(figsize=(8, 8))
plt.plot(xpehh, score, marker='o', linestyle=' ', mfc='none', alpha=.5)
plt.xlabel('unstandardized XPEHH')
plt.ylabel('unstandardized XPNSL')
plt.grid(axis='both');
In [69]:
plt.figure(figsize=(16, 4))
plt.plot(pos_seg, xpehh, linestyle=' ', marker='o', mfc='none')
plt.grid(axis='y')
plt.xlabel('position (bp)')
plt.ylabel('unstandardised XPEHH score (min_ehh=0)');
In [ ]: