In [1]:
import numpy as np
np.random.seed(42)
import sys
import cProfile
import h5py
sys.path.insert(0, '../..')
%reload_ext memory_profiler
%reload_ext autoreload
%autoreload 1
import allel; print(allel.__version__)
%aimport allel.stats.selection
In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
In [3]:
callset = h5py.File('/data/coluzzi/ag1000g/data/phase1/release/AR3/haplotypes/main/hdf5/ag1000g.phase1.ar3.haplotypes.3R.h5',
mode='r')
callset
Out[3]:
In [4]:
n_variants = 500000
n_samples = 100
h = allel.GenotypeArray(callset['3R']['calldata/genotype'][:n_variants, :n_samples]).to_haplotypes()
h
Out[4]:
In [5]:
pos = callset['3R']['variants/POS'][:n_variants]
pos
Out[5]:
In [6]:
ac = h.count_alleles(max_allele=1)
is_seg = ac.is_segregating() & (ac.min(axis=1) > 1)
h_seg = h.compress(is_seg, axis=0)
pos_seg = pos.compress(is_seg)
ac_seg = ac.compress(is_seg, axis=0)
np.count_nonzero(is_seg)
Out[6]:
In [7]:
%%time
score_0min = allel.stats.ihs(h_seg, pos_seg, min_ehh=0, min_maf=0, include_edges=True, use_threads=False)
In [8]:
%%time
score = allel.stats.ihs(h_seg, pos_seg, min_ehh=0.05, min_maf=0, include_edges=True, use_threads=False)
In [9]:
%%time
score_threaded = allel.stats.ihs(h_seg, pos_seg, min_ehh=0.05, min_maf=0, include_edges=True, use_threads=True)
In [10]:
score_0min
Out[10]:
In [11]:
score
Out[11]:
In [12]:
score_threaded
Out[12]:
In [13]:
cProfile.run('allel.stats.ihs(h_seg[:50000], pos_seg[:50000], min_ehh=0.05, min_maf=0, include_edges=True, use_threads=False)', sort='time')
In [14]:
np.count_nonzero(np.isnan(score)), np.count_nonzero(~np.isnan(score))
Out[14]:
In [15]:
np.count_nonzero(np.isinf(score)), np.count_nonzero(~np.isinf(score))
Out[15]:
In [16]:
plt.figure(figsize=(16, 4))
plt.plot(pos_seg, score, linestyle=' ', marker='o', mfc='none')
plt.grid(axis='y')
plt.xlabel('position (bp)')
plt.ylabel('unstandardised iHS score')
plt.autoscale(axis='x', tight=True);
In [17]:
plt.figure(figsize=(16, 6))
plt.plot(ac_seg[:, 1], score, linestyle=' ', marker='o', mfc='none')
plt.xlabel('alternate allele count')
plt.ylabel('unstandardised iHS score')
plt.grid(axis='y');
In [19]:
score_standardized, ac_bins = allel.stats.standardize_by_allele_count(score, ac_seg[:, 1])
In [20]:
plt.figure(figsize=(16, 6))
plt.plot(ac_seg[:, 1], score_standardized, linestyle=' ', marker='o', mfc='none')
plt.xlabel('Alternate allele count')
plt.ylabel('Standardised IHS score')
plt.grid(axis='y');
In [21]:
plt.figure(figsize=(16, 4))
plt.plot(pos_seg, score_standardized, linestyle=' ', marker='o', mfc='none')
plt.grid(axis='y')
plt.xlabel('Position (bp)')
plt.ylabel('Standardised IHS score')
plt.autoscale(axis='x', tight=True);
In [22]:
plt.figure(figsize=(16, 4))
plt.plot(pos_seg, np.abs(score_standardized), linestyle=' ', marker='o', mfc='none')
plt.grid(axis='y')
plt.xlabel('Position (bp)')
plt.ylabel('|Standardised iHS score|')
plt.autoscale(axis='x', tight=True);
In [23]:
h_seg.shape
Out[23]:
In [61]:
loc_variants = slice(4000000, 9000000, 1)
n_samples = 50
h = allel.GenotypeArray(callset['3R']['calldata/genotype'][loc_variants, :n_samples]).to_haplotypes()
h
Out[61]:
In [62]:
pos = callset['3R']['variants/POS'][loc_variants]
pos
Out[62]:
In [63]:
ac = h.count_alleles(max_allele=1)
is_seg = ac.is_segregating() & (ac.min(axis=1) > 5)
h_seg = h.compress(is_seg, axis=0)
pos_seg = pos.compress(is_seg)
ac_seg = ac.compress(is_seg, axis=0)
np.count_nonzero(is_seg)
Out[63]:
In [64]:
def plot_score_gap(score, pos, ylim=(-12, 12)):
fig = plt.figure(figsize=(16, 4))
ax = fig.add_subplot(111)
ax.plot(pos, score, linestyle=' ', marker='o', mfc='none', markersize=2)
ax.grid(axis='y')
ax.set_xlabel('position (bp)')
ax.set_ylabel('score')
ax.set_ylim(*ylim)
ax = ax.twinx()
x = (pos[:-1] + pos[1:]) / 2
y = np.diff(pos)
ax.plot(x, y)
ax.set_ylabel('gap size (bp)')
ax.autoscale(axis='x', tight=True);
In [65]:
accessibility = h5py.File('/data/coluzzi/ag1000g/data/phase1/release/AR3/accessibility/accessibility.h5', mode='r')
is_accessible = accessibility['3R']['is_accessible'][:]
is_accessible
Out[65]:
In [76]:
%%time
score_unadjusted = allel.stats.ihs(h_seg, pos_seg, include_edges=True, max_gap=None, gap_scale=None)
In [77]:
np.count_nonzero(np.isnan(score_unadjusted)), np.count_nonzero(~np.isnan(score_unadjusted))
Out[77]:
In [78]:
score_unadjusted, _ = allel.stats.standardize_by_allele_count(score_unadjusted, ac_seg[:, 1])
In [79]:
score_gap_adjusted = allel.stats.ihs(h_seg, pos_seg, max_gap=200000, gap_scale=1000)
score_gap_adjusted, _ = allel.stats.standardize_by_allele_count(score_gap_adjusted, ac_seg[:, 1])
In [80]:
score_access_adjusted = allel.stats.ihs(h_seg, pos_seg, min_ehh=0.05, max_gap=None, gap_scale=None,
is_accessible=is_accessible)
score_access_adjusted, _ = allel.stats.standardize_by_allele_count(score_access_adjusted, ac_seg[:, 1])
In [81]:
plot_score_gap(score_unadjusted, pos_seg)
In [82]:
plot_score_gap(score_gap_adjusted, pos_seg)
In [83]:
plot_score_gap(score_access_adjusted, pos_seg)
In [84]:
plot_score_gap(np.abs(score_access_adjusted), pos_seg, ylim=(0, 12))
In [85]:
plot_score_gap(score_gap_adjusted - score_unadjusted, pos_seg, ylim=(-6, 6))
In [86]:
plot_score_gap(score_access_adjusted - score_unadjusted, pos_seg, ylim=(-6, 6))
In [87]:
plt.figure(figsize=(8, 8))
plt.plot(score_unadjusted, score_gap_adjusted, marker='o', mfc='none', linestyle=' ');
In [88]:
plt.figure(figsize=(8, 8))
plt.plot(score_unadjusted, score_access_adjusted, marker='o', mfc='none', linestyle=' ');
In [89]:
plt.figure(figsize=(8, 8))
plt.plot(score_gap_adjusted, score_access_adjusted, marker='o', mfc='none', linestyle=' ');
In [90]:
loc_variants = slice(4000000, 9000000, 1)
n_samples = 50
h = allel.GenotypeArray(callset['3R']['calldata/genotype'][loc_variants, :n_samples]).to_haplotypes()
h
Out[90]:
In [97]:
pos = callset['3R']['variants/POS'][loc_variants]
pos
Out[97]:
In [98]:
ac = h.count_alleles(max_allele=1)
is_seg = ac.is_segregating() & (ac.min(axis=1) > 0)
h_seg = h.compress(is_seg, axis=0)
pos_seg = pos.compress(is_seg)
ac_seg = ac.compress(is_seg, axis=0)
np.count_nonzero(is_seg)
Out[98]:
In [99]:
%%time
score = allel.stats.ihs(h_seg, pos_seg, min_maf=0.05, max_gap=200000, gap_scale=1000)
In [100]:
np.count_nonzero(np.isnan(score)), np.count_nonzero(~np.isnan(score))
Out[100]:
In [101]:
score_std, _ = allel.stats.standardize_by_allele_count(score, ac_seg[:, 1])
In [102]:
plot_score_gap(score_std, pos_seg)
In [103]:
plot_score_gap(np.abs(score_std), pos_seg, ylim=(0, 12))
In [ ]: