In [1]:
%run setup.ipynb
%matplotlib inline
In [2]:
# load in selected missense variants
tbl_variants_selected = etl.frompickle('../data/tbl_variants_missense_selected.pkl')
tbl_variants_selected.nrows()
Out[2]:
In [3]:
# load in haplotypes
callset_haps = np.load('../data/haps_phase1.npz')
haps = allel.HaplotypeArray(callset_haps['haplotypes'])
pos = allel.SortedIndex(callset_haps['POS'])
pos.shape, haps.shape
Out[3]:
In [4]:
def lewontin_d_prime(h, i, j, a=1, b=1):
"""Compute LD between a pair of alleles.
Parameters
----------
h : array
Haplotype array.
i : int
First variant index.
j : int
Second variant index.
a : int
First variant allele.
b : int
Second variant allele.
Returns
-------
ld : float
"""
# setup
h = allel.HaplotypeArray(h)
n_a = n_b = 0 # allele counts
n_ab = 0 # haplotype counts
n = 0 # allele number (i.e., number of calls)
# iterate over haplotypes, counting alleles and haplotypes
for k in range(h.n_haplotypes):
# access alleles
allele_ik = h[i, k]
allele_jk = h[j, k]
# only count if allele non-missing at both sites
if allele_ik < 0 or allele_jk < 0:
continue
# accumulate
if allele_ik == a:
n_a += 1
if allele_jk == b:
n_b += 1
if allele_ik == a and allele_jk == b:
n_ab += 1
n += 1
# log('D_prime counts:', 'i', i, 'j', j, 'a', a, 'b', b, 'n', n, 'n_a', n_a, 'n_b', n_b)
# bail out if no data or either allele is absent or fixed
if n == 0 or n_a == 0 or n_b == 0 or n == n_a or n == n_b:
return None
# N.B., compute D prime using counts rather than frequencies to avoid floating-point errors
# N.B., preserve the sign of D prime to retain information about linkage versus repulsion
# compute coefficient of linkage disequilibrium * n**2
D_ab = (n * n_ab) - (n_a * n_b)
# compute normalisation coefficient * n**2
if D_ab >= 0:
D_max = min(n_a * (n - n_b), (n - n_a) * n_b)
else:
D_max = min(n_a * n_b, (n - n_a) * (n - n_b))
# compute D prime
D_prime = D_ab / D_max
# log('D_prime', D_prime, i, j, a, b, n, n_a, n_b, D_ab, D_max)
# if np.isnan(D_prime):
# log('nan')
# log(D_prime, i, j, a, b, n, n_a, n_b, D_ab, D_max)
return D_prime
In [5]:
pos_selected = allel.SortedIndex(sorted(tbl_variants_selected.values('POS').set()))
pos_selected
Out[5]:
In [6]:
tbl_variants_selected
Out[6]:
In [7]:
pos_selected.shape
Out[7]:
In [8]:
loc_selected = pos.locate_keys(pos_selected)
np.count_nonzero(loc_selected)
Out[8]:
In [9]:
haps_selected = haps[loc_selected]
haps_selected
Out[9]:
In [10]:
ac = haps_selected.count_alleles()
ac.displayall()
In [11]:
def compute_allele_af(ax=None):
global allele_af
recs = list(tbl_variants_selected.records())
n = len(recs)
allele_af = np.zeros(n, dtype='f8')
for i in range(n):
i_pos = recs[i].POS
i_allele = recs[i].ALTIX + 1
i_vidx = pos_selected.locate_key(i_pos)
# log('row', i, i_vidx, i_pos, i_allele)
x = ac[i_vidx, i_allele] * 100 / haps_selected.shape[1]
allele_af[i] = x
compute_allele_af()
In [12]:
def compute_ld():
global ld
recs = list(tbl_variants_selected.records())
n = len(recs)
ld = np.zeros((n, n), dtype='f8')
for i in range(n):
i_pos = recs[i].POS
i_allele = recs[i].ALTIX + 1
i_vidx = pos_selected.locate_key(i_pos)
# log('row', i, i_vidx, i_pos, i_allele)
for j in range(i+1, n):
j_pos = recs[j].POS
j_allele = recs[j].ALTIX + 1
j_vidx = pos_selected.locate_key(j_pos)
# log('col', j, j_vidx, j_pos, j_allele)
v = lewontin_d_prime(haps_selected, i_vidx, j_vidx, i_allele, j_allele)
# log('D_prime', v)
ld[i, j] = v
ld[j, i] = v
compute_ld()
In [13]:
ld[11]
Out[13]:
In [14]:
def plot_allele_af(ax=None, **kwargs):
n = len(allele_af)
if ax is None:
fig, ax = plt.subplots(figsize=(7, 2))
left = np.arange(n) + 0.2
ax.bar(left, allele_af, align='edge', width=0.6, **kwargs)
ax.set_ylabel('Allele frequency (%)')
ax.set_xlim(0, n)
ax.set_xticks([])
ax.set_yticks(range(0, 60, 10))
ax.set_xticklabels([])
plot_allele_af()
In [15]:
def fig_pw_ld():
fig = plt.figure(figsize=(7, 7.3), dpi=120)
gs = mpl.gridspec.GridSpec(2, 2, height_ratios=[1.3, 6], width_ratios=[7, .5])
# sns.despine(ax=ax, offset=5)
#sns.heatmap(ld, vmin=-1, vmax=1, center=0, square=True, ax=ax, cmap='Blues', cbar_kws=dict(ticks=[-1, -.5, 0, .5, 1]))
ax = fig.add_subplot(gs[0, 0])
sns.despine(ax=ax)
plot_allele_af(ax, color='k')
ax = fig.add_subplot(gs[1, 0])
im = ax.pcolormesh(ld, vmin=-1, vmax=1, cmap='Blues', shading='flat', edgecolors='gray', linewidths=.5, antialiased=True)
labels = ['%s:%s>%s %s' % (rec.POS, rec.REF, rec.ALT, rec['AGAP004707-RA'].rjust(6))
for rec in tbl_variants_selected.records()]
# ax.invert_yaxis()
ticks = np.arange(ld.shape[0]) + .5
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(labels, rotation=90, ha='center', va='top', fontdict=dict(family='monospace'))
ax.set_yticklabels(labels, rotation=0, va='center', ha='right', fontdict=dict(family='monospace'));
ax.set_xlim(0, ld.shape[0])
ax.set_ylim(0, ld.shape[0])
ax.xaxis.set_tick_params(length=0)
ax.yaxis.set_tick_params(length=0)
for i in range(ld.shape[0] + 1):
ax.add_patch(plt.Rectangle((i-1, i-1), 1, 1, color='gray'))
cax = fig.add_subplot(gs[1, 1])
fig.colorbar(im, cax=cax, )
# cax.set_title("Linkage disequilibrium (D')", loc='left')
cax.set_ylabel("Linkage disequilibrium (D')", va='top')
fig.tight_layout(pad=0.1)
fig.savefig('../artwork/fig_ld.png', dpi=300, bbox_inches='tight')
fig_pw_ld()
In [ ]: