In [1]:
%run setup.ipynb
%matplotlib inline
%config InlineBackend.figure_formats = {'retina', 'png'}
rcParams['figure.dpi'] = 120
rcParams['figure.facecolor'] = 'w'
In [2]:
# grab the npy array made in ag1000g paper 1 vgsc notebook
p1clus = np.load('../data/hierarchical_cluster_membership.npy')
In [3]:
# make this into a dictionary
p1list = [a.decode("utf-8") for a in p1clus]
p1list = np.asarray(p1list)
np.unique(p1list)[1:], len(p1list)
Out[3]:
In [4]:
nlist = list(np.unique(p1list)[1:])
nlist
Out[4]:
In [5]:
clust_dict = {n: set(np.nonzero(p1list == n)[0]) for n in nlist}
#clust_dict['S2']
In [6]:
tbl_variants_selected = etl.frompickle('../data/tbl_variants_missense_selected.pkl')
tbl_variants_selected
Out[6]:
In [7]:
aa2pos = tbl_variants_selected.lookupone('AGAP004707-RA', 'POS')
In [8]:
aa2pos['R254K']
Out[8]:
In [9]:
aa2pos['L995F']
Out[9]:
In [10]:
aa2pos['L995S']
Out[10]:
In [11]:
aa2pos['I1527T']
Out[11]:
In [12]:
aa2pos['M490I']
Out[12]:
In [13]:
aa2pos['V1254I']
Out[13]:
In [14]:
callset_haps = np.load('../data/haps_phase1.npz')
haps = allel.HaplotypeArray(callset_haps['haplotypes'])
pos = allel.SortedIndex(callset_haps['POS'])
ann = callset_haps['ANN']
In [15]:
pos
Out[15]:
In [16]:
def setup_ehh_data():
global haps_ehh
global pos_ehh
# # load haplotypes
# callset_haps = np.load('../data/haps_phase1.npz')
# haps = allel.HaplotypeArray(callset_haps['haplotypes'])
# pos = allel.SortedIndex(callset_haps['POS'])
# ann = callset_haps['ANN']
# perform allele count - needed to locate singletons
ac = haps.count_alleles(max_allele=3)
# define types of variants to include in EHH analysis - should be mostly neutral
loc_type_neutral = ((ann['Annotation'] == b'intergenic_region') |
(ann['Annotation'] == b'intron_variant') |
(ann['Annotation'] == b'downstream_gene_variant') |
(ann['Annotation'] == b'upstream_gene_variant') |
(ann['Annotation'] == b'synonymous_variant') |
(ann['Annotation'] == b'3_prime_UTR_variant') |
(ann['Annotation'] == b'5_prime_UTR_variant')
)
print('neutral', np.count_nonzero(loc_type_neutral), loc_type_neutral.shape)
# locate singletons - will exclude from EHH analysis
# NB the EHH analysis doesn't need the multiallelics
loc_sgl_bi = (ac[:, :2].min(axis=1) == 1) & (ac.is_biallelic_01())
loc_nosgl_bi = (ac[:, :2].min(axis=1) > 1) & (ac.is_biallelic_01())
print('sgl/nosgl', np.count_nonzero(loc_sgl_bi), np.count_nonzero(loc_nosgl_bi), loc_nosgl_bi.shape)
# these are the variants to use for EHH
loc_ehh = loc_type_neutral & loc_nosgl_bi
# add back in driver mutations
for aa in 'L995S', 'L995F', 'I1527T':
loc_ehh[pos.locate_key(aa2pos[aa])] = True
print('ehh', np.count_nonzero(loc_ehh), loc_ehh.shape)
haps_ehh = haps[loc_ehh]
pos_ehh = pos[loc_ehh]
setup_ehh_data()
In [17]:
pos_ehh
Out[17]:
In [18]:
# read in haplotype metadata to get population
df_haplotypes = phase1_ar31.df_haplotypes
df_haplotypes = df_haplotypes[df_haplotypes.population != 'colony']
df_haplotypes.head()
Out[18]:
In [19]:
cluster_labels = sorted(clust_dict)
cluster_labels
Out[19]:
In [20]:
haps_L995F = set(np.nonzero(haps[pos.locate_key(aa2pos['L995F']), :] == 1)[0])
len(haps_L995F)
Out[20]:
In [21]:
haps_L995S = set(np.nonzero(haps[pos.locate_key(aa2pos['L995S']), :] == 1)[0])
len(haps_L995S)
Out[21]:
In [22]:
haps_I1527T = set(np.nonzero(haps[pos.locate_key(aa2pos['I1527T']), :] == 1)[0])
len(haps_I1527T)
Out[22]:
In [23]:
haps_M490I_1 = set(np.nonzero(haps[pos.locate_key(aa2pos['M490I']), :] == 1)[0])
len(haps_M490I_1)
Out[23]:
In [24]:
haps_M490I_2 = set(np.nonzero(haps[pos.locate_key(aa2pos['M490I']), :] == 2)[0])
len(haps_M490I_2)
Out[24]:
In [25]:
target_sets = clust_dict.copy()
wt = set(range(haps_ehh.shape[1]))
for s in clust_dict.values():
wt = wt - s
target_sets['L1'] = haps_I1527T
target_sets['L2'] = haps_M490I_1
wt = wt - target_sets['L1']
wt = wt - target_sets['L2']
target_sets['wt'] = wt
In [26]:
target_labels = (
sorted([l for l in target_sets if l.startswith('F')]) +
sorted([l for l in target_sets if l.startswith('S')]) +
sorted([l for l in target_sets if l.startswith('L')]) +
['wt']
)
for l in target_labels:
print(l, len(target_sets[l]))
In [27]:
def plot_delta_af(core, flank, set1, set2, figsize=(10, 2)):
# subset to region
loc_region = pos_ehh.locate_range(core - flank, core + flank)
pos_region = pos_ehh[loc_region]
n_snps = pos_region.shape[0]
haps_region = haps_ehh[loc_region]
# compute delta allele frequency
subpops = {set1: sorted(target_sets[set1]), set2: sorted(target_sets[set2])}
acs = haps_region.count_alleles_subpops(subpops)
aaf1 = acs[set1].to_frequencies()[:, 1]
aaf2 = acs[set2].to_frequencies()[:, 1]
delta_af = np.fabs(aaf1 - aaf2)
loc_nz = delta_af > 0.1
pos_region = pos_region[loc_nz]
delta_af = delta_af[loc_nz]
# plot
fig, ax = plt.subplots(figsize=figsize)
sns.despine(ax=ax, offset=5)
# x = np.concatenate([-np.arange(0, idx_core)[::-1],
# np.arange(idx_core, n_snps) - idx_core])
# y = delta_af
ax.plot(pos_region, delta_af, marker='o', mec='k', mfc='none', linestyle=' ')
loc_diff = delta_af > .9
pos_diff = pos_region[loc_diff]
ax.set_xticks(pos_diff)
ax.set_xticklabels(pos_diff - core, rotation=90)
ax.axvline(core, linestyle='--', lw=1)
ax.grid(axis='both')
ax.set_title('%s vs %s' % (set1, set2))
ax.set_xlim(core - flank, core + flank)
ax.set_ylabel('Delta allele frequency')
ax.set_xlabel('Position relative to core (bp)')
ax.set_ylim(-0.03, 1.03)
In [28]:
def analyse_995F_core_region():
core = aa2pos['L995F']
flank = 5000
for set1, set2 in itertools.combinations(cluster_labels[:5], 2):
plot_delta_af(core, flank, set1, set2)
plt.show()
analyse_995F_core_region()
In [29]:
def analyse_995S_core_region():
core = aa2pos['L995S']
flank = 5000
for set1, set2 in itertools.combinations(cluster_labels[5:], 2):
plot_delta_af(core, flank, set1, set2)
plt.show()
analyse_995S_core_region()
In [30]:
def analyse_L2_core_region():
core = aa2pos['L995S']
flank = 3000
plot_delta_af(core, flank, 'L2', 'wt')
plt.show()
analyse_L2_core_region()
In [31]:
def extract_core_haplotypes(start, stop):
loc_region = pos_ehh.locate_range(start, stop)
haps_region = haps_ehh[loc_region]
ret = dict()
distinct_sets = haps_region.distinct()
wt_ix = 1
fx_ix = 1
sx_ix = 1
lx_ix = 1
for s in distinct_sets:
isecs = [len(s.intersection(target_sets[l])) for l in target_labels]
n_995F = len(s.intersection(haps_L995F))
n_995S = len(s.intersection(haps_L995S))
n_1527T = len(s.intersection(haps_I1527T))
n_490I_1 = len(s.intersection(haps_M490I_1))
label = target_labels[np.argmax(isecs)]
if label == 'wt' or label in ret:
if n_995F:
label = 'FX%02d' % fx_ix
fx_ix += 1
elif n_995S:
label = 'SX%02d' % sx_ix
sx_ix += 1
elif n_1527T or n_490I_1:
label = 'LX%02d' % lx_ix
lx_ix += 1
else:
label = 'WT%02d' % wt_ix
wt_ix += 1
ret[label] = s
clust_isecs = ['%s:%s/%s (%.1f%%);' % (l, len(s.intersection(o)), len(o), len(s.intersection(o)) * 100 / len(o))
for l, o in sorted(target_sets.items()) if s.intersection(o)]
mut_isecs = ['L995F:{}/{};'.format(n_995F, len(haps_L995F)),
'L995S:{}/{};'.format(n_995S, len(haps_L995S)),
'I1527T:{}/{};'.format(n_1527T, len(haps_I1527T)),
'M490I.1:{}/{};'.format(n_490I_1, len(haps_M490I_1))]
print('\t'.join([label, str(len(s))] + clust_isecs + mut_isecs))
return ret
In [32]:
start = aa2pos['L995S'] - 2208
stop = aa2pos['L995S'] + 3870
core_haps = extract_core_haplotypes(start, stop)
In [33]:
# This is enough to get all core haplotypes, but includes a couple of wt GW in L2 which bring down EHH
# start = aa2pos['L995S'] - 402
# stop = aa2pos['L995S'] + 3870
# core_haps = extract_core_haplotypes(start, stop)
In [34]:
with open('../data/core_haps.pkl', mode='wb') as f:
pickle.dump(core_haps, f)
In [35]:
cluster_palette = sns.color_palette('nipy_spectral', n_colors=len(target_labels) - 1, desat=0.8)
sns.palplot(cluster_palette)
plt.gca().set_xticklabels(target_labels)
cluster_colors = dict(zip(target_labels, cluster_palette)) # don't include wt
In [36]:
sns.set_style('white')
sns.set_style('ticks')
In [37]:
def plot_ehh_decay(core, core_haps, flank=1000000, ax=None, cluster_colors=cluster_colors, min_cnt=14):
loc_right = pos_ehh.locate_range(core, core + flank)
haps_right = haps_ehh[loc_right]
pos_right = pos_ehh[loc_right]
loc_left = pos_ehh.locate_range(core - flank, core)
haps_left = haps_ehh[loc_left]
pos_left = pos_ehh[loc_left]
fig = None
if ax is None:
fig, ax = plt.subplots(figsize=(7, 2.4))
sns.despine(ax=ax, offset=5)
labels = set()
for l, s in core_haps.items():
if len(s) < min_cnt:
continue
labels.add(l)
if l in cluster_colors:
color = cluster_colors[l]
lw = 2
else:
color = 'k'
lw = .5
haps_right_core = haps_right.take(sorted(s), axis=1)
haps_left_core = haps_left.take(sorted(s), axis=1)
ehh_decay_right = allel.ehh_decay(haps_right_core)
ehh_decay_left = allel.ehh_decay(haps_left_core[::-1])
ax.plot(pos_right, ehh_decay_right, color=color, lw=lw)
ax.plot(pos_left, ehh_decay_left[::-1], color=color, lw=lw)
ax.set_xlim(core - flank, core + flank)
ax.set_xlabel('Chromosome 2L position (Mbp)')
ax.set_xticklabels(['%.1f' % (t/1e6) for t in ax.get_xticks()])
ax.set_ylim(0, 1.02)
ax.set_ylabel('EHH')
# ax.grid(axis='y')
handles = []
for l in sorted(labels):
if l in cluster_colors:
color = cluster_colors[l]
lw = 2
label = l
handles.append(plt.Line2D([0, 0], [0, 0], color=color, lw=lw, label=l))
color = 'k'
lw = .5
handles.append(plt.Line2D([0, 0], [0, 0], color=color, lw=lw, label='$wt$'))
legend = ax.legend(handles=handles, bbox_to_anchor=(1, 1), loc='upper right', labelspacing=.3,
frameon=False, framealpha=1, title='Core haplotype')
plt.setp(legend.get_title(), fontsize=base_font_size)
if fig:
fig.tight_layout()
In [38]:
plot_ehh_decay(aa2pos['L995S'], {l: s for l, s in core_haps.items() if l.startswith('S') or l.startswith('WT')})
In [39]:
plot_ehh_decay(aa2pos['L995S'], {l: s for l, s in core_haps.items() if l.startswith('F') or l.startswith('WT')})
In [40]:
plot_ehh_decay(aa2pos['L995S'], {l: s for l, s in core_haps.items() if l.startswith('L') or l.startswith('WT')})
In [41]:
def fig_kdr_ehh_decay(figsize=(7, 5.5), fn=None, save_dpi=150):
core = aa2pos['L995S']
flank = 1000000
fig = plt.figure(figsize=figsize)
gs = mpl.gridspec.GridSpec(4, 1, height_ratios=[4, 4, 4, 1])
ax = fig.add_subplot(gs[0])
sns.despine(ax=ax, bottom=True, offset=5)
plot_ehh_decay(core, {l: s for l, s in core_haps.items() if l.startswith('F') or l.startswith('WT')},
ax=ax, flank=flank)
ax.set_xticks([])
ax.set_xlabel('')
ax.set_ylabel('')
ax = fig.add_subplot(gs[1])
sns.despine(ax=ax, bottom=True, offset=5)
plot_ehh_decay(core, {l: s for l, s in core_haps.items() if l.startswith('S') or l.startswith('WT')},
ax=ax, flank=flank)
ax.set_xticks([])
ax.set_xlabel('')
ax.set_ylabel('EHH')
ax = fig.add_subplot(gs[2])
sns.despine(ax=ax, bottom=True, offset=5)
plot_ehh_decay(core, {l: s for l, s in core_haps.items() if l.startswith('L') or l.startswith('WT')}, ax=ax, flank=flank)
ax.set_xticks([])
ax.set_xlabel('')
ax.set_ylabel('')
ax = fig.add_subplot(gs[3])
sns.despine(ax=ax, offset=5)
plot_genes(phase1_ar3.genome_fn, phase1_ar3.geneset_agamp42_fn, ax=ax,
chrom='2L', start=core-flank, stop=core+flank, labels=gene_labels, label=True, label_unnamed=False)
ax.set_xlabel('Chromosome 2L position (Mbp)')
ax.set_xticklabels(['%.1f' % (t/1e6) for t in ax.get_xticks()])
ax.set_ylabel('Genes')
fig.tight_layout()
if fn:
fig.savefig(fn, bbox_inches='tight', dpi=save_dpi, jpeg_quality=100)
fig_kdr_ehh_decay(fn='../artwork/ehh_decay_old_clusters.pdf')
In [42]:
def build_f_breakdown():
global f_breakdown
f_breakdown = dict()
f1_all = core_haps['F1']
f1_vanilla = set(f1_all)
for mut in 'T791M', 'N1570Y', 'E1597G', 'K1603T', 'V1853I', 'I1868T', 'P1874S', 'P1874L', 'A1934V':
k = 'F1+{}'.format(mut)
s_mut = set(np.nonzero(haps[pos.locate_key(aa2pos[mut]), :] == 1)[0])
s = f1_all.intersection(s_mut)
print(k, len(s_mut), len(s))
f_breakdown[k] = s
f1_vanilla = f1_vanilla - s
f_breakdown['F1'] = f1_vanilla
print('F1', len(f1_vanilla))
build_f_breakdown()
# for k, s in f_breakdown:
# print(k, len(s))
In [43]:
f_breakdown_labels = sorted(f_breakdown)
In [44]:
f_breakdown_palette = sns.color_palette('nipy_spectral', n_colors=len(f_breakdown), desat=0.8)
sns.palplot(f_breakdown_palette)
plt.gca().set_xticklabels(f_breakdown_labels)
f_breakdown_colors = dict(zip(f_breakdown_labels, f_breakdown_palette)) # don't include wt
In [45]:
plot_ehh_decay(aa2pos['L995S'], f_breakdown, cluster_colors=f_breakdown_colors, min_cnt=1)
In [46]:
import hapclust
In [47]:
# set core SNP
core_pos = aa2pos['L995S']
In [48]:
# split the EHH dataset
dist_ehh_right, dist_ehh_left, haps_ehh_right, haps_ehh_left = hapclust.split_flanks(haps_ehh, pos_ehh, core_pos)
In [49]:
def pairwise_haplotype_sharing(haps, dist, jitter=False):
haps = allel.HaplotypeArray(haps)
n_haplotypes = haps.n_haplotypes
# compute length (no. variants) of shared prefix between pairs
pspl = allel.opt.stats.pairwise_shared_prefix_lengths(
np.asarray(haps, dtype='i1')
)
# compute length (physical distance) of shared prefix between neighbours
pspd = hapclust._shared_distance(pspl, dist, jitter=jitter)
return pspd
In [50]:
@functools.lru_cache(maxsize=None)
def compute_pspd(hap_ixs):
# compute pairwise shared prefix distance
pspd_left = pairwise_haplotype_sharing(haps_ehh_left.take(hap_ixs, axis=1), dist_ehh_left, jitter=True)
pspd_right = pairwise_haplotype_sharing(haps_ehh_right.take(hap_ixs, axis=1), dist_ehh_right, jitter=True)
return pspd_left, pspd_right
In [51]:
core_hap_labels = sorted([l for l, s in core_haps.items() if len(s) >= 14])
core_hap_labels = (
[l for l in core_hap_labels if l.startswith('F')] +
[l for l in core_hap_labels if l.startswith('S')] +
[l for l in core_hap_labels if l.startswith('L')] +
[l for l in core_hap_labels if l.startswith('WT')]
)
In [52]:
pspds = [compute_pspd(tuple(sorted(core_haps[l]))) for l in core_hap_labels]
In [53]:
pspds_adjusted = [pl * 0.3 * 2e-6 + pr * 2e-6 for pl, pr in pspds]
In [54]:
def boxplot_pspds(x, xlabels, figsize=(7, 3), colors=None, fn=None, ylabel=None, save_dpi=150):
fig, ax = plt.subplots(figsize=figsize)
bx = ax.boxplot(x, notch=True, bootstrap=1000, whis=[5, 95], showfliers=False,
patch_artist=True,
medianprops=dict(linestyle='-', color='k'))
#ax.set_yscale('log')
ax.set_xticklabels(xlabels)
ax.set_xlabel('Core haplotype')
if ylabel:
ax.set_ylabel(ylabel)
if colors:
for patch, color in zip(bx['boxes'], colors):
patch.set_facecolor(color)
ax.grid(axis='y')
#ax.set_ylim(bottom=100000);
if fn:
fig.savefig(fn, bbox_inches='tight', dpi=save_dpi, jpeg_quality=100)
In [55]:
boxplot_pspds(pspds_adjusted, xlabels=[l.replace('WT0', '$wt$') for l in core_hap_labels],
colors=[cluster_colors.get(k, 'w') for k in core_hap_labels],
fn='../artwork/clusters_compare_pspd.pdf',
ylabel='Shared haplotype length (cM)');
In [56]:
import scikits.bootstrap as bootstrap
In [57]:
for x, l in zip(pspds_adjusted, core_hap_labels):
ci = bootstrap.ci(x, statfunction=np.median, n_samples=10000, method='pi')
print('{} median {:.3f} cM (95% CI [{:.3f} - {:.3f}])'.format(l, np.median(x), ci[0], ci[1]))
In [58]:
for x, l in zip(pspds_adjusted, core_hap_labels):
ci = bootstrap.ci(x, statfunction=np.median, n_samples=10000, method='bca')
print('{} median {:.3f} cM (95% CI [{:.3f} - {:.3f}])'.format(l, np.median(x), ci[0], ci[1]))
In [ ]: