In [1]:
%run setup.ipynb
%matplotlib notebook
# %reload_ext autoreload
# %autoreload 1
# %aimport hapclust
import hapclust
In [2]:
# obtain data from unphased callset - only needed for variant annotations
callset_pass = phase1_ar31.callset_pass
pos_pass = allel.SortedIndex(callset_pass['2L/variants/POS'])
ann_pass = callset_pass['2L/variants/ANN'][:][['Annotation', 'HGVS_p']]
In [3]:
# setup haplotype data
callset_phased = phase1_ar31.callset_phased
genotypes_phased = allel.GenotypeDaskArray(callset_phased['2L/calldata/genotype'])
pos_phased = allel.SortedIndex(callset_phased['2L/variants/POS'])
In [4]:
pos_kdr_s = 2422651
pos_kdr_f = 2422652
In [5]:
# define region we're going to analyse
loc_region = pos_phased.locate_range(0, 4000000)
pos_phased_region = pos_phased[loc_region]
pos_phased_region
Out[5]:
In [6]:
# locate the intersection with unphased callset - needed to tie in annotations
loc1, _ = pos_pass.locate_intersection(pos_phased_region)
np.count_nonzero(loc1)
Out[6]:
In [7]:
ann_phased_region = ann_pass[loc1]
ann_phased_region
Out[7]:
In [8]:
collections.Counter(ann_phased_region['Annotation'])
Out[8]:
In [9]:
# exclude cross parents
haps_phased_region = genotypes_phased[loc_region].to_haplotypes()[:, :-16].compute()
In [10]:
# perform allele count - needed to locate singletons
ac_phased_region = haps_phased_region.count_alleles(max_allele=1)
In [11]:
# convenience, define the Vgsc gene region
region_vgsc = SeqFeature('2L', 2358158, 2431617, label='Vgsc')
region_vgsc
Out[11]:
In [12]:
loc_vgsc = pos_phased_region.locate_range(region_vgsc.start, region_vgsc.end)
loc_vgsc
Out[12]:
In [13]:
haps_vgsc = haps_phased_region[loc_vgsc]
In [14]:
ac_vgsc = haps_vgsc.count_alleles(max_allele=1)
In [15]:
ann_vgsc = ann_phased_region[loc_vgsc]
In [16]:
loc_vgsc_missense = (ann_vgsc['Annotation'] == b'missense_variant') & (ac_vgsc[:, 1] > 7)
np.count_nonzero(loc_vgsc_missense)
Out[16]:
In [17]:
haps_vgsc_missense = haps_vgsc[loc_vgsc_missense]
In [18]:
lbl_vgsc_missense = [l[2:] for l in ann_vgsc[loc_vgsc_missense]['HGVS_p'].astype('U')]
lbl_vgsc_missense
Out[18]:
Here we divide haplotype data up into two, with an "EHH" set containing no singletons and only neutral variants, which we'll use for analysis of haplotype sharing, and a "mut" set containing singletons and putatively non-neutral variants that we will use for analysis of mutations on shared haplotypes.
In [19]:
# define types of variants to include in EHH analysis - should be mostly neutral
loc_type_neutral = ((ann_phased_region['Annotation'] == b'intergenic_region') |
(ann_phased_region['Annotation'] == b'intron_variant') |
(ann_phased_region['Annotation'] == b'downstream_gene_variant') |
(ann_phased_region['Annotation'] == b'upstream_gene_variant') |
(ann_phased_region['Annotation'] == b'synonymous_variant') |
(ann_phased_region['Annotation'] == b'3_prime_UTR_variant') |
(ann_phased_region['Annotation'] == b'5_prime_UTR_variant')
)
np.count_nonzero(loc_type_neutral), loc_type_neutral.shape
Out[19]:
In [20]:
# locate singletons - will exclude from EHH analysis
loc_sgl = ac_phased_region.min(axis=1) == 1
loc_nosgl = ac_phased_region.min(axis=1) > 1
np.count_nonzero(loc_sgl), np.count_nonzero(loc_nosgl), loc_nosgl.shape
Out[20]:
In [21]:
# these are the variants to use for EHH
loc_ehh = loc_type_neutral & loc_nosgl
np.count_nonzero(loc_ehh), loc_ehh.shape
Out[21]:
In [22]:
# these are the variants to use for mutational distance
#loc_mut = loc_sgl
# include non-neutral mutations
loc_mut = loc_sgl | ~loc_type_neutral
np.count_nonzero(loc_mut), loc_mut.shape
Out[22]:
In [23]:
haps_mut = haps_phased_region[loc_mut]
In [24]:
haps_ehh = haps_phased_region[loc_ehh]
In [25]:
pos_ehh = pos_phased_region[loc_ehh]
In [26]:
pos_mut = pos_phased_region[loc_mut]
In [27]:
pos_mut.locate_key(pos_kdr_s)
Out[27]:
In [28]:
pos_mut.locate_key(pos_kdr_f)
Out[28]:
In [29]:
# read in haplotype metadata to get population
df_haplotypes = phase1_ar31.df_haplotypes
df_haplotypes = df_haplotypes[df_haplotypes.population != 'colony']
df_haplotypes.head()
Out[29]:
In [30]:
core_pos = pos_kdr_f
In [31]:
# split the EHH dataset
dist_ehh_right, dist_ehh_left, haps_ehh_right, haps_ehh_left = hapclust.split_flanks(
haps_ehh, pos_ehh, core_pos
)
In [32]:
# this gives the distance from the core position to each downstream variant, moving away from the core
dist_ehh_right
Out[32]:
In [33]:
# this gives the distance from the core position to each upstream variant, moving away from the core
dist_ehh_left
Out[33]:
In [34]:
dist_ehh_right.shape, dist_ehh_left.shape
Out[34]:
In [35]:
dist_ehh_right.min(), dist_ehh_left.min()
Out[35]:
In [36]:
# split the mutations dataset
dist_mut_right, dist_mut_left, haps_mut_right, haps_mut_left = hapclust.split_flanks(
haps_mut, pos_mut, core_pos
)
In [37]:
dist_mut_right
Out[37]:
In [38]:
dist_mut_left
Out[38]:
In [39]:
dist_mut_right.shape, dist_mut_left.shape
Out[39]:
In [40]:
dist_mut_right.min(), dist_mut_left.min()
Out[40]:
In [41]:
haps_ehh_left.shape, haps_ehh_right.shape, haps_mut_left.shape, haps_mut_right.shape
Out[41]:
In [42]:
is_accessible = phase1_ar3.accessibility['2L/is_accessible'][:]
In [43]:
idx_sorted_right, nspl_right, nspd_right, muts_right = hapclust.neighbour_haplotype_sharing(
haps_ehh_right, haps_mut_right, dist_ehh_right, dist_mut_right
)
In [44]:
idx_sorted_left, nspl_left, nspd_left, muts_left = hapclust.neighbour_haplotype_sharing(
haps_ehh_left, haps_mut_left, dist_ehh_left, dist_mut_left
)
In [45]:
nspl_right.min(), nspl_right.max()
Out[45]:
In [46]:
nspl_left.min(), nspl_left.max()
Out[46]:
In [47]:
nspd_right.min(), nspd_right.max()
Out[47]:
In [48]:
nspd_left.min(), nspd_left.max()
Out[48]:
In [49]:
muts_right.min(), muts_right.max()
Out[49]:
In [50]:
muts_left.min(), muts_left.max()
Out[50]:
In [51]:
# compute accessible lengths - needed for mutation analysis
nspd_right_accessible = hapclust.haplotype_accessible_length(nspd_right, core_pos=core_pos, is_accessible=is_accessible, flank='right')
nspd_left_accessible = hapclust.haplotype_accessible_length(nspd_left, core_pos=core_pos, is_accessible=is_accessible, flank='left')
In [52]:
# get some diagnostics on accessibility on left versus right flanks
fig, ax = plt.subplots(figsize=(5, 5))
x = nspd_right
y = nspd_right_accessible
ax.plot(x, y, linestyle=' ', marker='o', markersize=4, label='Right flank')
x = nspd_left
y = nspd_left_accessible
ax.plot(x, y, linestyle=' ', marker='o', markersize=4, label='Left flank')
ax.set_xlabel('Physical length')
ax.set_ylabel('Accessible length')
lim = 0, x.max()
ax.set_xlim(*lim)
ax.set_ylim(*lim)
ax.plot(lim, lim, 'k--')
ax.legend()
ax.set_title('Neighbour shared haplotype length (bp)')
fig.tight_layout();
In [53]:
# 1 cM/Mb convert to M/bp
1 / (1e2 * 1e6)
Out[53]:
In [54]:
# assume constant recombination rate
rr_right = 1.5e-8
# adjust recombination rate on left flank (factor derived from pairwise analysis below)
rr_left = rr_right * 0.37
# assumed mutation rate
mu_right = 3.5e-9
# adjust mutation rate on left flank (factor derived from pairwise analysis below)
mu_left = mu_right * 0.61
In [55]:
pops_right = df_haplotypes.population[idx_sorted_right]
pop_colors_right = [phase1_ar3.pop_colors[p] for p in pops_right]
fig = plt.figure(figsize=(14, 6))
hapclust.fig_neighbour_haplotype_sharing(nspd=nspd_right,
nspd_accessible=nspd_right_accessible,
muts=muts_right,
haps_display=haps_vgsc_missense[:, idx_sorted_right],
haps_display_vlbl=lbl_vgsc_missense,
pop_colors=pop_colors_right,
nspd_cut=2e4,
nspd_ylim=(1e3, 3e6),
that_ylim=(2e1, 5e5),
muts_ylim=(0, 25),
mu=mu_right, rr=rr_right,
fig=fig)
fig.suptitle('Right flank');
In [56]:
pops_left = df_haplotypes.population[idx_sorted_left]
pop_colors_left = [phase1_ar3.pop_colors[p] for p in pops_left]
fig = plt.figure(figsize=(14, 6))
hapclust.fig_neighbour_haplotype_sharing(nspd=nspd_left,
muts=muts_left,
nspd_accessible=nspd_left_accessible,
haps_display=haps_vgsc_missense[:, idx_sorted_left],
haps_display_vlbl=lbl_vgsc_missense,
pop_colors=pop_colors_left,
nspd_cut=2e4,
nspd_ylim=(1e3, 3e6),
that_ylim=(2e1, 5e5),
muts_ylim=(0, 25),
mu=mu_left, rr=rr_left,
fig=fig)
fig.suptitle('Left flank');
In [57]:
# get some diagnostics on estimates of t hat on left versus right flanks
fig = plt.figure()
ax = fig.add_subplot(111)
x = nspd_left
t_hat = (1 + muts_left) / (2 * (nspd_left * rr_left + nspd_left_accessible * mu_left))
ax.hist(t_hat[(nspd_left > 0)], bins=np.logspace(1, 6, 50), histtype='step', lw=2, label='Upstream', color='b')
ax.axvline(np.median(t_hat), linestyle='--', color='b', lw=2)
t_hat = (1 + muts_right) / (2 * (nspd_right * rr_right + nspd_right_accessible * mu_right))
ax.hist(t_hat[(nspd_right > 0)], bins=np.logspace(1, 6, 50), histtype='step', lw=2, label='Downstream', color='g')
ax.axvline(np.median(t_hat), linestyle='--', color='g', lw=2)
ax.set_xscale('log')
ax.set_xlabel('$\hat{t}$')
ax.set_ylabel('Frequency (no. pairs)')
ax.legend();
In [58]:
pspl_right, pspd_right, pmuts_right = hapclust.pairwise_haplotype_sharing(
haps_ehh_right, haps_mut_right, dist_ehh_right, dist_mut_right, jitter=False)
In [59]:
pspl_left, pspd_left, pmuts_left = hapclust.pairwise_haplotype_sharing(
haps_ehh_left, haps_mut_left, dist_ehh_left, dist_mut_left, jitter=False)
In [60]:
pspl_right.shape, pspl_left.shape
Out[60]:
In [61]:
pspd_right.min(), pspd_right.max()
Out[61]:
In [62]:
pspd_left.min(), pspd_left.max()
Out[62]:
In [63]:
pspd_right_accessible = hapclust.haplotype_accessible_length(pspd_right, core_pos=core_pos, is_accessible=is_accessible, flank='right')
pspd_left_accessible = hapclust.haplotype_accessible_length(pspd_left, core_pos=core_pos, is_accessible=is_accessible, flank='left')
In [64]:
# check again accessibility
fig, ax = plt.subplots(figsize=(5, 5))
x = pspd_right
y = pspd_right_accessible
ax.plot(x, y, linestyle=' ', marker='o', markersize=4, label='Right flank')
x = pspd_left
y = pspd_left_accessible
ax.plot(x, y, linestyle=' ', marker='o', markersize=4, label='Left flank')
ax.set_xlabel('Physical length')
ax.set_ylabel('Accessible length')
lim = 0, x.max()
ax.set_xlim(*lim)
ax.set_ylim(*lim)
ax.plot(lim, lim, 'k--')
ax.legend()
ax.set_title('Pairwise shared haplotype length')
fig.tight_layout();
Check recombination rate on left versus right flank.
In [65]:
palette = sns.color_palette()
sns.palplot(palette);
In [66]:
# compare shared haplotype length on left versus right flanks
x = pspd_left
y = pspd_right
fig, ax = plt.subplots(figsize=(7, 7))
ax.plot(x, y, marker='o', markersize=1, mfc='none', mec='k', linestyle=' ', alpha=.1)
lim = 1e1, 1e7
df = pandas.DataFrame({'x': x, 'y': y})
# linear regression
r = sfa.ols('y ~ x + 0', data=df).fit()
# plot the regression line
ax.plot(lim, [l * r.params['x'] for l in lim], linestyle='--', color=palette[2], lw=2)
# # summarise the data in bins to provide further visuals
# sns.regplot(x, y, ax=ax, n_boot=10, x_estimator=np.median, x_bins=np.logspace(np.log10(lim[0]), np.log10(lim[1]), 20), fit_reg=False,
# scatter_kws=dict(alpha=1, zorder=20, facecolor=palette[2], edgecolor='w', linewidth=1, s=100));
ax.set_xlim(*lim)
ax.set_ylim(*lim)
ax.plot(lim, lim, 'k--')
ax.set_xlabel('Left flank')
ax.set_ylabel('Right flank')
ax.set_title('Shared haplotype length (bp)')
ax.set_xscale('log')
ax.set_yscale('log')
r.params
Out[66]:
In [67]:
# compare shared haplotype length after applying adjustment for different
# recombination rates on left and right flanks
x = pspd_left * rr_left
y = pspd_right * rr_right
fig, ax = plt.subplots(figsize=(7, 7))
ax.plot(x, y, marker='o', markersize=1, mfc='none', mec='k', linestyle=' ', alpha=.1)
lim = 1e-7, 1e-1
df = pandas.DataFrame({'x': x, 'y': y})
r = sfa.ols('y ~ x + 0', data=df).fit()
ax.plot(lim, [l * r.params['x'] for l in lim], linestyle='--', color='r', lw=2)
# sns.regplot(x, y, ax=ax, n_boot=10, x_estimator=np.median, x_bins=np.logspace(np.log10(lim[0]), np.log10(lim[1]), 20), fit_reg=False,
# scatter_kws=dict(alpha=1, zorder=20, facecolor='r', edgecolor='w', linewidth=1, s=60));
ax.set_xlim(*lim)
ax.set_ylim(*lim)
ax.plot(lim, lim, 'k--')
ax.set_xlabel('Left flank')
ax.set_ylabel('Right flank')
ax.set_title('Shared haplotype length (Morgan)')
ax.set_xscale('log')
ax.set_yscale('log')
r.params
Out[67]:
In [68]:
# check where the heterochromatin boundary is supposed to be
phase1_ar3.tbl_chromatin
Out[68]:
In [69]:
# check mutation rate on left versus right flank.
x = pmuts_left / pspd_left_accessible
y = pmuts_right / pspd_right_accessible
tst = (x > 0) & (y > 0)
x = x[tst]
y = y[tst]
lim = 1e-6, 1e-1
fig, ax = plt.subplots(figsize=(7, 7))
sns.despine(ax=ax, offset=5)
ax.plot(x, y, marker='o', markersize=2, mfc='none', mec='k', linestyle=' ')
df = pandas.DataFrame({'x': x, 'y': y})
r = sfa.ols('y ~ x + 0', data=df).fit()
ax.plot(lim, [l * r.params['x'] for l in lim], linestyle='--', color=palette[2], lw=2)
# sns.regplot(x, y, ax=ax, n_boot=10, x_estimator=np.median, x_bins=np.logspace(np.log10(lim[0]), np.log10(lim[1]), 20), fit_reg=False,
# scatter_kws=dict(alpha=1, zorder=20, facecolor=palette[2], edgecolor='w', linewidth=1, s=60));
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlim(*lim)
ax.set_ylim(*lim)
ax.plot(lim, lim, linestyle='--', color='k')
ax.set_title('Mutations ($bp^{-1}$)')
ax.set_xlabel('Left flank')
ax.set_ylabel('Reft flank')
r.params
# ax.plot(x, y, marker='o', linestyle=' ', markersize=2, mfc='none');
Out[69]:
In [70]:
# check mutations on left versus right flank after correcting mutation rate
x = pmuts_left / (mu_left * pspd_left_accessible)
y = pmuts_right / (mu_right * pspd_right_accessible)
tst = (x > 0) & (y > 0)
x = x[tst]
y = y[tst]
lim = 1e2, 1e7
fig, ax = plt.subplots(figsize=(7, 7))
sns.despine(ax=ax, offset=5)
ax.plot(x, y, marker='o', markersize=2, mfc='none', mec='k', linestyle=' ')
df = pandas.DataFrame({'x': x, 'y': y})
r = sfa.ols('y ~ x + 0', data=df).fit()
ax.plot(lim, [l * r.params['x'] for l in lim], linestyle='--', color=palette[2], lw=2)
# sns.regplot(x, y, ax=ax, n_boot=10, x_estimator=np.median, x_bins=np.logspace(np.log10(lim[0]), np.log10(lim[1]), 20), fit_reg=False,
# scatter_kws=dict(alpha=1, zorder=20, facecolor=palette[2], edgecolor='w', linewidth=1, s=60));
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlim(*lim)
ax.set_ylim(*lim)
ax.plot(lim, lim, linestyle='--', color='k')
ax.set_title('Mutations (TODO units?)')
ax.set_xlabel('Left flank')
ax.set_ylabel('Reft flank')
r.params
# ax.plot(x, y, marker='o', linestyle=' ', markersize=2, mfc='none');
Out[70]:
In [71]:
# now compare estimats of t_hat on left and right flanks, using all adjustments
pt_hat_left = (1 + pmuts_left) / (2 * (pspd_left * rr_left + pspd_left_accessible * mu_left))
pt_hat_right = (1 + pmuts_right) / (2 * (pspd_right * rr_right + pspd_right_accessible * mu_right))
x = np.log10(pt_hat_left)
y = np.log10(pt_hat_right)
fig, ax = plt.subplots(figsize=(7, 7))
ax.plot(x, y, marker='o', markersize=1, mfc='none', mec='k', linestyle=' ', alpha=.1)
lim = 1, 6.5
df = pandas.DataFrame({'x': x, 'y': y})
r = sfa.ols('y ~ x + 0', data=df).fit()
ax.plot(lim, [l * r.params['x'] for l in lim], linestyle='--', color='r', lw=2)
# sns.regplot(x, y, ax=ax, n_boot=10, x_estimator=np.median, x_bins=np.logspace(np.log10(lim[0]), np.log10(lim[1]), 20), fit_reg=False,
# scatter_kws=dict(alpha=1, zorder=20, facecolor='r', edgecolor='w', linewidth=1, s=60));
ax.set_xlim(*lim)
ax.set_ylim(*lim)
ax.plot(lim, lim, 'k--')
ax.set_xlabel('Left flank')
ax.set_ylabel('Right flank')
ax.set_title('$\hat{t}$')
ticks = np.arange(lim[0], int(lim[1]) + 1)
ticklabels = ['$10^{%s}$' % t for t in ticks]
ax.set_xticks(ticks)
ax.set_xticklabels(ticklabels)
ax.set_yticks(ticks)
ax.set_yticklabels(ticklabels)
r.params
Out[71]:
A few diagnostic comparisons between pairwise and earlier neighbour analyses...
In [72]:
pspl_right.max(), nspl_right.max()
Out[72]:
In [73]:
pspd_right.max(), nspd_right.max()
Out[73]:
In [74]:
pspl_left.max(), nspl_left.max()
Out[74]:
In [75]:
pspd_left.max(), nspd_left.max()
Out[75]:
In [76]:
pmuts_right.shape, pmuts_left.shape
Out[76]:
In [77]:
muts_right.max(), muts_left.max()
Out[77]:
In [78]:
pmuts_right.max(), pmuts_left.max()
Out[78]:
In [79]:
np.bincount(pmuts_right)
Out[79]:
In [80]:
np.bincount(pmuts_left)
Out[80]:
In [81]:
fig, ax = plt.subplots()
ax.hist([pmuts_right, pmuts_left], stacked=True, bins=np.arange(25));
Look at the distribution of ages on left, right and combined flanks...
In [82]:
pt_hat_right = (1 + pmuts_right) / (2 * (pspd_right * rr_right + pspd_right_accessible * mu_right))
fig, ax = plt.subplots()
ax.hist(pt_hat_right, bins=np.logspace(1, 6, 30))
ax.set_xscale('log')
ax.set_xlabel('$\hat{t}$')
ax.set_ylabel('Frequency (no. pairs)');
In [83]:
pt_hat_left = (1 + pmuts_left) / (2 * (pspd_left * rr_left + pspd_left_accessible * mu_left))
fig, ax = plt.subplots()
ax.hist(pt_hat_left, bins=np.logspace(1, 6, 30))
ax.set_xscale('log')
ax.set_xlabel('$\hat{t}$')
ax.set_ylabel('Frequency (no. pairs)');
In [84]:
pspd_both = pspd_left + pspd_right
pmuts_both = pmuts_left + pmuts_right
psgl_both = pspd_left * rr_left + pspd_right * rr_right
psml_both = pspd_left_accessible * mu_left + pspd_right_accessible * mu_right
pt_hat_both = (1 + pmuts_both) / (2 * (psgl_both + psml_both))
fig, ax = plt.subplots()
ax.hist(pt_hat_both, bins=np.logspace(1, 6, 30))
ax.set_xscale('log')
ax.set_xlabel('$\hat{t}$')
ax.set_ylabel('Frequency (no. pairs)');
In [85]:
def plot_dendrogram(dist, cut_height=1e3, yscale='log', ylim=(10, 1e6), linkage_method='average',
n_clusters=14):
"""This function plots a dendrogram using scipy and provides some utilities for annotating clusters."""
z = scipy.cluster.hierarchy.linkage(dist, method=linkage_method)
fig = plt.figure(figsize=(16, 8), )
gs = mpl.gridspec.GridSpec(nrows=3, ncols=1, height_ratios=[6, .5, 4], hspace=0)
ax = fig.add_subplot(gs[0])
sns.despine(ax=ax, offset=3, bottom=True, top=False)
r = scipy.cluster.hierarchy.dendrogram(
z, no_labels=True, count_sort=True,
color_threshold=0,
above_threshold_color='k',
ax=ax)
ax.set_ylim(*ylim)
ax.set_yscale(yscale)
# ax.set_ylim(bottom=-1000)
xmin, xmax = ax.xaxis.get_data_interval()
xticklabels = np.array(list(range(0, len(df_haplotypes), 200)) + [len(df_haplotypes)])
xticks = xticklabels / len(df_haplotypes)
xticks = (xticks * (xmax - xmin)) + xmin
ax.set_xticks(xticks)
ax.set_xticklabels(xticklabels)
ax.set_xlabel('Haplotypes')
ax.xaxis.set_label_position('top')
ax.set_ylabel('$\hat{t}$', rotation=0, ha='right')
cluster_palette = sns.color_palette('Set3', n_colors=12)
if cut_height:
ax.axhline(cut_height, linestyle='--', color='k')
# find clusters
f = scipy.cluster.hierarchy.fcluster(z, cut_height, criterion='distance')
# compute cluster sizes
fsz = np.bincount(f)
# sort largest first
fsort = np.argsort(fsz)[::-1]
# take largest n
fsort = fsort[:n_clusters]
# get haplotype indices for each cluster
clusters = [set(np.nonzero(f == i)[0]) for i in fsort]
clusters_leaves = [sorted([r['leaves'].index(i) for i in cluster])
for cluster in clusters]
ixs = np.argsort([min(cl) for cl in clusters_leaves])
clusters = [clusters[i] for i in ixs]
clusters_leaves = [clusters_leaves[i] for i in ixs]
for i, cluster_leaves in enumerate(clusters_leaves):
color = cluster_palette[i % len(cluster_palette)]
x1, x2 = min(cluster_leaves), max(cluster_leaves)
# ax.axvline(x1*10, color='k', linestyle='--')
# ax.axvline(x2*10, color='k', linestyle='--')
ax.fill_between([x1*10, x2*10], 0, cut_height, color=color, alpha=.4, zorder=20)
# ax.axvspan(x1*10, x2*10, color=color, zorder=-20, alpha=.5)
ax.text((x1*10 + x2*10) / 2, ylim[0], str(i), ha='center', va='top')
ax = fig.add_subplot(gs[1], )
sns.despine(ax=ax, left=True, bottom=True)
pops = df_haplotypes.population[r['leaves']]
pop_colors = [phase1_ar3.pop_colors[p] for p in pops]
ax.broken_barh([(i, 1) for i in range(len(df_haplotypes))], yrange=(0, 1), color=pop_colors)
ax.set_xlim(0, len(df_haplotypes))
ax.set_yticks([])
ax.set_ylabel('Population', rotation=0, ha='right', va='center')
ax.set_xticks([])
if cut_height:
for i, cluster_leaves in enumerate(clusters_leaves):
color = cluster_palette[i % len(cluster_palette)]
x1, x2 = min(cluster_leaves), max(cluster_leaves)
ax.axvline(x1, color='k', linestyle='--', zorder=20)
ax.axvline(x2, color='k', linestyle='--', zorder=20)
ax = fig.add_subplot(gs[2])
hapclust.plot_haplotypes(ax, haps_vgsc_missense[:, r['leaves']], lbl_vgsc_missense)
ax.set_xlim(0, len(df_haplotypes))
ax.set_xlabel('Haplotypes')
if cut_height:
for i, cluster_leaves in enumerate(clusters_leaves):
color = cluster_palette[i % len(cluster_palette)]
x1, x2 = min(cluster_leaves), max(cluster_leaves)
ax.axvline(x1, color='k', linestyle='--', zorder=20)
ax.axvline(x2, color='k', linestyle='--', zorder=20)
ax.axvspan(x1, x2, color=color, zorder=20, alpha=.4)
gs.tight_layout(fig, h_pad=0)
return clusters
In [86]:
plot_dendrogram(pt_hat_right);
In [87]:
plot_dendrogram(pt_hat_left);
In [88]:
# see what it looks like on a linear scale
plot_dendrogram(pt_hat_both, yscale='linear', ylim=(-1000, 4e5));
In [89]:
# canonical dendrogram we'll use for cluster analyses
clusters = plot_dendrogram(pt_hat_both, cut_height=2e3)
In [90]:
def plot_cladogram(dist, yscale='symlog', yscale_kws=None, ylim=(10, 1e6), count_sort=True,
linkage_method='average', n_clusters=14, fill_threshold=0, leaf_height=0, linewidth=2):
# perform hierarchical clustering
z = scipy.cluster.hierarchy.linkage(dist, method=linkage_method)
# needed for getting leaves in right order
r = scipy.cluster.hierarchy.dendrogram(
z, count_sort=count_sort, no_plot=True)
# setup figure
fig = plt.figure(figsize=(16, 8), )
gs = mpl.gridspec.GridSpec(nrows=2, ncols=1, height_ratios=[6, 4], hspace=0)
# plot cladogram
ax = fig.add_subplot(gs[0])
sns.despine(ax=ax, offset=3, bottom=True, top=False)
colors = [phase1_ar3.pop_colors[p] for p in df_haplotypes.population]
hapclust.cladogram(z, count_sort=True, colors=colors,
fill_threshold=fill_threshold, leaf_height=leaf_height,
plot_kws=dict(linewidth=linewidth),
fill_kws=dict(linewidth=linewidth),
ax=ax)
ax.set_ylim(*ylim)
if yscale_kws is None:
yscale_kws = dict()
ax.set_yscale(yscale, **yscale_kws)
# ax.set_ylim(bottom=-1000)
xmin, xmax = ax.xaxis.get_data_interval()
xticklabels = np.array(list(range(0, len(r['leaves']), 200)) + [len(r['leaves'])])
xticks = xticklabels / len(r['leaves'])
xticks = (xticks * (xmax - xmin)) + xmin
ax.set_xticks(xticks)
ax.set_xticklabels(xticklabels)
ax.set_xlabel('Haplotypes')
ax.xaxis.set_label_position('top')
ax.set_ylabel('$\hat{t}$', rotation=0, ha='right')
ax.grid(axis='y', linestyle='--', zorder=-1e9)
# plot display haplotypes
ax = fig.add_subplot(gs[1])
hapclust.plot_haplotypes(ax, haps_vgsc_missense[:, r['leaves']], lbl_vgsc_missense)
ax.set_xlim(0, len(r['leaves']))
ax.set_xlabel('Haplotypes')
gs.tight_layout(fig, h_pad=0)
In [91]:
plot_cladogram(pt_hat_both, fill_threshold=0, linewidth=.5, leaf_height=10,
ylim=(-10, 1e6), yscale='symlog', yscale_kws=dict(linthreshy=100, linscaley=1.0, subsy=list(range(1, 10))))
In [92]:
# option to fill clades
plot_cladogram(pt_hat_both, fill_threshold=2e3, linewidth=.5, leaf_height=10,
ylim=(-10, 1e6), yscale='symlog', yscale_kws=dict(linthreshy=100, linscaley=1.0, subsy=list(range(1, 10))))
In [93]:
haps_vgsc_missense
Out[93]:
In [94]:
[len(c) for c in clusters]
Out[94]:
In [95]:
# pick these out of the clusters we made earlier
f1 = clusters[4]
f2 = clusters[8]
f3 = clusters[3]
f4 = clusters[2]
f5 = clusters[9]
s1 = clusters[12]
s2 = clusters[10]
s3 = clusters[13]
s4 = clusters[7]
l1 = clusters[6]
In [96]:
# rhg = "resistance haplogroups"
rhg = [f1, f2, f3, f4, f5, s1, s2, s3, s4, l1]
rhg_labels = 'F1 F2 F3 F4 F5 S1 S2 S3 S4 L1'.split()
In [97]:
for g, l in zip(rhg, rhg_labels):
print(l)
print(collections.Counter(df_haplotypes.population[sorted(g)]).most_common())
In [98]:
# compare F haplogroups
fig, ax = plt.subplots()
for g, l, cl in zip(rhg[:5], rhg_labels[:5], palette):
ixs = allel.condensed_coords_within(g, len(df_haplotypes))
x = np.log10(pt_hat_both[ixs])
sns.distplot(x, bins=np.linspace(1, 3, 20), label=l, hist=False,
kde_kws=dict(linewidth=2, color=cl))
ax.axvline(np.median(x), color=cl, linestyle='--', lw=2)
ax.set_ylabel('Density')
ax.set_xlim(1, 4)
ticks = [1, 2, 3, 4]
ax.set_xticks(ticks)
ax.set_xticklabels(['$10^{%s}$' % t for t in ticks])
ax.set_xlabel('$\hat{t}$ (generations)');
In [99]:
# compare S haplogroups
fig, ax = plt.subplots()
for g, l, cl in zip(rhg[5:], rhg_labels[5:], palette):
ixs = allel.condensed_coords_within(g, len(df_haplotypes))
x = np.log10(pt_hat_both[ixs])
sns.distplot(x, bins=np.linspace(1, 3, 20), label=l, hist=False,
kde_kws=dict(linewidth=2, color=cl))
ax.axvline(np.median(x), color=cl, linestyle='--', lw=2)
ax.set_ylabel('Density')
ax.set_xlim(1, 4)
ticks = [1, 2, 3, 4]
ax.set_xticks(ticks)
ax.set_xticklabels(['$10^{%s}$' % t for t in ticks])
ax.set_xlabel('$\hat{t}$ (generations)');
In [100]:
def bootstrap(x, n, f):
dtype = np.array(f(x)).dtype
out = np.zeros(n, dtype=dtype)
for i in range(n):
ix = np.random.choice(x.shape[0], size=x.shape[0], replace=True)
out[i] = f(x[ix])
return out
In [101]:
def distplot_cluster_pop_t_hat(cluster, title):
cluster = list(cluster)
pops_clst = df_haplotypes.population[cluster]
pops_unique = pops_clst.unique()
pops_subclusters = [np.array(cluster)[(pops_clst == p).values] for p in pops_unique]
fig, ax = plt.subplots()
for g, l in zip(pops_subclusters, pops_unique):
ixs = allel.condensed_coords_within(g, len(df_haplotypes))
x = np.log10(pt_hat_both[ixs])
color = phase1_ar3.pop_colors[l]
md = np.median(x)
bs = bootstrap(x, 1000, np.median)
lbl = '%s (%.0f; 95%% CI [%.0f, %.0f])' % (l, 10**md,
10**np.percentile(bs, 2.5),
10**np.percentile(bs, 97.5))
sns.distplot(x, bins=np.linspace(1, 3, 20), label=lbl, hist=False,
kde_kws=dict(linewidth=2, color=color, label=lbl))
ax.axvspan(np.percentile(bs, 2.5), np.percentile(bs, 97.5),
color=color, alpha=.3)
ax.axvline(md, color=color, lw=2, linestyle='--')
ax.set_xlabel('$\hat{t}$')
ax.set_ylabel('Density')
ax.set_xlim(1, 4)
ticks = [1, 2, 3, 4]
ax.set_xticks(ticks)
ax.set_xticklabels(['$10^{%s}$' % t for t in ticks])
ax.set_xlabel('$\hat{t}$ (generations)')
ax.set_title(title)
ax.legend(bbox_to_anchor=(1, 1), loc='upper right')
fig.tight_layout()
In [102]:
distplot_cluster_pop_t_hat(f1, 'Haplogroup F1')
In [103]:
distplot_cluster_pop_t_hat(f2, 'Haplogroup F2')
In [104]:
distplot_cluster_pop_t_hat(f3, 'Haplogroup F3')
In [105]:
distplot_cluster_pop_t_hat(f4, 'Haplogroup F4')
In [106]:
distplot_cluster_pop_t_hat(f5, 'Haplogroup F5')
In [107]:
distplot_cluster_pop_t_hat(s1, 'Haplogroup S1')
In [108]:
distplot_cluster_pop_t_hat(s2, 'Haplogroup S2')
In [109]:
distplot_cluster_pop_t_hat(s3, 'Haplogroup S3')
In [110]:
distplot_cluster_pop_t_hat(s4, 'Haplogroup S4')
In [111]:
distplot_cluster_pop_t_hat(l1, 'Haplogroup L1')
In [112]:
for i, l in enumerate(lbl_vgsc_missense):
print(i, l)
In [113]:
cluster = sorted(f1)
haps_missense_clst = haps_vgsc_missense[:, cluster]
haps_missense_clst
Out[113]:
In [114]:
np.count_nonzero(haps_missense_clst[2, :] == 1)
Out[114]:
In [115]:
# compare mutations
ix_muts = [2, 7, 8, 9, 10, 11, 12, 14, 15]
pops_subclusters = [np.array(cluster)[(haps_missense_clst[i, :] == 1)] for i in ix_muts]
# add in "vanilla" haplotypes with only kdr
pops_subclusters.append(np.array(cluster)[np.all(haps_missense_clst[ix_muts, :] == 0, axis=0)])
lbls = ['F1+%s' % lbl_vgsc_missense[i] for i in ix_muts]
lbls.append('F1')
fig, ax = plt.subplots()
palette = sns.color_palette('hls', n_colors=len(ix_muts)+1)
for g, l, color in zip(pops_subclusters, lbls, palette):
if len(g) > 0:
log(l, len(g))
ixs = allel.condensed_coords_within(g, len(df_haplotypes))
x = np.log10(pt_hat_both[ixs])
md = np.median(x)
bs = bootstrap(x, 1000, np.median)
lbl = '%s (%.0f; 95%% CI [%.0f, %.0f])' % (l, 10**md,
10**np.percentile(bs, 2.5),
10**np.percentile(bs, 97.5))
sns.distplot(x, bins=np.linspace(1, 3, 20), label=lbl, hist=False,
kde_kws=dict(linewidth=2, color=color, label=lbl))
# ax.axvspan(np.percentile(bs, 2.5), np.percentile(bs, 97.5),
# color=color, alpha=.3)
ax.axvline(md, color=color, lw=2, linestyle='--')
ax.set_xlabel('$\hat{t}$')
ax.set_ylabel('Density')
ax.set_xlim(1, 3.5)
ticks = [1, 2, 3]
ax.set_xticks(ticks)
ax.set_xticklabels(['$10^{%s}$' % t for t in ticks])
ax.set_xlabel('$\hat{t}$ (generations)')
ax.legend(bbox_to_anchor=(1, 1), loc='upper right')
fig.tight_layout()
In [116]:
def distplot_cluster_pop_between_t_hat(cluster, pop1, pop2, title):
cluster = list(cluster)
pops_clst = df_haplotypes.population[cluster]
pops_subclusters = [np.array(cluster)[(pops_clst == p).values] for p in [pop1, pop2]]
fig, ax = plt.subplots()
for g, l in zip(pops_subclusters, [pop1, pop2]):
ixs = allel.condensed_coords_within(g, len(df_haplotypes))
x = np.log10(pt_hat_both[ixs])
color = phase1_ar3.pop_colors[l]
md = np.median(x)
bs = bootstrap(x, 1000, np.median)
lbl = '%s (%.0f; 95%% CI [%.0f, %.0f])' % (l, 10**md,
10**np.percentile(bs, 2.5),
10**np.percentile(bs, 97.5))
sns.distplot(x, bins=np.linspace(1, 3, 20), label=lbl, hist=False,
kde_kws=dict(linewidth=2, color=color, label=lbl))
ax.axvspan(np.percentile(bs, 2.5), np.percentile(bs, 97.5),
color=color, alpha=.3)
ax.axvline(md, color=color, lw=2, linestyle='--')
ixs = allel.condensed_coords_between(pops_subclusters[0], pops_subclusters[1], len(df_haplotypes))
x = np.log10(pt_hat_both[ixs])
color = 'k'
md = np.median(x)
bs = bootstrap(x, 1000, np.median)
lbl = '%s vs %s (%.0f; 95%% CI [%.0f, %.0f])' % (pop1, pop2, 10**md,
10**np.percentile(bs, 2.5),
10**np.percentile(bs, 97.5))
sns.distplot(x, bins=np.linspace(1, 3, 20), label=lbl, hist=False,
kde_kws=dict(linewidth=2, color=color, label=lbl))
ax.axvspan(np.percentile(bs, 2.5), np.percentile(bs, 97.5),
color=color, alpha=.3)
ax.axvline(md, color=color, lw=2, linestyle='--')
ax.set_xlabel('$\hat{t}$')
ax.set_ylabel('Density')
ax.set_xlim(1, 4)
ticks = [1, 2, 3, 4]
ax.set_xticks(ticks)
ax.set_xticklabels(['$10^{%s}$' % t for t in ticks])
ax.set_xlabel('$\hat{t}$ (generations)')
ax.set_title(title)
ax.legend(bbox_to_anchor=(1, 1), loc='upper right')
fig.tight_layout()
In [117]:
distplot_cluster_pop_between_t_hat(f1, 'BFS', 'BFM', 'Haplogroup F1')
In [118]:
distplot_cluster_pop_between_t_hat(f1, 'BFS', 'CMS', 'Haplogroup F1')
In [119]:
distplot_cluster_pop_between_t_hat(f1, 'BFS', 'AOM', 'Haplogroup F1')
In [120]:
distplot_cluster_pop_between_t_hat(f1, 'BFM', 'AOM', 'Haplogroup F1')
In [121]:
distplot_cluster_pop_between_t_hat(f1, 'CMS', 'AOM', 'Haplogroup F1')
In [122]:
distplot_cluster_pop_between_t_hat(f4, 'CMS', 'GAS', 'Haplogroup F4')
In [123]:
distplot_cluster_pop_between_t_hat(f5, 'CMS', 'GAS', 'Haplogroup F5')
In [124]:
distplot_cluster_pop_between_t_hat(s2, 'CMS', 'GAS', 'Haplogroup S2')
In [125]:
distplot_cluster_pop_between_t_hat(s3, 'UGS', 'KES', 'Haplogroup S3')
TODO: break down F1 by population and mutation
In [ ]: