This notebook demonstrates some functions from the hapclust package for analysing haplotype structure using the estimator of allele age from Mathieson and McVean.

Setup

Setup callsets


In [1]:
%run setup.ipynb
%matplotlib notebook
# %reload_ext autoreload
# %autoreload 1
# %aimport hapclust
import hapclust



In [2]:
# obtain data from unphased callset - only needed for variant annotations
callset_pass = phase1_ar31.callset_pass
pos_pass = allel.SortedIndex(callset_pass['2L/variants/POS'])
ann_pass = callset_pass['2L/variants/ANN'][:][['Annotation', 'HGVS_p']]

In [3]:
# setup haplotype data
callset_phased = phase1_ar31.callset_phased
genotypes_phased = allel.GenotypeDaskArray(callset_phased['2L/calldata/genotype'])
pos_phased = allel.SortedIndex(callset_phased['2L/variants/POS'])

In [4]:
pos_kdr_s = 2422651
pos_kdr_f = 2422652

In [5]:
# define region we're going to analyse
loc_region = pos_phased.locate_range(0, 4000000)
pos_phased_region = pos_phased[loc_region]
pos_phased_region


Out[5]:
<SortedIndex shape=(163963,) dtype=int32>
01234...163958163959163960163961163962
4468844691447324473644756...39973723997373399737839973813997386

In [6]:
# locate the intersection with unphased callset - needed to tie in annotations
loc1, _ = pos_pass.locate_intersection(pos_phased_region)
np.count_nonzero(loc1)


Out[6]:
163963

In [7]:
ann_phased_region = ann_pass[loc1]
ann_phased_region


Out[7]:
array([(b'intergenic_region', b'.'), (b'intergenic_region', b'.'),
       (b'intergenic_region', b'.'), ...,
       (b'downstream_gene_variant', b'.'),
       (b'downstream_gene_variant', b'.'),
       (b'downstream_gene_variant', b'.')], 
      dtype=[('Annotation', 'S34'), ('HGVS_p', 'S14')])

In [8]:
collections.Counter(ann_phased_region['Annotation'])


Out[8]:
Counter({b'3_prime_UTR_variant': 2941,
         b'5_prime_UTR_premature_start_codon_': 306,
         b'5_prime_UTR_variant': 1677,
         b'downstream_gene_variant': 18539,
         b'initiator_codon_variant': 3,
         b'intergenic_region': 54849,
         b'intragenic_variant': 48,
         b'intron_variant': 32362,
         b'missense_variant': 5805,
         b'missense_variant&splice_region_var': 70,
         b'splice_acceptor_variant&intron_var': 24,
         b'splice_donor_variant&intron_varian': 27,
         b'splice_region_variant': 36,
         b'splice_region_variant&intron_varia': 649,
         b'splice_region_variant&stop_retaine': 5,
         b'splice_region_variant&synonymous_v': 87,
         b'start_lost': 9,
         b'stop_gained': 37,
         b'stop_lost&splice_region_variant': 4,
         b'stop_retained_variant': 5,
         b'synonymous_variant': 8636,
         b'upstream_gene_variant': 37844})

In [9]:
# exclude cross parents
haps_phased_region = genotypes_phased[loc_region].to_haplotypes()[:, :-16].compute()

In [10]:
# perform allele count - needed to locate singletons
ac_phased_region = haps_phased_region.count_alleles(max_allele=1)

Setup data on missense variation


In [11]:
# convenience, define the Vgsc gene region
region_vgsc = SeqFeature('2L', 2358158, 2431617, label='Vgsc')
region_vgsc


Out[11]:
<SeqFeature 'Vgsc' 2L:2358158-2431617>

In [12]:
loc_vgsc = pos_phased_region.locate_range(region_vgsc.start, region_vgsc.end)
loc_vgsc


Out[12]:
slice(24471, 26181, None)

In [13]:
haps_vgsc = haps_phased_region[loc_vgsc]

In [14]:
ac_vgsc = haps_vgsc.count_alleles(max_allele=1)

In [15]:
ann_vgsc = ann_phased_region[loc_vgsc]

In [16]:
loc_vgsc_missense = (ann_vgsc['Annotation'] == b'missense_variant') & (ac_vgsc[:, 1] > 7)
np.count_nonzero(loc_vgsc_missense)


Out[16]:
16

In [17]:
haps_vgsc_missense = haps_vgsc[loc_vgsc_missense]

In [18]:
lbl_vgsc_missense = [l[2:] for l in ann_vgsc[loc_vgsc_missense]['HGVS_p'].astype('U')]
lbl_vgsc_missense


Out[18]:
['Arg254Lys',
 'Asp466His',
 'Thr791Met',
 'Leu995Ser',
 'Leu995Phe',
 'Ala1125Val',
 'Ile1527Thr',
 'Glu1597Gly',
 'Ala1746Ser',
 'Val1853Ile',
 'Ile1868Thr',
 'Pro1874Ser',
 'Pro1874Leu',
 'Phe1920Ser',
 'Ala1934Val',
 'Ile1940Thr']

Split up haplotype data

Here we divide haplotype data up into two, with an "EHH" set containing no singletons and only neutral variants, which we'll use for analysis of haplotype sharing, and a "mut" set containing singletons and putatively non-neutral variants that we will use for analysis of mutations on shared haplotypes.


In [19]:
# define types of variants to include in EHH analysis - should be mostly neutral
loc_type_neutral = ((ann_phased_region['Annotation'] == b'intergenic_region') | 
                    (ann_phased_region['Annotation'] == b'intron_variant') |
                    (ann_phased_region['Annotation'] == b'downstream_gene_variant') |
                    (ann_phased_region['Annotation'] == b'upstream_gene_variant') |
                    (ann_phased_region['Annotation'] == b'synonymous_variant') |
                    (ann_phased_region['Annotation'] == b'3_prime_UTR_variant') |
                    (ann_phased_region['Annotation'] == b'5_prime_UTR_variant') 
                    )
np.count_nonzero(loc_type_neutral), loc_type_neutral.shape


Out[19]:
(156848, (163963,))

In [20]:
# locate singletons - will exclude from EHH analysis
loc_sgl = ac_phased_region.min(axis=1) == 1
loc_nosgl = ac_phased_region.min(axis=1) > 1
np.count_nonzero(loc_sgl), np.count_nonzero(loc_nosgl), loc_nosgl.shape


Out[20]:
(52221, 111611, (163963,))

In [21]:
# these are the variants to use for EHH
loc_ehh = loc_type_neutral & loc_nosgl
np.count_nonzero(loc_ehh), loc_ehh.shape


Out[21]:
(107531, (163963,))

In [22]:
# these are the variants to use for mutational distance
#loc_mut = loc_sgl
# include non-neutral mutations
loc_mut = loc_sgl | ~loc_type_neutral
np.count_nonzero(loc_mut), loc_mut.shape


Out[22]:
(56311, (163963,))

In [23]:
haps_mut = haps_phased_region[loc_mut]

In [24]:
haps_ehh = haps_phased_region[loc_ehh]

In [25]:
pos_ehh = pos_phased_region[loc_ehh]

In [26]:
pos_mut = pos_phased_region[loc_mut]

In [27]:
pos_mut.locate_key(pos_kdr_s)


Out[27]:
10210

In [28]:
pos_mut.locate_key(pos_kdr_f)


Out[28]:
10211

In [29]:
# read in haplotype metadata to get population
df_haplotypes = phase1_ar31.df_haplotypes
df_haplotypes = df_haplotypes[df_haplotypes.population != 'colony']
df_haplotypes.head()


Out[29]:
label ox_code population label_aug country region sex m_s kt_2la kt_2rb
index
0 AB0085-Ca AB0085-C BFS AB0085-Ca [Burkina Faso, Pala, S, F] Burkina Faso Pala F S 2.0 2.0
1 AB0085-Cb AB0085-C BFS AB0085-Cb [Burkina Faso, Pala, S, F] Burkina Faso Pala F S 2.0 2.0
2 AB0087-Ca AB0087-C BFM AB0087-Ca [Burkina Faso, Bana, M, F] Burkina Faso Bana F M 2.0 1.0
3 AB0087-Cb AB0087-C BFM AB0087-Cb [Burkina Faso, Bana, M, F] Burkina Faso Bana F M 2.0 1.0
4 AB0088-Ca AB0088-C BFM AB0088-Ca [Burkina Faso, Bana, M, F] Burkina Faso Bana F M 2.0 0.0

Split flanks

Split haplotype data into left and right flanks on the kdr position.


In [30]:
core_pos = pos_kdr_f

In [31]:
# split the EHH dataset
dist_ehh_right, dist_ehh_left, haps_ehh_right, haps_ehh_left = hapclust.split_flanks(
    haps_ehh, pos_ehh, core_pos
)

In [32]:
# this gives the distance from the core position to each downstream variant, moving away from the core
dist_ehh_right


Out[32]:
array([     45,     255,     258, ..., 1574721, 1574726, 1574734], dtype=int32)

In [33]:
# this gives the distance from the core position to each upstream variant, moving away from the core
dist_ehh_left


Out[33]:
array([    108,     250,     270, ..., 2377896, 2377920, 2377961], dtype=int32)

In [34]:
dist_ehh_right.shape, dist_ehh_left.shape


Out[34]:
((91777,), (15754,))

In [35]:
dist_ehh_right.min(), dist_ehh_left.min()


Out[35]:
(45, 108)

In [36]:
# split the mutations dataset
dist_mut_right, dist_mut_left, haps_mut_right, haps_mut_left = hapclust.split_flanks(
    haps_mut, pos_mut, core_pos
)

In [37]:
dist_mut_right


Out[37]:
array([      0,      82,     353, ..., 1574719, 1574720, 1574729], dtype=int32)

In [38]:
dist_mut_left


Out[38]:
array([      1,     166,     505, ..., 2377765, 2377916, 2377964], dtype=int32)

In [39]:
dist_mut_right.shape, dist_mut_left.shape


Out[39]:
((46100,), (10211,))

In [40]:
dist_mut_right.min(), dist_mut_left.min()


Out[40]:
(0, 1)

In [41]:
haps_ehh_left.shape, haps_ehh_right.shape, haps_mut_left.shape, haps_mut_right.shape


Out[41]:
((15754, 1530), (91777, 1530), (10211, 1530), (46100, 1530))

In [42]:
is_accessible = phase1_ar3.accessibility['2L/is_accessible'][:]

Analyse maximal haplotype sharing ("nearest neighbour" analysis based on prefix sorting)

Below "nspl" stands for neighbour shared prefix length (number of SNPs), "nspd" stands for neighbour shared physical distance (bp).


In [43]:
idx_sorted_right, nspl_right, nspd_right, muts_right = hapclust.neighbour_haplotype_sharing(
    haps_ehh_right, haps_mut_right, dist_ehh_right, dist_mut_right
)

In [44]:
idx_sorted_left, nspl_left, nspd_left, muts_left = hapclust.neighbour_haplotype_sharing(
    haps_ehh_left, haps_mut_left, dist_ehh_left, dist_mut_left
)

In [45]:
nspl_right.min(), nspl_right.max()


Out[45]:
(0, 76267)

In [46]:
nspl_left.min(), nspl_left.max()


Out[46]:
(0, 15754)

In [47]:
nspd_right.min(), nspd_right.max()


Out[47]:
(45, 1437047)

In [48]:
nspd_left.min(), nspd_left.max()


Out[48]:
(108, 2377961)

In [49]:
muts_right.min(), muts_right.max()


Out[49]:
(0, 14)

In [50]:
muts_left.min(), muts_left.max()


Out[50]:
(0, 23)

In [51]:
# compute accessible lengths - needed for mutation analysis
nspd_right_accessible = hapclust.haplotype_accessible_length(nspd_right, core_pos=core_pos, is_accessible=is_accessible, flank='right')
nspd_left_accessible = hapclust.haplotype_accessible_length(nspd_left, core_pos=core_pos, is_accessible=is_accessible, flank='left')

In [52]:
# get some diagnostics on accessibility on left versus right flanks
fig, ax = plt.subplots(figsize=(5, 5))
x = nspd_right
y = nspd_right_accessible
ax.plot(x, y, linestyle=' ', marker='o', markersize=4, label='Right flank')
x = nspd_left
y = nspd_left_accessible
ax.plot(x, y, linestyle=' ', marker='o', markersize=4, label='Left flank')
ax.set_xlabel('Physical length')
ax.set_ylabel('Accessible length')
lim = 0, x.max()
ax.set_xlim(*lim)
ax.set_ylim(*lim)
ax.plot(lim, lim, 'k--')
ax.legend()
ax.set_title('Neighbour shared haplotype length (bp)')
fig.tight_layout();



In [53]:
# 1 cM/Mb convert to M/bp
1 / (1e2 * 1e6)


Out[53]:
1e-08

In [54]:
# assume constant recombination rate
rr_right = 1.5e-8
# adjust recombination rate on left flank (factor derived from pairwise analysis below)
rr_left = rr_right * 0.37

# assumed mutation rate
mu_right = 3.5e-9
# adjust mutation rate on left flank (factor derived from pairwise analysis below)
mu_left = mu_right * 0.61

In [55]:
pops_right = df_haplotypes.population[idx_sorted_right]
pop_colors_right = [phase1_ar3.pop_colors[p] for p in pops_right]

fig = plt.figure(figsize=(14, 6))
hapclust.fig_neighbour_haplotype_sharing(nspd=nspd_right, 
                                         nspd_accessible=nspd_right_accessible,
                                         muts=muts_right, 
                                         haps_display=haps_vgsc_missense[:, idx_sorted_right],
                                         haps_display_vlbl=lbl_vgsc_missense,
                                         pop_colors=pop_colors_right,
                                         nspd_cut=2e4,
                                         nspd_ylim=(1e3, 3e6),
                                         that_ylim=(2e1, 5e5),
                                         muts_ylim=(0, 25),
                                         mu=mu_right, rr=rr_right,
                                         fig=fig)
fig.suptitle('Right flank');



In [56]:
pops_left = df_haplotypes.population[idx_sorted_left]
pop_colors_left = [phase1_ar3.pop_colors[p] for p in pops_left]

fig = plt.figure(figsize=(14, 6))
hapclust.fig_neighbour_haplotype_sharing(nspd=nspd_left, 
                                         muts=muts_left, 
                                         nspd_accessible=nspd_left_accessible,
                                         haps_display=haps_vgsc_missense[:, idx_sorted_left],
                                         haps_display_vlbl=lbl_vgsc_missense,
                                         pop_colors=pop_colors_left,
                                         nspd_cut=2e4,
                                         nspd_ylim=(1e3, 3e6),
                                         that_ylim=(2e1, 5e5),
                                         muts_ylim=(0, 25),
                                         mu=mu_left, rr=rr_left,
                                         fig=fig)
fig.suptitle('Left flank');



In [57]:
# get some diagnostics on estimates of t hat on left versus right flanks

fig = plt.figure()
ax = fig.add_subplot(111)
x = nspd_left
t_hat = (1 + muts_left) / (2 * (nspd_left * rr_left + nspd_left_accessible * mu_left))
ax.hist(t_hat[(nspd_left > 0)], bins=np.logspace(1, 6, 50), histtype='step', lw=2, label='Upstream', color='b')
ax.axvline(np.median(t_hat), linestyle='--', color='b', lw=2)
t_hat = (1 + muts_right) / (2 * (nspd_right * rr_right + nspd_right_accessible * mu_right))
ax.hist(t_hat[(nspd_right > 0)], bins=np.logspace(1, 6, 50), histtype='step', lw=2, label='Downstream', color='g')
ax.axvline(np.median(t_hat), linestyle='--', color='g', lw=2)
ax.set_xscale('log')
ax.set_xlabel('$\hat{t}$')
ax.set_ylabel('Frequency (no. pairs)')
ax.legend();


Analyse pair-wise haplotype sharing

Here "pspl" stands for pairwise shared prefix length (number of SNPs), "pspd" stands for pairwise shared physical distance (bp).

Setup, diagnostics and tuning


In [58]:
pspl_right, pspd_right, pmuts_right = hapclust.pairwise_haplotype_sharing(
    haps_ehh_right, haps_mut_right, dist_ehh_right, dist_mut_right, jitter=False)

In [59]:
pspl_left, pspd_left, pmuts_left = hapclust.pairwise_haplotype_sharing(
    haps_ehh_left, haps_mut_left, dist_ehh_left, dist_mut_left, jitter=False)

In [60]:
pspl_right.shape, pspl_left.shape


Out[60]:
((1169685,), (1169685,))

In [61]:
pspd_right.min(), pspd_right.max()


Out[61]:
(45, 1437047)

In [62]:
pspd_left.min(), pspd_left.max()


Out[62]:
(108, 2377961)

In [63]:
pspd_right_accessible = hapclust.haplotype_accessible_length(pspd_right, core_pos=core_pos, is_accessible=is_accessible, flank='right')
pspd_left_accessible = hapclust.haplotype_accessible_length(pspd_left, core_pos=core_pos, is_accessible=is_accessible, flank='left')

In [64]:
# check again accessibility

fig, ax = plt.subplots(figsize=(5, 5))
x = pspd_right
y = pspd_right_accessible
ax.plot(x, y, linestyle=' ', marker='o', markersize=4, label='Right flank')
x = pspd_left
y = pspd_left_accessible
ax.plot(x, y, linestyle=' ', marker='o', markersize=4, label='Left flank')
ax.set_xlabel('Physical length')
ax.set_ylabel('Accessible length')
lim = 0, x.max()
ax.set_xlim(*lim)
ax.set_ylim(*lim)
ax.plot(lim, lim, 'k--')
ax.legend()
ax.set_title('Pairwise shared haplotype length')
fig.tight_layout();


Check recombination rate on left versus right flank.


In [65]:
palette = sns.color_palette()
sns.palplot(palette);



In [66]:
# compare shared haplotype length on left versus right flanks

x = pspd_left
y = pspd_right

fig, ax = plt.subplots(figsize=(7, 7))
ax.plot(x, y, marker='o', markersize=1, mfc='none', mec='k', linestyle=' ', alpha=.1)
lim = 1e1, 1e7

df = pandas.DataFrame({'x': x, 'y': y})
# linear regression
r = sfa.ols('y ~ x + 0', data=df).fit()
# plot the regression line
ax.plot(lim, [l * r.params['x'] for l in lim], linestyle='--', color=palette[2], lw=2)
# # summarise the data in bins to provide further visuals 
# sns.regplot(x, y, ax=ax, n_boot=10, x_estimator=np.median, x_bins=np.logspace(np.log10(lim[0]), np.log10(lim[1]), 20), fit_reg=False, 
#             scatter_kws=dict(alpha=1, zorder=20, facecolor=palette[2], edgecolor='w', linewidth=1, s=100));

ax.set_xlim(*lim)
ax.set_ylim(*lim)
ax.plot(lim, lim, 'k--')
ax.set_xlabel('Left flank')
ax.set_ylabel('Right flank')
ax.set_title('Shared haplotype length (bp)')
ax.set_xscale('log')
ax.set_yscale('log')
r.params


Out[66]:
x    0.369611
dtype: float64

In [67]:
# compare shared haplotype length after applying adjustment for different 
# recombination rates on left and right flanks

x = pspd_left * rr_left
y = pspd_right * rr_right

fig, ax = plt.subplots(figsize=(7, 7))
ax.plot(x, y, marker='o', markersize=1, mfc='none', mec='k', linestyle=' ', alpha=.1)
lim = 1e-7, 1e-1

df = pandas.DataFrame({'x': x, 'y': y})
r = sfa.ols('y ~ x + 0', data=df).fit()
ax.plot(lim, [l * r.params['x'] for l in lim], linestyle='--', color='r', lw=2)
# sns.regplot(x, y, ax=ax, n_boot=10, x_estimator=np.median, x_bins=np.logspace(np.log10(lim[0]), np.log10(lim[1]), 20), fit_reg=False, 
#             scatter_kws=dict(alpha=1, zorder=20, facecolor='r', edgecolor='w', linewidth=1, s=60));

ax.set_xlim(*lim)
ax.set_ylim(*lim)
ax.plot(lim, lim, 'k--')
ax.set_xlabel('Left flank')
ax.set_ylabel('Right flank')
ax.set_title('Shared haplotype length (Morgan)')
ax.set_xscale('log')
ax.set_yscale('log')
r.params


Out[67]:
x    0.99895
dtype: float64

In [68]:
# check where the heterochromatin boundary is supposed to be
phase1_ar3.tbl_chromatin


Out[68]:
0|name 1|chrom 2|start 3|stop
CHX X 20009764 24393108
CH2R 2R 58984778 61545105
CH2L 2L 1 2431617
PEU2L 2L 2487770 5042389
IH2L 2L 5078962 5788875

...


In [69]:
# check mutation rate on left versus right flank.

x = pmuts_left / pspd_left_accessible
y = pmuts_right / pspd_right_accessible
tst = (x > 0) & (y > 0)
x = x[tst]
y = y[tst]

lim = 1e-6, 1e-1
fig, ax = plt.subplots(figsize=(7, 7))
sns.despine(ax=ax, offset=5)
ax.plot(x, y, marker='o', markersize=2, mfc='none', mec='k', linestyle=' ')

df = pandas.DataFrame({'x': x, 'y': y})
r = sfa.ols('y ~ x + 0', data=df).fit()
ax.plot(lim, [l * r.params['x'] for l in lim], linestyle='--', color=palette[2], lw=2)
# sns.regplot(x, y, ax=ax, n_boot=10, x_estimator=np.median, x_bins=np.logspace(np.log10(lim[0]), np.log10(lim[1]), 20), fit_reg=False, 
#             scatter_kws=dict(alpha=1, zorder=20, facecolor=palette[2], edgecolor='w', linewidth=1, s=60));

ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlim(*lim)
ax.set_ylim(*lim)
ax.plot(lim, lim, linestyle='--', color='k')
ax.set_title('Mutations ($bp^{-1}$)')
ax.set_xlabel('Left flank')
ax.set_ylabel('Reft flank')
r.params
# ax.plot(x, y, marker='o', linestyle=' ', markersize=2, mfc='none');


Out[69]:
x    1.629204
dtype: float64

In [70]:
# check mutations on left versus right flank after correcting mutation rate

x = pmuts_left / (mu_left * pspd_left_accessible)
y = pmuts_right / (mu_right * pspd_right_accessible)
tst = (x > 0) & (y > 0)
x = x[tst]
y = y[tst]

lim = 1e2, 1e7
fig, ax = plt.subplots(figsize=(7, 7))
sns.despine(ax=ax, offset=5)
ax.plot(x, y, marker='o', markersize=2, mfc='none', mec='k', linestyle=' ')

df = pandas.DataFrame({'x': x, 'y': y})
r = sfa.ols('y ~ x + 0', data=df).fit()
ax.plot(lim, [l * r.params['x'] for l in lim], linestyle='--', color=palette[2], lw=2)
# sns.regplot(x, y, ax=ax, n_boot=10, x_estimator=np.median, x_bins=np.logspace(np.log10(lim[0]), np.log10(lim[1]), 20), fit_reg=False, 
#             scatter_kws=dict(alpha=1, zorder=20, facecolor=palette[2], edgecolor='w', linewidth=1, s=60));


ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlim(*lim)
ax.set_ylim(*lim)
ax.plot(lim, lim, linestyle='--', color='k')
ax.set_title('Mutations (TODO units?)')
ax.set_xlabel('Left flank')
ax.set_ylabel('Reft flank')
r.params
# ax.plot(x, y, marker='o', linestyle=' ', markersize=2, mfc='none');


Out[70]:
x    0.993815
dtype: float64

In [71]:
# now compare estimats of t_hat on left and right flanks, using all adjustments

pt_hat_left = (1 + pmuts_left) / (2 * (pspd_left * rr_left + pspd_left_accessible * mu_left))
pt_hat_right = (1 + pmuts_right) / (2 * (pspd_right * rr_right + pspd_right_accessible * mu_right))
x = np.log10(pt_hat_left)
y = np.log10(pt_hat_right)

fig, ax = plt.subplots(figsize=(7, 7))
ax.plot(x, y, marker='o', markersize=1, mfc='none', mec='k', linestyle=' ', alpha=.1)
lim = 1, 6.5

df = pandas.DataFrame({'x': x, 'y': y})
r = sfa.ols('y ~ x + 0', data=df).fit()
ax.plot(lim, [l * r.params['x'] for l in lim], linestyle='--', color='r', lw=2)
# sns.regplot(x, y, ax=ax, n_boot=10, x_estimator=np.median, x_bins=np.logspace(np.log10(lim[0]), np.log10(lim[1]), 20), fit_reg=False, 
#             scatter_kws=dict(alpha=1, zorder=20, facecolor='r', edgecolor='w', linewidth=1, s=60));

ax.set_xlim(*lim)
ax.set_ylim(*lim)
ax.plot(lim, lim, 'k--')
ax.set_xlabel('Left flank')
ax.set_ylabel('Right flank')
ax.set_title('$\hat{t}$')
ticks = np.arange(lim[0], int(lim[1]) + 1)
ticklabels = ['$10^{%s}$' % t for t in ticks]
ax.set_xticks(ticks)
ax.set_xticklabels(ticklabels)
ax.set_yticks(ticks)
ax.set_yticklabels(ticklabels)
r.params


Out[71]:
x    1.00086
dtype: float64

A few diagnostic comparisons between pairwise and earlier neighbour analyses...


In [72]:
pspl_right.max(), nspl_right.max()


Out[72]:
(76267, 76267)

In [73]:
pspd_right.max(), nspd_right.max()


Out[73]:
(1437047, 1437047)

In [74]:
pspl_left.max(), nspl_left.max()


Out[74]:
(15754, 15754)

In [75]:
pspd_left.max(), nspd_left.max()


Out[75]:
(2377961, 2377961)

In [76]:
pmuts_right.shape, pmuts_left.shape


Out[76]:
((1169685,), (1169685,))

In [77]:
muts_right.max(), muts_left.max()


Out[77]:
(14, 23)

In [78]:
pmuts_right.max(), pmuts_left.max()


Out[78]:
(16, 23)

In [79]:
np.bincount(pmuts_right)


Out[79]:
array([482069, 638359,  37050,   8834,   2200,    735,    273,    119,
           23,     11,      1,      2,      1,      2,      4,      1,
            1])

In [80]:
np.bincount(pmuts_left)


Out[80]:
array([639336, 509488,  14927,   3572,   1236,    552,    283,     89,
           43,     59,     32,     18,     13,     28,      7,      1,
            0,      0,      0,      0,      0,      0,      0,      1])

In [81]:
fig, ax = plt.subplots()
ax.hist([pmuts_right, pmuts_left], stacked=True, bins=np.arange(25));


Look at the distribution of ages on left, right and combined flanks...


In [82]:
pt_hat_right = (1 + pmuts_right) / (2 * (pspd_right * rr_right + pspd_right_accessible * mu_right))
fig, ax = plt.subplots()
ax.hist(pt_hat_right, bins=np.logspace(1, 6, 30))
ax.set_xscale('log')
ax.set_xlabel('$\hat{t}$')
ax.set_ylabel('Frequency (no. pairs)');



In [83]:
pt_hat_left = (1 + pmuts_left) / (2 * (pspd_left * rr_left + pspd_left_accessible * mu_left))
fig, ax = plt.subplots()
ax.hist(pt_hat_left, bins=np.logspace(1, 6, 30))
ax.set_xscale('log')
ax.set_xlabel('$\hat{t}$')
ax.set_ylabel('Frequency (no. pairs)');



In [84]:
pspd_both = pspd_left + pspd_right
pmuts_both = pmuts_left + pmuts_right
psgl_both = pspd_left * rr_left + pspd_right * rr_right
psml_both = pspd_left_accessible * mu_left + pspd_right_accessible * mu_right

pt_hat_both = (1 + pmuts_both) / (2 * (psgl_both + psml_both))
fig, ax = plt.subplots()
ax.hist(pt_hat_both, bins=np.logspace(1, 6, 30))
ax.set_xscale('log')
ax.set_xlabel('$\hat{t}$')
ax.set_ylabel('Frequency (no. pairs)');


Dendrogram visualisation and clustering


In [85]:
def plot_dendrogram(dist, cut_height=1e3, yscale='log', ylim=(10, 1e6), linkage_method='average',
                    n_clusters=14):
    """This function plots a dendrogram using scipy and provides some utilities for annotating clusters."""
    
    z = scipy.cluster.hierarchy.linkage(dist, method=linkage_method)

    fig = plt.figure(figsize=(16, 8), )
    gs = mpl.gridspec.GridSpec(nrows=3, ncols=1, height_ratios=[6, .5, 4], hspace=0)

    ax = fig.add_subplot(gs[0])
    sns.despine(ax=ax, offset=3, bottom=True, top=False)
    r = scipy.cluster.hierarchy.dendrogram(
        z, no_labels=True, count_sort=True, 
        color_threshold=0, 
        above_threshold_color='k',
        ax=ax)
    ax.set_ylim(*ylim)
    ax.set_yscale(yscale)
    # ax.set_ylim(bottom=-1000)
    xmin, xmax = ax.xaxis.get_data_interval()
    xticklabels = np.array(list(range(0, len(df_haplotypes), 200)) + [len(df_haplotypes)])
    xticks = xticklabels / len(df_haplotypes)
    xticks = (xticks * (xmax - xmin)) + xmin
    ax.set_xticks(xticks)
    ax.set_xticklabels(xticklabels)
    ax.set_xlabel('Haplotypes')
    ax.xaxis.set_label_position('top')
    ax.set_ylabel('$\hat{t}$', rotation=0, ha='right')

    cluster_palette = sns.color_palette('Set3', n_colors=12)
    if cut_height:
        ax.axhline(cut_height, linestyle='--', color='k')
        # find clusters
        f = scipy.cluster.hierarchy.fcluster(z, cut_height, criterion='distance')
        # compute cluster sizes
        fsz = np.bincount(f)
        # sort largest first
        fsort = np.argsort(fsz)[::-1]
        # take largest n
        fsort = fsort[:n_clusters]
        # get haplotype indices for each cluster
        clusters = [set(np.nonzero(f == i)[0]) for i in fsort]
        clusters_leaves = [sorted([r['leaves'].index(i) for i in cluster])
                           for cluster in clusters]
        ixs = np.argsort([min(cl) for cl in clusters_leaves])
        clusters = [clusters[i] for i in ixs]
        clusters_leaves = [clusters_leaves[i] for i in ixs]
        for i, cluster_leaves in enumerate(clusters_leaves):
            color = cluster_palette[i % len(cluster_palette)]
            x1, x2 = min(cluster_leaves), max(cluster_leaves)
#             ax.axvline(x1*10, color='k', linestyle='--')
#             ax.axvline(x2*10, color='k', linestyle='--')
            ax.fill_between([x1*10, x2*10], 0, cut_height, color=color, alpha=.4, zorder=20)
#             ax.axvspan(x1*10, x2*10, color=color, zorder=-20, alpha=.5)
            ax.text((x1*10 + x2*10) / 2, ylim[0], str(i), ha='center', va='top')
        
    ax = fig.add_subplot(gs[1], )
    sns.despine(ax=ax, left=True, bottom=True)
    pops = df_haplotypes.population[r['leaves']]
    pop_colors = [phase1_ar3.pop_colors[p] for p in pops]
    ax.broken_barh([(i, 1) for i in range(len(df_haplotypes))], yrange=(0, 1), color=pop_colors)
    ax.set_xlim(0, len(df_haplotypes))
    ax.set_yticks([])
    ax.set_ylabel('Population', rotation=0, ha='right', va='center')
    ax.set_xticks([])
    if cut_height:
        for i, cluster_leaves in enumerate(clusters_leaves):
            color = cluster_palette[i % len(cluster_palette)]
            x1, x2 = min(cluster_leaves), max(cluster_leaves)
            ax.axvline(x1, color='k', linestyle='--', zorder=20)
            ax.axvline(x2, color='k', linestyle='--', zorder=20)

    ax = fig.add_subplot(gs[2])
    hapclust.plot_haplotypes(ax, haps_vgsc_missense[:, r['leaves']], lbl_vgsc_missense)
    ax.set_xlim(0, len(df_haplotypes))
    ax.set_xlabel('Haplotypes')
    if cut_height:
        for i, cluster_leaves in enumerate(clusters_leaves):
            color = cluster_palette[i % len(cluster_palette)]
            x1, x2 = min(cluster_leaves), max(cluster_leaves)
            ax.axvline(x1, color='k', linestyle='--', zorder=20)
            ax.axvline(x2, color='k', linestyle='--', zorder=20)
            ax.axvspan(x1, x2, color=color, zorder=20, alpha=.4)

    gs.tight_layout(fig, h_pad=0)
    return clusters

In [86]:
plot_dendrogram(pt_hat_right);



In [87]:
plot_dendrogram(pt_hat_left);



In [88]:
# see what it looks like on a linear scale
plot_dendrogram(pt_hat_both, yscale='linear', ylim=(-1000, 4e5));



In [89]:
# canonical dendrogram we'll use for cluster analyses
clusters = plot_dendrogram(pt_hat_both, cut_height=2e3)


Cladogram visualisation

Try an alternate visualisation method using our own cladogram implementation.


In [90]:
def plot_cladogram(dist, yscale='symlog', yscale_kws=None, ylim=(10, 1e6), count_sort=True,
                   linkage_method='average', n_clusters=14, fill_threshold=0, leaf_height=0, linewidth=2):
    
    # perform hierarchical clustering
    z = scipy.cluster.hierarchy.linkage(dist, method=linkage_method)
    # needed for getting leaves in right order
    r = scipy.cluster.hierarchy.dendrogram(
        z, count_sort=count_sort, no_plot=True)

    # setup figure
    fig = plt.figure(figsize=(16, 8), )
    gs = mpl.gridspec.GridSpec(nrows=2, ncols=1, height_ratios=[6, 4], hspace=0)

    # plot cladogram
    ax = fig.add_subplot(gs[0])
    sns.despine(ax=ax, offset=3, bottom=True, top=False)
    colors = [phase1_ar3.pop_colors[p] for p in df_haplotypes.population]
    hapclust.cladogram(z, count_sort=True, colors=colors, 
                       fill_threshold=fill_threshold, leaf_height=leaf_height, 
                       plot_kws=dict(linewidth=linewidth), 
                       fill_kws=dict(linewidth=linewidth), 
                       ax=ax)
    ax.set_ylim(*ylim)
    if yscale_kws is None:
        yscale_kws = dict()
    ax.set_yscale(yscale, **yscale_kws)
    # ax.set_ylim(bottom=-1000)
    xmin, xmax = ax.xaxis.get_data_interval()
    xticklabels = np.array(list(range(0, len(r['leaves']), 200)) + [len(r['leaves'])])
    xticks = xticklabels / len(r['leaves'])
    xticks = (xticks * (xmax - xmin)) + xmin
    ax.set_xticks(xticks)
    ax.set_xticklabels(xticklabels)
    ax.set_xlabel('Haplotypes')
    ax.xaxis.set_label_position('top')
    ax.set_ylabel('$\hat{t}$', rotation=0, ha='right')
    ax.grid(axis='y', linestyle='--', zorder=-1e9)
        
    # plot display haplotypes
    ax = fig.add_subplot(gs[1])
    hapclust.plot_haplotypes(ax, haps_vgsc_missense[:, r['leaves']], lbl_vgsc_missense)
    ax.set_xlim(0, len(r['leaves']))
    ax.set_xlabel('Haplotypes')

    gs.tight_layout(fig, h_pad=0)

In [91]:
plot_cladogram(pt_hat_both, fill_threshold=0, linewidth=.5, leaf_height=10,
               ylim=(-10, 1e6), yscale='symlog', yscale_kws=dict(linthreshy=100, linscaley=1.0, subsy=list(range(1, 10))))



In [92]:
# option to fill clades
plot_cladogram(pt_hat_both, fill_threshold=2e3, linewidth=.5, leaf_height=10,
               ylim=(-10, 1e6), yscale='symlog', yscale_kws=dict(linthreshy=100, linscaley=1.0, subsy=list(range(1, 10))))


Further analysis of haplotype age


In [93]:
haps_vgsc_missense


Out[93]:
<HaplotypeArray shape=(16, 1530) dtype=int8>
01234...15251526152715281529
000000...00000
100000...00000
200000...00000
......
1300000...00000
1400010...00000
1500000...00000

In [94]:
[len(c) for c in clusters]


Out[94]:
[12, 17, 42, 51, 459, 16, 18, 73, 14, 194, 79, 12, 108, 165]

In [95]:
# pick these out of the clusters we made earlier
f1 = clusters[4]
f2 = clusters[8]
f3 = clusters[3]
f4 = clusters[2]
f5 = clusters[9]
s1 = clusters[12]
s2 = clusters[10]
s3 = clusters[13]
s4 = clusters[7]
l1 = clusters[6]

In [96]:
# rhg = "resistance haplogroups"
rhg = [f1, f2, f3, f4, f5, s1, s2, s3, s4, l1]
rhg_labels = 'F1 F2 F3 F4 F5 S1 S2 S3 S4 L1'.split()

In [97]:
for g, l in zip(rhg, rhg_labels):
    print(l)
    print(collections.Counter(df_haplotypes.population[sorted(g)]).most_common())


F1
[('BFS', 162), ('BFM', 113), ('AOM', 90), ('GNS', 62), ('CMS', 32)]
F2
[('CMS', 14)]
F3
[('CMS', 51)]
F4
[('CMS', 26), ('GAS', 16)]
F5
[('CMS', 170), ('GAS', 24)]
S1
[('UGS', 108)]
S2
[('GAS', 71), ('CMS', 8)]
S3
[('UGS', 98), ('KES', 67)]
S4
[('CMS', 73)]
L1
[('BFM', 18)]

In [98]:
# compare F haplogroups
fig, ax = plt.subplots()
for g, l, cl in zip(rhg[:5], rhg_labels[:5], palette):
    ixs = allel.condensed_coords_within(g, len(df_haplotypes))
    x = np.log10(pt_hat_both[ixs])
    sns.distplot(x, bins=np.linspace(1, 3, 20), label=l, hist=False, 
                 kde_kws=dict(linewidth=2, color=cl))
    ax.axvline(np.median(x), color=cl, linestyle='--', lw=2)
ax.set_ylabel('Density')
ax.set_xlim(1, 4)
ticks = [1, 2, 3, 4]
ax.set_xticks(ticks)
ax.set_xticklabels(['$10^{%s}$' % t for t in ticks])
ax.set_xlabel('$\hat{t}$ (generations)');



In [99]:
# compare S haplogroups
fig, ax = plt.subplots()
for g, l, cl in zip(rhg[5:], rhg_labels[5:], palette):
    ixs = allel.condensed_coords_within(g, len(df_haplotypes))
    x = np.log10(pt_hat_both[ixs])
    sns.distplot(x, bins=np.linspace(1, 3, 20), label=l, hist=False, 
                 kde_kws=dict(linewidth=2, color=cl))
    ax.axvline(np.median(x), color=cl, linestyle='--', lw=2)
ax.set_ylabel('Density')
ax.set_xlim(1, 4)
ticks = [1, 2, 3, 4]
ax.set_xticks(ticks)
ax.set_xticklabels(['$10^{%s}$' % t for t in ticks])
ax.set_xlabel('$\hat{t}$ (generations)');



In [100]:
def bootstrap(x, n, f):
    dtype = np.array(f(x)).dtype
    out = np.zeros(n, dtype=dtype)
    for i in range(n):
        ix = np.random.choice(x.shape[0], size=x.shape[0], replace=True)
        out[i] = f(x[ix])
    return out

In [101]:
def distplot_cluster_pop_t_hat(cluster, title):
    cluster = list(cluster)
    pops_clst = df_haplotypes.population[cluster]
    pops_unique = pops_clst.unique()
    pops_subclusters = [np.array(cluster)[(pops_clst == p).values] for p in pops_unique]

    fig, ax = plt.subplots()
    for g, l in zip(pops_subclusters, pops_unique):
        ixs = allel.condensed_coords_within(g, len(df_haplotypes))
        x = np.log10(pt_hat_both[ixs])
        color = phase1_ar3.pop_colors[l]
        md = np.median(x)
        bs = bootstrap(x, 1000, np.median)
        lbl = '%s (%.0f; 95%% CI [%.0f, %.0f])' % (l, 10**md, 
                                               10**np.percentile(bs, 2.5), 
                                               10**np.percentile(bs, 97.5))
        sns.distplot(x, bins=np.linspace(1, 3, 20), label=lbl, hist=False, 
                     kde_kws=dict(linewidth=2, color=color, label=lbl))
        ax.axvspan(np.percentile(bs, 2.5), np.percentile(bs, 97.5), 
                   color=color, alpha=.3)
        ax.axvline(md, color=color, lw=2, linestyle='--')
    
    ax.set_xlabel('$\hat{t}$')
    ax.set_ylabel('Density')
    ax.set_xlim(1, 4)
    ticks = [1, 2, 3, 4]
    ax.set_xticks(ticks)
    ax.set_xticklabels(['$10^{%s}$' % t for t in ticks])
    ax.set_xlabel('$\hat{t}$ (generations)')
    ax.set_title(title)
    ax.legend(bbox_to_anchor=(1, 1), loc='upper right')
    fig.tight_layout()

In [102]:
distplot_cluster_pop_t_hat(f1, 'Haplogroup F1')



In [103]:
distplot_cluster_pop_t_hat(f2, 'Haplogroup F2')



In [104]:
distplot_cluster_pop_t_hat(f3, 'Haplogroup F3')



In [105]:
distplot_cluster_pop_t_hat(f4, 'Haplogroup F4')



In [106]:
distplot_cluster_pop_t_hat(f5, 'Haplogroup F5')



In [107]:
distplot_cluster_pop_t_hat(s1, 'Haplogroup S1')



In [108]:
distplot_cluster_pop_t_hat(s2, 'Haplogroup S2')



In [109]:
distplot_cluster_pop_t_hat(s3, 'Haplogroup S3')



In [110]:
distplot_cluster_pop_t_hat(s4, 'Haplogroup S4')



In [111]:
distplot_cluster_pop_t_hat(l1, 'Haplogroup L1')



In [112]:
for i, l in enumerate(lbl_vgsc_missense):
    print(i, l)


0 Arg254Lys
1 Asp466His
2 Thr791Met
3 Leu995Ser
4 Leu995Phe
5 Ala1125Val
6 Ile1527Thr
7 Glu1597Gly
8 Ala1746Ser
9 Val1853Ile
10 Ile1868Thr
11 Pro1874Ser
12 Pro1874Leu
13 Phe1920Ser
14 Ala1934Val
15 Ile1940Thr

In [113]:
cluster = sorted(f1)
haps_missense_clst = haps_vgsc_missense[:, cluster]
haps_missense_clst


Out[113]:
<HaplotypeArray shape=(16, 459) dtype=int8>
01234...454455456457458
000000...00000
100000...00000
200000...00000
......
1300000...00000
1400100...00000
1500000...00000

In [114]:
np.count_nonzero(haps_missense_clst[2, :] == 1)


Out[114]:
32

In [115]:
# compare mutations

ix_muts = [2, 7, 8, 9, 10, 11, 12, 14, 15]
pops_subclusters = [np.array(cluster)[(haps_missense_clst[i, :] == 1)] for i in ix_muts]
# add in "vanilla" haplotypes with only kdr
pops_subclusters.append(np.array(cluster)[np.all(haps_missense_clst[ix_muts, :] == 0, axis=0)])

lbls = ['F1+%s' % lbl_vgsc_missense[i] for i in ix_muts]
lbls.append('F1')
                        
fig, ax = plt.subplots()
palette = sns.color_palette('hls', n_colors=len(ix_muts)+1)
for g, l, color in zip(pops_subclusters, lbls, palette):
    if len(g) > 0:
        log(l, len(g))
        ixs = allel.condensed_coords_within(g, len(df_haplotypes))
        x = np.log10(pt_hat_both[ixs])
        md = np.median(x)
        bs = bootstrap(x, 1000, np.median)
        lbl = '%s (%.0f; 95%% CI [%.0f, %.0f])' % (l, 10**md, 
                                               10**np.percentile(bs, 2.5), 
                                               10**np.percentile(bs, 97.5))
        sns.distplot(x, bins=np.linspace(1, 3, 20), label=lbl, hist=False, 
                     kde_kws=dict(linewidth=2, color=color, label=lbl))
#     ax.axvspan(np.percentile(bs, 2.5), np.percentile(bs, 97.5), 
#                color=color, alpha=.3)
    ax.axvline(md, color=color, lw=2, linestyle='--')

ax.set_xlabel('$\hat{t}$')
ax.set_ylabel('Density')
ax.set_xlim(1, 3.5)
ticks = [1, 2, 3]
ax.set_xticks(ticks)
ax.set_xticklabels(['$10^{%s}$' % t for t in ticks])
ax.set_xlabel('$\hat{t}$ (generations)')
ax.legend(bbox_to_anchor=(1, 1), loc='upper right')
fig.tight_layout()


F1+Thr791Met 32
F1+Glu1597Gly 11
F1+Ala1746Ser 28
F1+Val1853Ile 13
F1+Ile1868Thr 52
F1+Pro1874Ser 26
F1+Pro1874Leu 79
F1+Ala1934Val 16
F1+Ile1940Thr 6
F1 225

In [116]:
def distplot_cluster_pop_between_t_hat(cluster, pop1, pop2, title):
    cluster = list(cluster)
    pops_clst = df_haplotypes.population[cluster]
    pops_subclusters = [np.array(cluster)[(pops_clst == p).values] for p in [pop1, pop2]]

    fig, ax = plt.subplots()
    for g, l in zip(pops_subclusters, [pop1, pop2]):
        ixs = allel.condensed_coords_within(g, len(df_haplotypes))
        x = np.log10(pt_hat_both[ixs])
        color = phase1_ar3.pop_colors[l]
        md = np.median(x)
        bs = bootstrap(x, 1000, np.median)
        lbl = '%s (%.0f; 95%% CI [%.0f, %.0f])' % (l, 10**md, 
                                               10**np.percentile(bs, 2.5), 
                                               10**np.percentile(bs, 97.5))
        sns.distplot(x, bins=np.linspace(1, 3, 20), label=lbl, hist=False, 
                     kde_kws=dict(linewidth=2, color=color, label=lbl))
        ax.axvspan(np.percentile(bs, 2.5), np.percentile(bs, 97.5), 
                   color=color, alpha=.3)
        ax.axvline(md, color=color, lw=2, linestyle='--')

    ixs = allel.condensed_coords_between(pops_subclusters[0], pops_subclusters[1], len(df_haplotypes))
    x = np.log10(pt_hat_both[ixs])
    color = 'k'
    md = np.median(x)
    bs = bootstrap(x, 1000, np.median)
    lbl = '%s vs %s (%.0f; 95%% CI [%.0f, %.0f])' % (pop1, pop2, 10**md, 
                                           10**np.percentile(bs, 2.5), 
                                           10**np.percentile(bs, 97.5))
    sns.distplot(x, bins=np.linspace(1, 3, 20), label=lbl, hist=False, 
                 kde_kws=dict(linewidth=2, color=color, label=lbl))
    ax.axvspan(np.percentile(bs, 2.5), np.percentile(bs, 97.5), 
               color=color, alpha=.3)
    ax.axvline(md, color=color, lw=2, linestyle='--')

    ax.set_xlabel('$\hat{t}$')
    ax.set_ylabel('Density')
    ax.set_xlim(1, 4)
    ticks = [1, 2, 3, 4]
    ax.set_xticks(ticks)
    ax.set_xticklabels(['$10^{%s}$' % t for t in ticks])
    ax.set_xlabel('$\hat{t}$ (generations)')
    ax.set_title(title)
    ax.legend(bbox_to_anchor=(1, 1), loc='upper right')
    fig.tight_layout()

In [117]:
distplot_cluster_pop_between_t_hat(f1, 'BFS', 'BFM', 'Haplogroup F1')



In [118]:
distplot_cluster_pop_between_t_hat(f1, 'BFS', 'CMS', 'Haplogroup F1')



In [119]:
distplot_cluster_pop_between_t_hat(f1, 'BFS', 'AOM', 'Haplogroup F1')



In [120]:
distplot_cluster_pop_between_t_hat(f1, 'BFM', 'AOM', 'Haplogroup F1')



In [121]:
distplot_cluster_pop_between_t_hat(f1, 'CMS', 'AOM', 'Haplogroup F1')



In [122]:
distplot_cluster_pop_between_t_hat(f4, 'CMS', 'GAS', 'Haplogroup F4')



In [123]:
distplot_cluster_pop_between_t_hat(f5, 'CMS', 'GAS', 'Haplogroup F5')



In [124]:
distplot_cluster_pop_between_t_hat(s2, 'CMS', 'GAS', 'Haplogroup S2')



In [125]:
distplot_cluster_pop_between_t_hat(s3, 'UGS', 'KES', 'Haplogroup S3')


TODO: break down F1 by population and mutation


In [ ]: