This notebook demonstrates some functions from the hapclust package for analysing haplotype structure using the estimator of allele age from Mathieson and McVean.

Setup

Setup callsets



In [1]:

    
%run setup.ipynb
%matplotlib notebook
# %reload_ext autoreload
# %autoreload 1
# %aimport hapclust
import hapclust



In [2]:

    
# obtain data from unphased callset - only needed for variant annotations
callset_pass = phase1_ar31.callset_pass
pos_pass = allel.SortedIndex(callset_pass['2L/variants/POS'])
ann_pass = callset_pass['2L/variants/ANN'][:][['Annotation', 'HGVS_p']]



In [3]:

    
# setup haplotype data
callset_phased = phase1_ar31.callset_phased
genotypes_phased = allel.GenotypeDaskArray(callset_phased['2L/calldata/genotype'])
pos_phased = allel.SortedIndex(callset_phased['2L/variants/POS'])



In [4]:

    
pos_kdr_s = 2422651
pos_kdr_f = 2422652



In [5]:

    
# define region we're going to analyse
loc_region = pos_phased.locate_range(0, 4000000)
pos_phased_region = pos_phased[loc_region]
pos_phased_region









    Out[5]:




<SortedIndex shape=(163963,) dtype=int32>0 1 2 3 4 ... 163958 163959 163960 163961 163962
44688 44691 44732 44736 44756 ... 3997372 3997373 3997378 3997381 3997386



In [6]:

    
# locate the intersection with unphased callset - needed to tie in annotations
loc1, _ = pos_pass.locate_intersection(pos_phased_region)
np.count_nonzero(loc1)









    Out[6]:





163963



In [7]:

    
ann_phased_region = ann_pass[loc1]
ann_phased_region









    Out[7]:





array([(b'intergenic_region', b'.'), (b'intergenic_region', b'.'),
       (b'intergenic_region', b'.'), ...,
       (b'downstream_gene_variant', b'.'),
       (b'downstream_gene_variant', b'.'),
       (b'downstream_gene_variant', b'.')], 
      dtype=[('Annotation', 'S34'), ('HGVS_p', 'S14')])



In [8]:

    
collections.Counter(ann_phased_region['Annotation'])









    Out[8]:





Counter({b'3_prime_UTR_variant': 2941,
         b'5_prime_UTR_premature_start_codon_': 306,
         b'5_prime_UTR_variant': 1677,
         b'downstream_gene_variant': 18539,
         b'initiator_codon_variant': 3,
         b'intergenic_region': 54849,
         b'intragenic_variant': 48,
         b'intron_variant': 32362,
         b'missense_variant': 5805,
         b'missense_variant&splice_region_var': 70,
         b'splice_acceptor_variant&intron_var': 24,
         b'splice_donor_variant&intron_varian': 27,
         b'splice_region_variant': 36,
         b'splice_region_variant&intron_varia': 649,
         b'splice_region_variant&stop_retaine': 5,
         b'splice_region_variant&synonymous_v': 87,
         b'start_lost': 9,
         b'stop_gained': 37,
         b'stop_lost&splice_region_variant': 4,
         b'stop_retained_variant': 5,
         b'synonymous_variant': 8636,
         b'upstream_gene_variant': 37844})



In [9]:

    
# exclude cross parents
haps_phased_region = genotypes_phased[loc_region].to_haplotypes()[:, :-16].compute()



In [10]:

    
# perform allele count - needed to locate singletons
ac_phased_region = haps_phased_region.count_alleles(max_allele=1)

Setup data on missense variation



In [11]:

    
# convenience, define the Vgsc gene region
region_vgsc = SeqFeature('2L', 2358158, 2431617, label='Vgsc')
region_vgsc









    Out[11]:





<SeqFeature 'Vgsc' 2L:2358158-2431617>



In [12]:

    
loc_vgsc = pos_phased_region.locate_range(region_vgsc.start, region_vgsc.end)
loc_vgsc









    Out[12]:





slice(24471, 26181, None)



In [13]:

    
haps_vgsc = haps_phased_region[loc_vgsc]



In [14]:

    
ac_vgsc = haps_vgsc.count_alleles(max_allele=1)



In [15]:

    
ann_vgsc = ann_phased_region[loc_vgsc]



In [16]:

    
loc_vgsc_missense = (ann_vgsc['Annotation'] == b'missense_variant') & (ac_vgsc[:, 1] > 7)
np.count_nonzero(loc_vgsc_missense)









    Out[16]:





16



In [17]:

    
haps_vgsc_missense = haps_vgsc[loc_vgsc_missense]



In [18]:

    
lbl_vgsc_missense = [l[2:] for l in ann_vgsc[loc_vgsc_missense]['HGVS_p'].astype('U')]
lbl_vgsc_missense









    Out[18]:





['Arg254Lys',
 'Asp466His',
 'Thr791Met',
 'Leu995Ser',
 'Leu995Phe',
 'Ala1125Val',
 'Ile1527Thr',
 'Glu1597Gly',
 'Ala1746Ser',
 'Val1853Ile',
 'Ile1868Thr',
 'Pro1874Ser',
 'Pro1874Leu',
 'Phe1920Ser',
 'Ala1934Val',
 'Ile1940Thr']

Split up haplotype data

Here we divide haplotype data up into two, with an "EHH" set containing no singletons and only neutral variants, which we'll use for analysis of haplotype sharing, and a "mut" set containing singletons and putatively non-neutral variants that we will use for analysis of mutations on shared haplotypes.



In [19]:

    
# define types of variants to include in EHH analysis - should be mostly neutral
loc_type_neutral = ((ann_phased_region['Annotation'] == b'intergenic_region') | 
                    (ann_phased_region['Annotation'] == b'intron_variant') |
                    (ann_phased_region['Annotation'] == b'downstream_gene_variant') |
                    (ann_phased_region['Annotation'] == b'upstream_gene_variant') |
                    (ann_phased_region['Annotation'] == b'synonymous_variant') |
                    (ann_phased_region['Annotation'] == b'3_prime_UTR_variant') |
                    (ann_phased_region['Annotation'] == b'5_prime_UTR_variant') 
                    )
np.count_nonzero(loc_type_neutral), loc_type_neutral.shape









    Out[19]:





(156848, (163963,))



In [20]:

    
# locate singletons - will exclude from EHH analysis
loc_sgl = ac_phased_region.min(axis=1) == 1
loc_nosgl = ac_phased_region.min(axis=1) > 1
np.count_nonzero(loc_sgl), np.count_nonzero(loc_nosgl), loc_nosgl.shape









    Out[20]:





(52221, 111611, (163963,))



In [21]:

    
# these are the variants to use for EHH
loc_ehh = loc_type_neutral & loc_nosgl
np.count_nonzero(loc_ehh), loc_ehh.shape









    Out[21]:





(107531, (163963,))



In [22]:

    
# these are the variants to use for mutational distance
#loc_mut = loc_sgl
# include non-neutral mutations
loc_mut = loc_sgl | ~loc_type_neutral
np.count_nonzero(loc_mut), loc_mut.shape









    Out[22]:





(56311, (163963,))



In [23]:

    
haps_mut = haps_phased_region[loc_mut]



In [24]:

    
haps_ehh = haps_phased_region[loc_ehh]



In [25]:

    
pos_ehh = pos_phased_region[loc_ehh]



In [26]:

    
pos_mut = pos_phased_region[loc_mut]



In [27]:

    
pos_mut.locate_key(pos_kdr_s)









    Out[27]:





10210



In [28]:

    
pos_mut.locate_key(pos_kdr_f)









    Out[28]:





10211



In [29]:

    
# read in haplotype metadata to get population
df_haplotypes = phase1_ar31.df_haplotypes
df_haplotypes = df_haplotypes[df_haplotypes.population != 'colony']
df_haplotypes.head()









    Out[29]:







  
    
      
      label
      ox_code
      population
      label_aug
      country
      region
      sex
      m_s
      kt_2la
      kt_2rb
    
    
      index
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      0
      AB0085-Ca
      AB0085-C
      BFS
      AB0085-Ca [Burkina Faso, Pala, S, F]
      Burkina Faso
      Pala
      F
      S
      2.0
      2.0
    
    
      1
      AB0085-Cb
      AB0085-C
      BFS
      AB0085-Cb [Burkina Faso, Pala, S, F]
      Burkina Faso
      Pala
      F
      S
      2.0
      2.0
    
    
      2
      AB0087-Ca
      AB0087-C
      BFM
      AB0087-Ca [Burkina Faso, Bana, M, F]
      Burkina Faso
      Bana
      F
      M
      2.0
      1.0
    
    
      3
      AB0087-Cb
      AB0087-C
      BFM
      AB0087-Cb [Burkina Faso, Bana, M, F]
      Burkina Faso
      Bana
      F
      M
      2.0
      1.0
    
    
      4
      AB0088-Ca
      AB0088-C
      BFM
      AB0088-Ca [Burkina Faso, Bana, M, F]
      Burkina Faso
      Bana
      F
      M
      2.0
      0.0

Split flanks

Split haplotype data into left and right flanks on the kdr position.



In [30]:

    
core_pos = pos_kdr_f



In [31]:

    
# split the EHH dataset
dist_ehh_right, dist_ehh_left, haps_ehh_right, haps_ehh_left = hapclust.split_flanks(
    haps_ehh, pos_ehh, core_pos
)



In [32]:

    
# this gives the distance from the core position to each downstream variant, moving away from the core
dist_ehh_right









    Out[32]:





array([     45,     255,     258, ..., 1574721, 1574726, 1574734], dtype=int32)



In [33]:

    
# this gives the distance from the core position to each upstream variant, moving away from the core
dist_ehh_left









    Out[33]:





array([    108,     250,     270, ..., 2377896, 2377920, 2377961], dtype=int32)



In [34]:

    
dist_ehh_right.shape, dist_ehh_left.shape









    Out[34]:





((91777,), (15754,))



In [35]:

    
dist_ehh_right.min(), dist_ehh_left.min()









    Out[35]:





(45, 108)



In [36]:

    
# split the mutations dataset
dist_mut_right, dist_mut_left, haps_mut_right, haps_mut_left = hapclust.split_flanks(
    haps_mut, pos_mut, core_pos
)



In [37]:

    
dist_mut_right









    Out[37]:





array([      0,      82,     353, ..., 1574719, 1574720, 1574729], dtype=int32)



In [38]:

    
dist_mut_left









    Out[38]:





array([      1,     166,     505, ..., 2377765, 2377916, 2377964], dtype=int32)



In [39]:

    
dist_mut_right.shape, dist_mut_left.shape









    Out[39]:





((46100,), (10211,))



In [40]:

    
dist_mut_right.min(), dist_mut_left.min()









    Out[40]:





(0, 1)



In [41]:

    
haps_ehh_left.shape, haps_ehh_right.shape, haps_mut_left.shape, haps_mut_right.shape









    Out[41]:





((15754, 1530), (91777, 1530), (10211, 1530), (46100, 1530))



In [42]:

    
is_accessible = phase1_ar3.accessibility['2L/is_accessible'][:]

Below "nspl" stands for neighbour shared prefix length (number of SNPs), "nspd" stands for neighbour shared physical distance (bp).



In [43]:

    
idx_sorted_right, nspl_right, nspd_right, muts_right = hapclust.neighbour_haplotype_sharing(
    haps_ehh_right, haps_mut_right, dist_ehh_right, dist_mut_right
)



In [44]:

    
idx_sorted_left, nspl_left, nspd_left, muts_left = hapclust.neighbour_haplotype_sharing(
    haps_ehh_left, haps_mut_left, dist_ehh_left, dist_mut_left
)



In [45]:

    
nspl_right.min(), nspl_right.max()









    Out[45]:





(0, 76267)



In [46]:

    
nspl_left.min(), nspl_left.max()









    Out[46]:





(0, 15754)



In [47]:

    
nspd_right.min(), nspd_right.max()









    Out[47]:





(45, 1437047)



In [48]:

    
nspd_left.min(), nspd_left.max()









    Out[48]:





(108, 2377961)



In [49]:

    
muts_right.min(), muts_right.max()









    Out[49]:





(0, 14)



In [50]:

    
muts_left.min(), muts_left.max()









    Out[50]:





(0, 23)



In [51]:

    
# compute accessible lengths - needed for mutation analysis
nspd_right_accessible = hapclust.haplotype_accessible_length(nspd_right, core_pos=core_pos, is_accessible=is_accessible, flank='right')
nspd_left_accessible = hapclust.haplotype_accessible_length(nspd_left, core_pos=core_pos, is_accessible=is_accessible, flank='left')



In [52]:

    
# get some diagnostics on accessibility on left versus right flanks
fig, ax = plt.subplots(figsize=(5, 5))
x = nspd_right
y = nspd_right_accessible
ax.plot(x, y, linestyle=' ', marker='o', markersize=4, label='Right flank')
x = nspd_left
y = nspd_left_accessible
ax.plot(x, y, linestyle=' ', marker='o', markersize=4, label='Left flank')
ax.set_xlabel('Physical length')
ax.set_ylabel('Accessible length')
lim = 0, x.max()
ax.set_xlim(*lim)
ax.set_ylim(*lim)
ax.plot(lim, lim, 'k--')
ax.legend()
ax.set_title('Neighbour shared haplotype length (bp)')
fig.tight_layout();



In [53]:

    
# 1 cM/Mb convert to M/bp
1 / (1e2 * 1e6)









    Out[53]:





1e-08



In [54]:

    
# assume constant recombination rate
rr_right = 1.5e-8
# adjust recombination rate on left flank (factor derived from pairwise analysis below)
rr_left = rr_right * 0.37

# assumed mutation rate
mu_right = 3.5e-9
# adjust mutation rate on left flank (factor derived from pairwise analysis below)
mu_left = mu_right * 0.61



In [55]:

    
pops_right = df_haplotypes.population[idx_sorted_right]
pop_colors_right = [phase1_ar3.pop_colors[p] for p in pops_right]

fig = plt.figure(figsize=(14, 6))
hapclust.fig_neighbour_haplotype_sharing(nspd=nspd_right, 
                                         nspd_accessible=nspd_right_accessible,
                                         muts=muts_right, 
                                         haps_display=haps_vgsc_missense[:, idx_sorted_right],
                                         haps_display_vlbl=lbl_vgsc_missense,
                                         pop_colors=pop_colors_right,
                                         nspd_cut=2e4,
                                         nspd_ylim=(1e3, 3e6),
                                         that_ylim=(2e1, 5e5),
                                         muts_ylim=(0, 25),
                                         mu=mu_right, rr=rr_right,
                                         fig=fig)
fig.suptitle('Right flank');



In [56]:

    
pops_left = df_haplotypes.population[idx_sorted_left]
pop_colors_left = [phase1_ar3.pop_colors[p] for p in pops_left]

fig = plt.figure(figsize=(14, 6))
hapclust.fig_neighbour_haplotype_sharing(nspd=nspd_left, 
                                         muts=muts_left, 
                                         nspd_accessible=nspd_left_accessible,
                                         haps_display=haps_vgsc_missense[:, idx_sorted_left],
                                         haps_display_vlbl=lbl_vgsc_missense,
                                         pop_colors=pop_colors_left,
                                         nspd_cut=2e4,
                                         nspd_ylim=(1e3, 3e6),
                                         that_ylim=(2e1, 5e5),
                                         muts_ylim=(0, 25),
                                         mu=mu_left, rr=rr_left,
                                         fig=fig)
fig.suptitle('Left flank');



In [57]:

    
# get some diagnostics on estimates of t hat on left versus right flanks

fig = plt.figure()
ax = fig.add_subplot(111)
x = nspd_left
t_hat = (1 + muts_left) / (2 * (nspd_left * rr_left + nspd_left_accessible * mu_left))
ax.hist(t_hat[(nspd_left > 0)], bins=np.logspace(1, 6, 50), histtype='step', lw=2, label='Upstream', color='b')
ax.axvline(np.median(t_hat), linestyle='--', color='b', lw=2)
t_hat = (1 + muts_right) / (2 * (nspd_right * rr_right + nspd_right_accessible * mu_right))
ax.hist(t_hat[(nspd_right > 0)], bins=np.logspace(1, 6, 50), histtype='step', lw=2, label='Downstream', color='g')
ax.axvline(np.median(t_hat), linestyle='--', color='g', lw=2)
ax.set_xscale('log')
ax.set_xlabel('$\hat{t}$')
ax.set_ylabel('Frequency (no. pairs)')
ax.legend();

Here "pspl" stands for pairwise shared prefix length (number of SNPs), "pspd" stands for pairwise shared physical distance (bp).

Setup, diagnostics and tuning



In [58]:

    
pspl_right, pspd_right, pmuts_right = hapclust.pairwise_haplotype_sharing(
    haps_ehh_right, haps_mut_right, dist_ehh_right, dist_mut_right, jitter=False)



In [59]:

    
pspl_left, pspd_left, pmuts_left = hapclust.pairwise_haplotype_sharing(
    haps_ehh_left, haps_mut_left, dist_ehh_left, dist_mut_left, jitter=False)



In [60]:

    
pspl_right.shape, pspl_left.shape









    Out[60]:





((1169685,), (1169685,))



In [61]:

    
pspd_right.min(), pspd_right.max()









    Out[61]:





(45, 1437047)



In [62]:

    
pspd_left.min(), pspd_left.max()









    Out[62]:





(108, 2377961)



In [63]:

    
pspd_right_accessible = hapclust.haplotype_accessible_length(pspd_right, core_pos=core_pos, is_accessible=is_accessible, flank='right')
pspd_left_accessible = hapclust.haplotype_accessible_length(pspd_left, core_pos=core_pos, is_accessible=is_accessible, flank='left')



In [64]:

    
# check again accessibility

fig, ax = plt.subplots(figsize=(5, 5))
x = pspd_right
y = pspd_right_accessible
ax.plot(x, y, linestyle=' ', marker='o', markersize=4, label='Right flank')
x = pspd_left
y = pspd_left_accessible
ax.plot(x, y, linestyle=' ', marker='o', markersize=4, label='Left flank')
ax.set_xlabel('Physical length')
ax.set_ylabel('Accessible length')
lim = 0, x.max()
ax.set_xlim(*lim)
ax.set_ylim(*lim)
ax.plot(lim, lim, 'k--')
ax.legend()
ax.set_title('Pairwise shared haplotype length')
fig.tight_layout();

Check recombination rate on left versus right flank.



In [65]:

    
palette = sns.color_palette()
sns.palplot(palette);



In [66]:

    
# compare shared haplotype length on left versus right flanks

x = pspd_left
y = pspd_right

fig, ax = plt.subplots(figsize=(7, 7))
ax.plot(x, y, marker='o', markersize=1, mfc='none', mec='k', linestyle=' ', alpha=.1)
lim = 1e1, 1e7

df = pandas.DataFrame({'x': x, 'y': y})
# linear regression
r = sfa.ols('y ~ x + 0', data=df).fit()
# plot the regression line
ax.plot(lim, [l * r.params['x'] for l in lim], linestyle='--', color=palette[2], lw=2)
# # summarise the data in bins to provide further visuals 
# sns.regplot(x, y, ax=ax, n_boot=10, x_estimator=np.median, x_bins=np.logspace(np.log10(lim[0]), np.log10(lim[1]), 20), fit_reg=False, 
#             scatter_kws=dict(alpha=1, zorder=20, facecolor=palette[2], edgecolor='w', linewidth=1, s=100));

ax.set_xlim(*lim)
ax.set_ylim(*lim)
ax.plot(lim, lim, 'k--')
ax.set_xlabel('Left flank')
ax.set_ylabel('Right flank')
ax.set_title('Shared haplotype length (bp)')
ax.set_xscale('log')
ax.set_yscale('log')
r.params









    














    











    Out[66]:





x    0.369611
dtype: float64



In [67]:

    
# compare shared haplotype length after applying adjustment for different 
# recombination rates on left and right flanks

x = pspd_left * rr_left
y = pspd_right * rr_right

fig, ax = plt.subplots(figsize=(7, 7))
ax.plot(x, y, marker='o', markersize=1, mfc='none', mec='k', linestyle=' ', alpha=.1)
lim = 1e-7, 1e-1

df = pandas.DataFrame({'x': x, 'y': y})
r = sfa.ols('y ~ x + 0', data=df).fit()
ax.plot(lim, [l * r.params['x'] for l in lim], linestyle='--', color='r', lw=2)
# sns.regplot(x, y, ax=ax, n_boot=10, x_estimator=np.median, x_bins=np.logspace(np.log10(lim[0]), np.log10(lim[1]), 20), fit_reg=False, 
#             scatter_kws=dict(alpha=1, zorder=20, facecolor='r', edgecolor='w', linewidth=1, s=60));

ax.set_xlim(*lim)
ax.set_ylim(*lim)
ax.plot(lim, lim, 'k--')
ax.set_xlabel('Left flank')
ax.set_ylabel('Right flank')
ax.set_title('Shared haplotype length (Morgan)')
ax.set_xscale('log')
ax.set_yscale('log')
r.params









    














    











    Out[67]:





x    0.99895
dtype: float64



In [68]:

    
# check where the heterochromatin boundary is supposed to be
phase1_ar3.tbl_chromatin









    Out[68]:







0|name
1|chrom
2|start
3|stop




CHX
X
20009764
24393108


CH2R
2R
58984778
61545105


CH2L
2L
1
2431617


PEU2L
2L
2487770
5042389


IH2L
2L
5078962
5788875



...



In [69]:

    
# check mutation rate on left versus right flank.

x = pmuts_left / pspd_left_accessible
y = pmuts_right / pspd_right_accessible
tst = (x > 0) & (y > 0)
x = x[tst]
y = y[tst]

lim = 1e-6, 1e-1
fig, ax = plt.subplots(figsize=(7, 7))
sns.despine(ax=ax, offset=5)
ax.plot(x, y, marker='o', markersize=2, mfc='none', mec='k', linestyle=' ')

df = pandas.DataFrame({'x': x, 'y': y})
r = sfa.ols('y ~ x + 0', data=df).fit()
ax.plot(lim, [l * r.params['x'] for l in lim], linestyle='--', color=palette[2], lw=2)
# sns.regplot(x, y, ax=ax, n_boot=10, x_estimator=np.median, x_bins=np.logspace(np.log10(lim[0]), np.log10(lim[1]), 20), fit_reg=False, 
#             scatter_kws=dict(alpha=1, zorder=20, facecolor=palette[2], edgecolor='w', linewidth=1, s=60));

ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlim(*lim)
ax.set_ylim(*lim)
ax.plot(lim, lim, linestyle='--', color='k')
ax.set_title('Mutations ($bp^{-1}$)')
ax.set_xlabel('Left flank')
ax.set_ylabel('Reft flank')
r.params
# ax.plot(x, y, marker='o', linestyle=' ', markersize=2, mfc='none');









    














    











    Out[69]:





x    1.629204
dtype: float64



In [70]:

    
# check mutations on left versus right flank after correcting mutation rate

x = pmuts_left / (mu_left * pspd_left_accessible)
y = pmuts_right / (mu_right * pspd_right_accessible)
tst = (x > 0) & (y > 0)
x = x[tst]
y = y[tst]

lim = 1e2, 1e7
fig, ax = plt.subplots(figsize=(7, 7))
sns.despine(ax=ax, offset=5)
ax.plot(x, y, marker='o', markersize=2, mfc='none', mec='k', linestyle=' ')

df = pandas.DataFrame({'x': x, 'y': y})
r = sfa.ols('y ~ x + 0', data=df).fit()
ax.plot(lim, [l * r.params['x'] for l in lim], linestyle='--', color=palette[2], lw=2)
# sns.regplot(x, y, ax=ax, n_boot=10, x_estimator=np.median, x_bins=np.logspace(np.log10(lim[0]), np.log10(lim[1]), 20), fit_reg=False, 
#             scatter_kws=dict(alpha=1, zorder=20, facecolor=palette[2], edgecolor='w', linewidth=1, s=60));


ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlim(*lim)
ax.set_ylim(*lim)
ax.plot(lim, lim, linestyle='--', color='k')
ax.set_title('Mutations (TODO units?)')
ax.set_xlabel('Left flank')
ax.set_ylabel('Reft flank')
r.params
# ax.plot(x, y, marker='o', linestyle=' ', markersize=2, mfc='none');









    














    











    Out[70]:





x    0.993815
dtype: float64



In [71]:

    
# now compare estimats of t_hat on left and right flanks, using all adjustments

pt_hat_left = (1 + pmuts_left) / (2 * (pspd_left * rr_left + pspd_left_accessible * mu_left))
pt_hat_right = (1 + pmuts_right) / (2 * (pspd_right * rr_right + pspd_right_accessible * mu_right))
x = np.log10(pt_hat_left)
y = np.log10(pt_hat_right)

fig, ax = plt.subplots(figsize=(7, 7))
ax.plot(x, y, marker='o', markersize=1, mfc='none', mec='k', linestyle=' ', alpha=.1)
lim = 1, 6.5

df = pandas.DataFrame({'x': x, 'y': y})
r = sfa.ols('y ~ x + 0', data=df).fit()
ax.plot(lim, [l * r.params['x'] for l in lim], linestyle='--', color='r', lw=2)
# sns.regplot(x, y, ax=ax, n_boot=10, x_estimator=np.median, x_bins=np.logspace(np.log10(lim[0]), np.log10(lim[1]), 20), fit_reg=False, 
#             scatter_kws=dict(alpha=1, zorder=20, facecolor='r', edgecolor='w', linewidth=1, s=60));

ax.set_xlim(*lim)
ax.set_ylim(*lim)
ax.plot(lim, lim, 'k--')
ax.set_xlabel('Left flank')
ax.set_ylabel('Right flank')
ax.set_title('$\hat{t}$')
ticks = np.arange(lim[0], int(lim[1]) + 1)
ticklabels = ['$10^{%s}$' % t for t in ticks]
ax.set_xticks(ticks)
ax.set_xticklabels(ticklabels)
ax.set_yticks(ticks)
ax.set_yticklabels(ticklabels)
r.params









    














    











    Out[71]:





x    1.00086
dtype: float64

A few diagnostic comparisons between pairwise and earlier neighbour analyses...



In [72]:

    
pspl_right.max(), nspl_right.max()









    Out[72]:





(76267, 76267)



In [73]:

    
pspd_right.max(), nspd_right.max()









    Out[73]:





(1437047, 1437047)



In [74]:

    
pspl_left.max(), nspl_left.max()









    Out[74]:





(15754, 15754)



In [75]:

    
pspd_left.max(), nspd_left.max()









    Out[75]:





(2377961, 2377961)



In [76]:

    
pmuts_right.shape, pmuts_left.shape









    Out[76]:





((1169685,), (1169685,))



In [77]:

    
muts_right.max(), muts_left.max()









    Out[77]:





(14, 23)



In [78]:

    
pmuts_right.max(), pmuts_left.max()









    Out[78]:





(16, 23)



In [79]:

    
np.bincount(pmuts_right)









    Out[79]:





array([482069, 638359,  37050,   8834,   2200,    735,    273,    119,
           23,     11,      1,      2,      1,      2,      4,      1,
            1])



In [80]:

    
np.bincount(pmuts_left)









    Out[80]:





array([639336, 509488,  14927,   3572,   1236,    552,    283,     89,
           43,     59,     32,     18,     13,     28,      7,      1,
            0,      0,      0,      0,      0,      0,      0,      1])



In [81]:

    
fig, ax = plt.subplots()
ax.hist([pmuts_right, pmuts_left], stacked=True, bins=np.arange(25));

Look at the distribution of ages on left, right and combined flanks...



In [82]:

    
pt_hat_right = (1 + pmuts_right) / (2 * (pspd_right * rr_right + pspd_right_accessible * mu_right))
fig, ax = plt.subplots()
ax.hist(pt_hat_right, bins=np.logspace(1, 6, 30))
ax.set_xscale('log')
ax.set_xlabel('$\hat{t}$')
ax.set_ylabel('Frequency (no. pairs)');



In [83]:

    
pt_hat_left = (1 + pmuts_left) / (2 * (pspd_left * rr_left + pspd_left_accessible * mu_left))
fig, ax = plt.subplots()
ax.hist(pt_hat_left, bins=np.logspace(1, 6, 30))
ax.set_xscale('log')
ax.set_xlabel('$\hat{t}$')
ax.set_ylabel('Frequency (no. pairs)');



In [84]:

    
pspd_both = pspd_left + pspd_right
pmuts_both = pmuts_left + pmuts_right
psgl_both = pspd_left * rr_left + pspd_right * rr_right
psml_both = pspd_left_accessible * mu_left + pspd_right_accessible * mu_right

pt_hat_both = (1 + pmuts_both) / (2 * (psgl_both + psml_both))
fig, ax = plt.subplots()
ax.hist(pt_hat_both, bins=np.logspace(1, 6, 30))
ax.set_xscale('log')
ax.set_xlabel('$\hat{t}$')
ax.set_ylabel('Frequency (no. pairs)');

Dendrogram visualisation and clustering



In [85]:

    
def plot_dendrogram(dist, cut_height=1e3, yscale='log', ylim=(10, 1e6), linkage_method='average',
                    n_clusters=14):
    """This function plots a dendrogram using scipy and provides some utilities for annotating clusters."""
    
    z = scipy.cluster.hierarchy.linkage(dist, method=linkage_method)

    fig = plt.figure(figsize=(16, 8), )
    gs = mpl.gridspec.GridSpec(nrows=3, ncols=1, height_ratios=[6, .5, 4], hspace=0)

    ax = fig.add_subplot(gs[0])
    sns.despine(ax=ax, offset=3, bottom=True, top=False)
    r = scipy.cluster.hierarchy.dendrogram(
        z, no_labels=True, count_sort=True, 
        color_threshold=0, 
        above_threshold_color='k',
        ax=ax)
    ax.set_ylim(*ylim)
    ax.set_yscale(yscale)
    # ax.set_ylim(bottom=-1000)
    xmin, xmax = ax.xaxis.get_data_interval()
    xticklabels = np.array(list(range(0, len(df_haplotypes), 200)) + [len(df_haplotypes)])
    xticks = xticklabels / len(df_haplotypes)
    xticks = (xticks * (xmax - xmin)) + xmin
    ax.set_xticks(xticks)
    ax.set_xticklabels(xticklabels)
    ax.set_xlabel('Haplotypes')
    ax.xaxis.set_label_position('top')
    ax.set_ylabel('$\hat{t}$', rotation=0, ha='right')

    cluster_palette = sns.color_palette('Set3', n_colors=12)
    if cut_height:
        ax.axhline(cut_height, linestyle='--', color='k')
        # find clusters
        f = scipy.cluster.hierarchy.fcluster(z, cut_height, criterion='distance')
        # compute cluster sizes
        fsz = np.bincount(f)
        # sort largest first
        fsort = np.argsort(fsz)[::-1]
        # take largest n
        fsort = fsort[:n_clusters]
        # get haplotype indices for each cluster
        clusters = [set(np.nonzero(f == i)[0]) for i in fsort]
        clusters_leaves = [sorted([r['leaves'].index(i) for i in cluster])
                           for cluster in clusters]
        ixs = np.argsort([min(cl) for cl in clusters_leaves])
        clusters = [clusters[i] for i in ixs]
        clusters_leaves = [clusters_leaves[i] for i in ixs]
        for i, cluster_leaves in enumerate(clusters_leaves):
            color = cluster_palette[i % len(cluster_palette)]
            x1, x2 = min(cluster_leaves), max(cluster_leaves)
#             ax.axvline(x1*10, color='k', linestyle='--')
#             ax.axvline(x2*10, color='k', linestyle='--')
            ax.fill_between([x1*10, x2*10], 0, cut_height, color=color, alpha=.4, zorder=20)
#             ax.axvspan(x1*10, x2*10, color=color, zorder=-20, alpha=.5)
            ax.text((x1*10 + x2*10) / 2, ylim[0], str(i), ha='center', va='top')
        
    ax = fig.add_subplot(gs[1], )
    sns.despine(ax=ax, left=True, bottom=True)
    pops = df_haplotypes.population[r['leaves']]
    pop_colors = [phase1_ar3.pop_colors[p] for p in pops]
    ax.broken_barh([(i, 1) for i in range(len(df_haplotypes))], yrange=(0, 1), color=pop_colors)
    ax.set_xlim(0, len(df_haplotypes))
    ax.set_yticks([])
    ax.set_ylabel('Population', rotation=0, ha='right', va='center')
    ax.set_xticks([])
    if cut_height:
        for i, cluster_leaves in enumerate(clusters_leaves):
            color = cluster_palette[i % len(cluster_palette)]
            x1, x2 = min(cluster_leaves), max(cluster_leaves)
            ax.axvline(x1, color='k', linestyle='--', zorder=20)
            ax.axvline(x2, color='k', linestyle='--', zorder=20)

    ax = fig.add_subplot(gs[2])
    hapclust.plot_haplotypes(ax, haps_vgsc_missense[:, r['leaves']], lbl_vgsc_missense)
    ax.set_xlim(0, len(df_haplotypes))
    ax.set_xlabel('Haplotypes')
    if cut_height:
        for i, cluster_leaves in enumerate(clusters_leaves):
            color = cluster_palette[i % len(cluster_palette)]
            x1, x2 = min(cluster_leaves), max(cluster_leaves)
            ax.axvline(x1, color='k', linestyle='--', zorder=20)
            ax.axvline(x2, color='k', linestyle='--', zorder=20)
            ax.axvspan(x1, x2, color=color, zorder=20, alpha=.4)

    gs.tight_layout(fig, h_pad=0)
    return clusters



In [86]:

    
plot_dendrogram(pt_hat_right);



In [87]:

    
plot_dendrogram(pt_hat_left);



In [88]:

    
# see what it looks like on a linear scale
plot_dendrogram(pt_hat_both, yscale='linear', ylim=(-1000, 4e5));



In [89]:

    
# canonical dendrogram we'll use for cluster analyses
clusters = plot_dendrogram(pt_hat_both, cut_height=2e3)

Cladogram visualisation

Try an alternate visualisation method using our own cladogram implementation.



In [90]:

    
def plot_cladogram(dist, yscale='symlog', yscale_kws=None, ylim=(10, 1e6), count_sort=True,
                   linkage_method='average', n_clusters=14, fill_threshold=0, leaf_height=0, linewidth=2):
    
    # perform hierarchical clustering
    z = scipy.cluster.hierarchy.linkage(dist, method=linkage_method)
    # needed for getting leaves in right order
    r = scipy.cluster.hierarchy.dendrogram(
        z, count_sort=count_sort, no_plot=True)

    # setup figure
    fig = plt.figure(figsize=(16, 8), )
    gs = mpl.gridspec.GridSpec(nrows=2, ncols=1, height_ratios=[6, 4], hspace=0)

    # plot cladogram
    ax = fig.add_subplot(gs[0])
    sns.despine(ax=ax, offset=3, bottom=True, top=False)
    colors = [phase1_ar3.pop_colors[p] for p in df_haplotypes.population]
    hapclust.cladogram(z, count_sort=True, colors=colors, 
                       fill_threshold=fill_threshold, leaf_height=leaf_height, 
                       plot_kws=dict(linewidth=linewidth), 
                       fill_kws=dict(linewidth=linewidth), 
                       ax=ax)
    ax.set_ylim(*ylim)
    if yscale_kws is None:
        yscale_kws = dict()
    ax.set_yscale(yscale, **yscale_kws)
    # ax.set_ylim(bottom=-1000)
    xmin, xmax = ax.xaxis.get_data_interval()
    xticklabels = np.array(list(range(0, len(r['leaves']), 200)) + [len(r['leaves'])])
    xticks = xticklabels / len(r['leaves'])
    xticks = (xticks * (xmax - xmin)) + xmin
    ax.set_xticks(xticks)
    ax.set_xticklabels(xticklabels)
    ax.set_xlabel('Haplotypes')
    ax.xaxis.set_label_position('top')
    ax.set_ylabel('$\hat{t}$', rotation=0, ha='right')
    ax.grid(axis='y', linestyle='--', zorder=-1e9)
        
    # plot display haplotypes
    ax = fig.add_subplot(gs[1])
    hapclust.plot_haplotypes(ax, haps_vgsc_missense[:, r['leaves']], lbl_vgsc_missense)
    ax.set_xlim(0, len(r['leaves']))
    ax.set_xlabel('Haplotypes')

    gs.tight_layout(fig, h_pad=0)



In [91]:

    
plot_cladogram(pt_hat_both, fill_threshold=0, linewidth=.5, leaf_height=10,
               ylim=(-10, 1e6), yscale='symlog', yscale_kws=dict(linthreshy=100, linscaley=1.0, subsy=list(range(1, 10))))



In [92]:

    
# option to fill clades
plot_cladogram(pt_hat_both, fill_threshold=2e3, linewidth=.5, leaf_height=10,
               ylim=(-10, 1e6), yscale='symlog', yscale_kws=dict(linthreshy=100, linscaley=1.0, subsy=list(range(1, 10))))

Further analysis of haplotype age



In [93]:

    
haps_vgsc_missense









    Out[93]:




<HaplotypeArray shape=(16, 1530) dtype=int8> 0 1 2 3 4 ... 1525 1526 1527 1528 1529
0 0 0 0 0 0 ... 0 0 0 0 0
1 0 0 0 0 0 ... 0 0 0 0 0
2 0 0 0 0 0 ... 0 0 0 0 0
... ...
13 0 0 0 0 0 ... 0 0 0 0 0
14 0 0 0 1 0 ... 0 0 0 0 0
15 0 0 0 0 0 ... 0 0 0 0 0



In [94]:

    
[len(c) for c in clusters]









    Out[94]:





[12, 17, 42, 51, 459, 16, 18, 73, 14, 194, 79, 12, 108, 165]



In [95]:

    
# pick these out of the clusters we made earlier
f1 = clusters[4]
f2 = clusters[8]
f3 = clusters[3]
f4 = clusters[2]
f5 = clusters[9]
s1 = clusters[12]
s2 = clusters[10]
s3 = clusters[13]
s4 = clusters[7]
l1 = clusters[6]



In [96]:

    
# rhg = "resistance haplogroups"
rhg = [f1, f2, f3, f4, f5, s1, s2, s3, s4, l1]
rhg_labels = 'F1 F2 F3 F4 F5 S1 S2 S3 S4 L1'.split()



In [97]:

    
for g, l in zip(rhg, rhg_labels):
    print(l)
    print(collections.Counter(df_haplotypes.population[sorted(g)]).most_common())









    



F1
[('BFS', 162), ('BFM', 113), ('AOM', 90), ('GNS', 62), ('CMS', 32)]
F2
[('CMS', 14)]
F3
[('CMS', 51)]
F4
[('CMS', 26), ('GAS', 16)]
F5
[('CMS', 170), ('GAS', 24)]
S1
[('UGS', 108)]
S2
[('GAS', 71), ('CMS', 8)]
S3
[('UGS', 98), ('KES', 67)]
S4
[('CMS', 73)]
L1
[('BFM', 18)]



In [98]:

    
# compare F haplogroups
fig, ax = plt.subplots()
for g, l, cl in zip(rhg[:5], rhg_labels[:5], palette):
    ixs = allel.condensed_coords_within(g, len(df_haplotypes))
    x = np.log10(pt_hat_both[ixs])
    sns.distplot(x, bins=np.linspace(1, 3, 20), label=l, hist=False, 
                 kde_kws=dict(linewidth=2, color=cl))
    ax.axvline(np.median(x), color=cl, linestyle='--', lw=2)
ax.set_ylabel('Density')
ax.set_xlim(1, 4)
ticks = [1, 2, 3, 4]
ax.set_xticks(ticks)
ax.set_xticklabels(['$10^{%s}$' % t for t in ticks])
ax.set_xlabel('$\hat{t}$ (generations)');



In [99]:

    
# compare S haplogroups
fig, ax = plt.subplots()
for g, l, cl in zip(rhg[5:], rhg_labels[5:], palette):
    ixs = allel.condensed_coords_within(g, len(df_haplotypes))
    x = np.log10(pt_hat_both[ixs])
    sns.distplot(x, bins=np.linspace(1, 3, 20), label=l, hist=False, 
                 kde_kws=dict(linewidth=2, color=cl))
    ax.axvline(np.median(x), color=cl, linestyle='--', lw=2)
ax.set_ylabel('Density')
ax.set_xlim(1, 4)
ticks = [1, 2, 3, 4]
ax.set_xticks(ticks)
ax.set_xticklabels(['$10^{%s}$' % t for t in ticks])
ax.set_xlabel('$\hat{t}$ (generations)');



In [100]:

    
def bootstrap(x, n, f):
    dtype = np.array(f(x)).dtype
    out = np.zeros(n, dtype=dtype)
    for i in range(n):
        ix = np.random.choice(x.shape[0], size=x.shape[0], replace=True)
        out[i] = f(x[ix])
    return out



In [101]:

    
def distplot_cluster_pop_t_hat(cluster, title):
    cluster = list(cluster)
    pops_clst = df_haplotypes.population[cluster]
    pops_unique = pops_clst.unique()
    pops_subclusters = [np.array(cluster)[(pops_clst == p).values] for p in pops_unique]

    fig, ax = plt.subplots()
    for g, l in zip(pops_subclusters, pops_unique):
        ixs = allel.condensed_coords_within(g, len(df_haplotypes))
        x = np.log10(pt_hat_both[ixs])
        color = phase1_ar3.pop_colors[l]
        md = np.median(x)
        bs = bootstrap(x, 1000, np.median)
        lbl = '%s (%.0f; 95%% CI [%.0f, %.0f])' % (l, 10**md, 
                                               10**np.percentile(bs, 2.5), 
                                               10**np.percentile(bs, 97.5))
        sns.distplot(x, bins=np.linspace(1, 3, 20), label=lbl, hist=False, 
                     kde_kws=dict(linewidth=2, color=color, label=lbl))
        ax.axvspan(np.percentile(bs, 2.5), np.percentile(bs, 97.5), 
                   color=color, alpha=.3)
        ax.axvline(md, color=color, lw=2, linestyle='--')
    
    ax.set_xlabel('$\hat{t}$')
    ax.set_ylabel('Density')
    ax.set_xlim(1, 4)
    ticks = [1, 2, 3, 4]
    ax.set_xticks(ticks)
    ax.set_xticklabels(['$10^{%s}$' % t for t in ticks])
    ax.set_xlabel('$\hat{t}$ (generations)')
    ax.set_title(title)
    ax.legend(bbox_to_anchor=(1, 1), loc='upper right')
    fig.tight_layout()



In [102]:

    
distplot_cluster_pop_t_hat(f1, 'Haplogroup F1')



In [103]:

    
distplot_cluster_pop_t_hat(f2, 'Haplogroup F2')



In [104]:

    
distplot_cluster_pop_t_hat(f3, 'Haplogroup F3')



In [105]:

    
distplot_cluster_pop_t_hat(f4, 'Haplogroup F4')



In [106]:

    
distplot_cluster_pop_t_hat(f5, 'Haplogroup F5')



In [107]:

    
distplot_cluster_pop_t_hat(s1, 'Haplogroup S1')



In [108]:

    
distplot_cluster_pop_t_hat(s2, 'Haplogroup S2')



In [109]:

    
distplot_cluster_pop_t_hat(s3, 'Haplogroup S3')



In [110]:

    
distplot_cluster_pop_t_hat(s4, 'Haplogroup S4')



In [111]:

    
distplot_cluster_pop_t_hat(l1, 'Haplogroup L1')



In [112]:

    
for i, l in enumerate(lbl_vgsc_missense):
    print(i, l)









    



0 Arg254Lys
1 Asp466His
2 Thr791Met
3 Leu995Ser
4 Leu995Phe
5 Ala1125Val
6 Ile1527Thr
7 Glu1597Gly
8 Ala1746Ser
9 Val1853Ile
10 Ile1868Thr
11 Pro1874Ser
12 Pro1874Leu
13 Phe1920Ser
14 Ala1934Val
15 Ile1940Thr



In [113]:

    
cluster = sorted(f1)
haps_missense_clst = haps_vgsc_missense[:, cluster]
haps_missense_clst









    Out[113]:




<HaplotypeArray shape=(16, 459) dtype=int8> 0 1 2 3 4 ... 454 455 456 457 458
0 0 0 0 0 0 ... 0 0 0 0 0
1 0 0 0 0 0 ... 0 0 0 0 0
2 0 0 0 0 0 ... 0 0 0 0 0
... ...
13 0 0 0 0 0 ... 0 0 0 0 0
14 0 0 1 0 0 ... 0 0 0 0 0
15 0 0 0 0 0 ... 0 0 0 0 0



In [114]:

    
np.count_nonzero(haps_missense_clst[2, :] == 1)









    Out[114]:





32



In [115]:

    
# compare mutations

ix_muts = [2, 7, 8, 9, 10, 11, 12, 14, 15]
pops_subclusters = [np.array(cluster)[(haps_missense_clst[i, :] == 1)] for i in ix_muts]
# add in "vanilla" haplotypes with only kdr
pops_subclusters.append(np.array(cluster)[np.all(haps_missense_clst[ix_muts, :] == 0, axis=0)])

lbls = ['F1+%s' % lbl_vgsc_missense[i] for i in ix_muts]
lbls.append('F1')
                        
fig, ax = plt.subplots()
palette = sns.color_palette('hls', n_colors=len(ix_muts)+1)
for g, l, color in zip(pops_subclusters, lbls, palette):
    if len(g) > 0:
        log(l, len(g))
        ixs = allel.condensed_coords_within(g, len(df_haplotypes))
        x = np.log10(pt_hat_both[ixs])
        md = np.median(x)
        bs = bootstrap(x, 1000, np.median)
        lbl = '%s (%.0f; 95%% CI [%.0f, %.0f])' % (l, 10**md, 
                                               10**np.percentile(bs, 2.5), 
                                               10**np.percentile(bs, 97.5))
        sns.distplot(x, bins=np.linspace(1, 3, 20), label=lbl, hist=False, 
                     kde_kws=dict(linewidth=2, color=color, label=lbl))
#     ax.axvspan(np.percentile(bs, 2.5), np.percentile(bs, 97.5), 
#                color=color, alpha=.3)
    ax.axvline(md, color=color, lw=2, linestyle='--')

ax.set_xlabel('$\hat{t}$')
ax.set_ylabel('Density')
ax.set_xlim(1, 3.5)
ticks = [1, 2, 3]
ax.set_xticks(ticks)
ax.set_xticklabels(['$10^{%s}$' % t for t in ticks])
ax.set_xlabel('$\hat{t}$ (generations)')
ax.legend(bbox_to_anchor=(1, 1), loc='upper right')
fig.tight_layout()









    














    











    



F1+Thr791Met 32
F1+Glu1597Gly 11
F1+Ala1746Ser 28
F1+Val1853Ile 13
F1+Ile1868Thr 52
F1+Pro1874Ser 26
F1+Pro1874Leu 79
F1+Ala1934Val 16
F1+Ile1940Thr 6
F1 225



In [116]:

    
def distplot_cluster_pop_between_t_hat(cluster, pop1, pop2, title):
    cluster = list(cluster)
    pops_clst = df_haplotypes.population[cluster]
    pops_subclusters = [np.array(cluster)[(pops_clst == p).values] for p in [pop1, pop2]]

    fig, ax = plt.subplots()
    for g, l in zip(pops_subclusters, [pop1, pop2]):
        ixs = allel.condensed_coords_within(g, len(df_haplotypes))
        x = np.log10(pt_hat_both[ixs])
        color = phase1_ar3.pop_colors[l]
        md = np.median(x)
        bs = bootstrap(x, 1000, np.median)
        lbl = '%s (%.0f; 95%% CI [%.0f, %.0f])' % (l, 10**md, 
                                               10**np.percentile(bs, 2.5), 
                                               10**np.percentile(bs, 97.5))
        sns.distplot(x, bins=np.linspace(1, 3, 20), label=lbl, hist=False, 
                     kde_kws=dict(linewidth=2, color=color, label=lbl))
        ax.axvspan(np.percentile(bs, 2.5), np.percentile(bs, 97.5), 
                   color=color, alpha=.3)
        ax.axvline(md, color=color, lw=2, linestyle='--')

    ixs = allel.condensed_coords_between(pops_subclusters[0], pops_subclusters[1], len(df_haplotypes))
    x = np.log10(pt_hat_both[ixs])
    color = 'k'
    md = np.median(x)
    bs = bootstrap(x, 1000, np.median)
    lbl = '%s vs %s (%.0f; 95%% CI [%.0f, %.0f])' % (pop1, pop2, 10**md, 
                                           10**np.percentile(bs, 2.5), 
                                           10**np.percentile(bs, 97.5))
    sns.distplot(x, bins=np.linspace(1, 3, 20), label=lbl, hist=False, 
                 kde_kws=dict(linewidth=2, color=color, label=lbl))
    ax.axvspan(np.percentile(bs, 2.5), np.percentile(bs, 97.5), 
               color=color, alpha=.3)
    ax.axvline(md, color=color, lw=2, linestyle='--')

    ax.set_xlabel('$\hat{t}$')
    ax.set_ylabel('Density')
    ax.set_xlim(1, 4)
    ticks = [1, 2, 3, 4]
    ax.set_xticks(ticks)
    ax.set_xticklabels(['$10^{%s}$' % t for t in ticks])
    ax.set_xlabel('$\hat{t}$ (generations)')
    ax.set_title(title)
    ax.legend(bbox_to_anchor=(1, 1), loc='upper right')
    fig.tight_layout()



In [117]:

    
distplot_cluster_pop_between_t_hat(f1, 'BFS', 'BFM', 'Haplogroup F1')



In [118]:

    
distplot_cluster_pop_between_t_hat(f1, 'BFS', 'CMS', 'Haplogroup F1')



In [119]:

    
distplot_cluster_pop_between_t_hat(f1, 'BFS', 'AOM', 'Haplogroup F1')



In [120]:

    
distplot_cluster_pop_between_t_hat(f1, 'BFM', 'AOM', 'Haplogroup F1')



In [121]:

    
distplot_cluster_pop_between_t_hat(f1, 'CMS', 'AOM', 'Haplogroup F1')



In [122]:

    
distplot_cluster_pop_between_t_hat(f4, 'CMS', 'GAS', 'Haplogroup F4')



In [123]:

    
distplot_cluster_pop_between_t_hat(f5, 'CMS', 'GAS', 'Haplogroup F5')



In [124]:

    
distplot_cluster_pop_between_t_hat(s2, 'CMS', 'GAS', 'Haplogroup S2')



In [125]:

    
distplot_cluster_pop_between_t_hat(s3, 'UGS', 'KES', 'Haplogroup S3')

TODO: break down F1 by population and mutation



In [ ]:

	label	ox_code	population	label_aug	country	region	sex	m_s	kt_2la	kt_2rb
index
0	AB0085-Ca	AB0085-C	BFS	AB0085-Ca [Burkina Faso, Pala, S, F]	Burkina Faso	Pala	F	S	2.0	2.0
1	AB0085-Cb	AB0085-C	BFS	AB0085-Cb [Burkina Faso, Pala, S, F]	Burkina Faso	Pala	F	S	2.0	2.0
2	AB0087-Ca	AB0087-C	BFM	AB0087-Ca [Burkina Faso, Bana, M, F]	Burkina Faso	Bana	F	M	2.0	1.0
3	AB0087-Cb	AB0087-C	BFM	AB0087-Cb [Burkina Faso, Bana, M, F]	Burkina Faso	Bana	F	M	2.0	1.0
4	AB0088-Ca	AB0088-C	BFM	AB0088-Ca [Burkina Faso, Bana, M, F]	Burkina Faso	Bana	F	M	2.0	0.0

0\|name	1\|chrom	2\|start	3\|stop
CHX	X	20009764	24393108
CH2R	2R	58984778	61545105
CH2L	2L	1	2431617
PEU2L	2L	2487770	5042389
IH2L	2L	5078962	5788875

	0	1	2	3	4	...	1525	1526	1527	1528	1529
0	0	0	0	0	0	...	0	0	0	0	0
1	0	0	0	0	0	...	0	0	0	0	0
2	0	0	0	0	0	...	0	0	0	0	0
...	...
13	0	0	0	0	0	...	0	0	0	0	0
14	0	0	0	1	0	...	0	0	0	0	0
15	0	0	0	0	0	...	0	0	0	0	0

	0	1	2	3	4	...	454	455	456	457	458
0	0	0	0	0	0	...	0	0	0	0	0
1	0	0	0	0	0	...	0	0	0	0	0
2	0	0	0	0	0	...	0	0	0	0	0
...	...
13	0	0	0	0	0	...	0	0	0	0	0
14	0	0	1	0	0	...	0	0	0	0	0
15	0	0	0	0	0	...	0	0	0	0	0

	0	1	2	3	4	...	1525	1526	1527	1528	1529
0	0	0	0	0	0	...	0	0	0	0	0
1	0	0	0	0	0	...	0	0	0	0	0
2	0	0	0	0	0	...	0	0	0	0	0
...	...
13	0	0	0	0	0	...	0	0	0	0	0
14	0	0	0	1	0	...	0	0	0	0	0
15	0	0	0	0	0	...	0	0	0	0	0

	0	1	2	3	4	...	454	455	456	457	458
0	0	0	0	0	0	...	0	0	0	0	0
1	0	0	0	0	0	...	0	0	0	0	0
2	0	0	0	0	0	...	0	0	0	0	0
...	...
13	0	0	0	0	0	...	0	0	0	0	0
14	0	0	1	0	0	...	0	0	0	0	0
15	0	0	0	0	0	...	0	0	0	0	0