Estimate haplotype ages

this is a notebook of experimental analyses that was not included in the VGSC paper
it will not run without additional input files

Setup



In [1]:

    
%run setup.ipynb
%matplotlib inline
import hapclust

Data setup



In [2]:

    
# load data
callset_haps = np.load('../data/haps_phase1.npz')
haps = allel.HaplotypeArray(callset_haps['haplotypes'])
pos = allel.SortedIndex(callset_haps['POS'])
ann = callset_haps['ANN']



In [3]:

    
pos_kdr_s = 2422651
pos_kdr_f = 2422652



In [4]:

    
collections.Counter(ann['Annotation'])









    Out[4]:





Counter({b'3_prime_UTR_variant': 5987,
         b'5_prime_UTR_premature_start_codon_': 494,
         b'5_prime_UTR_variant': 2971,
         b'downstream_gene_variant': 41916,
         b'initiator_codon_variant': 5,
         b'intergenic_region': 102204,
         b'intragenic_variant': 48,
         b'intron_variant': 77277,
         b'missense_variant': 12387,
         b'missense_variant&splice_region_var': 160,
         b'splice_acceptor_variant&intron_var': 45,
         b'splice_donor_variant&intron_varian': 50,
         b'splice_region_variant': 70,
         b'splice_region_variant&intron_varia': 1448,
         b'splice_region_variant&stop_retaine': 11,
         b'splice_region_variant&synonymous_v': 207,
         b'start_lost': 15,
         b'stop_gained': 85,
         b'stop_gained&splice_region_variant': 1,
         b'stop_lost': 2,
         b'stop_lost&splice_region_variant': 9,
         b'stop_retained_variant': 9,
         b'synonymous_variant': 19507,
         b'upstream_gene_variant': 77090})



In [5]:

    
# perform allele count - needed to locate singletons  
ac = haps.count_alleles(max_allele=3)
ac[:1]









    Out[5]:




<AlleleCountsArray shape=(1, 4) dtype=int32> 0 1 2 3
0 1529    1    0    0

Setup missense variation



In [6]:

    
tbl_variants_selected = etl.frompickle('../data/tbl_variants_missense_selected.pkl')
tbl_variants_selected









    Out[6]:







0|CHROM
1|POS
2|num_alleles
3|REF
4|ALT
5|AC
6|ALTIX
7|FILTER_PASS
8|NoCoverage
9|LowCoverage
10|HighCoverage
11|LowMQ
12|HighMQ0
13|RepeatDUST
14|RepeatMasker
15|RepeatTRF
16|FS
17|HRun
18|QD
19|ReadPosRankSum
20|SNPEFF_Allele
21|SNPEFF_Annotation
22|SNPEFF_HGVS_c
23|SNPEFF_HGVS_p
24|SNPEFF_Feature_ID
25|SNPEFF_CDS_pos
26|AF_AOM
27|AF_BFM
28|AF_GWA
29|AF_GNS
30|AF_BFS
31|AF_CMS
32|AF_GAS
33|AF_UGS
34|AF_KES
35|check_allele
36|exon_start
37|exon_end
38|exon
39|AGAP004707-RA
40|AGAP004707-RB
41|AGAP004707-RC
42|Davies-C1N2
43|Davies-C3N2
44|Davies-C5N2
45|Davies-C7N2
46|Davies-C8N2
47|Davies-C10N2
48|Davies-C11N2
49|Davies-C1N9
50|Davies-C8N9
51|Davies-C1N9ck




2L
2390177
2
G
A
198
0
True
0
3
8
0
0
False
False
False
0.12695
1
18.625
0.83496
A
missense_variant
n.761G>A
p.Arg254Lys
AGAP004707-RA
761
0.0
0.0
0.0
0.0
0.0
0.316363636364
0.214285714286
0.0
0.0
True
2390129
2390341
7
R254K
R254K
R254K
R260K
R217K
R260K
R260K
R271K
R260K
R260K
R260K
R271K
R260K


2L
2391228
3
G
C
10
0
True
0
0
12
0
0
False
False
False
2.0352
0
14.867
-1.1777
C
missense_variant
n.1204G>C
p.Val402Leu
AGAP004707-RA
1204
0.0
0.0724637681159
0.0
0.0
0.0
0.0
0.0
0.0
0.0
True
2391156
2391320
10
V402L
V402L
V402L
V408L
V365L

V408L
V419L
V408L
V408L
V408L
V419L
V408L


2L
2391228
3
G
T
9
1
True
0
0
12
0
0
False
False
False
2.0352
0
14.867
-1.1777
None
None
None
None
None
None
0.0
0.0652173913043
0.0
0.0
0.0
0.0
0.0
0.0
0.0
True
2391156
2391320
10
V402L
V402L
V402L
V408L
V365L

V408L
V419L
V408L
V408L
V408L
V419L
V408L


2L
2399997
2
G
C
38
0
True
0
1
7
0
0
False
False
False
13.359
0
15.688
0.11798
C
missense_variant
n.1396G>C
p.Asp466His
AGAP004707-RA
1396
0.0
0.0
0.0
0.0
0.0
0.0690909090909
0.0
0.0
0.0
True
2399898
2400173
11i+
D466H
D466H
D466H
D472H
D429H
D417H
D472H
D483H
D472H
D472H
D472H
D483H
D472H


2L
2400071
3
G
A
16
0
True
0
0
8
0
0
False
False
False
5.6875
0
16.969
1.3232
A
missense_variant
n.1470G>A
p.Met490Ile
AGAP004707-RA
1470
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.181818181818
True
2399898
2400173
11i+
M490I
M490I
M490I
M496I
M453I
M441I
M496I
M507I
M496I
M496I
M496I
M507I
M496I



...



In [7]:

    
tbl_selected_redux = (
    tbl_variants_selected
    .cut('POS', 'REF', 'ALT', 'AGAP004707-RA')
    .mergeduplicates(key=('POS'))
    .convert('ALT', lambda v: ','.join(v) if len(v) > 1 else v)
    .addfield('label',  lambda rec: '%s:%s>%s %s' % (rec.POS, rec.REF, rec.ALT.ljust(3), rec['AGAP004707-RA'].rjust(6)))
    .sort('POS')
)
tbl_selected_redux.display(vrepr=repr)









    







0|POS
1|REF
2|ALT
3|AGAP004707-RA
4|label




2390177
'G'
'A'
'R254K'
'2390177:G>A    R254K'


2391228
'G'
'C,T'
'V402L'
'2391228:G>C,T  V402L'


2399997
'G'
'C'
'D466H'
'2399997:G>C    D466H'


2400071
'G'
'A,T'
'M490I'
'2400071:G>A,T  M490I'


2416980
'C'
'T'
'T791M'
'2416980:C>T    T791M'



...



In [8]:

    
lbl_vgsc_missense = tbl_selected_redux.values('label').list()
lbl_vgsc_missense









    Out[8]:





['2390177:G>A    R254K',
 '2391228:G>C,T  V402L',
 '2399997:G>C    D466H',
 '2400071:G>A,T  M490I',
 '2416980:C>T    T791M',
 '2422651:T>C    L995S',
 '2422652:A>T    L995F',
 '2424384:C>T   A1125V',
 '2425077:G>A   V1254I',
 '2429617:T>C   I1527T',
 '2429745:A>T   N1570Y',
 '2429897:A>G   E1597G',
 '2429915:A>C   K1603T',
 '2430424:G>T   A1746S',
 '2430817:G>A   V1853I',
 '2430863:T>C   I1868T',
 '2430880:C>T   P1874S',
 '2430881:C>T   P1874L',
 '2431019:T>C   F1920S',
 '2431061:C>T   A1934V',
 '2431079:T>C   I1940T']



In [9]:

    
pos_selected = tbl_selected_redux.values('POS').array()
pos_selected









    Out[9]:





array([2390177, 2391228, 2399997, 2400071, 2416980, 2422651, 2422652,
       2424384, 2425077, 2429617, 2429745, 2429897, 2429915, 2430424,
       2430817, 2430863, 2430880, 2430881, 2431019, 2431061, 2431079], dtype=int32)



In [10]:

    
loc_vgsc_missense = pos.locate_keys(pos_selected)
np.count_nonzero(loc_vgsc_missense)









    Out[10]:





21



In [11]:

    
haps_vgsc_missense = haps[loc_vgsc_missense]
haps_vgsc_missense









    Out[11]:




<HaplotypeArray shape=(21, 1530) dtype=int8> 0 1 2 3 4 ... 1525 1526 1527 1528 1529
0 0 0 0 0 0 ... 0 0 0 0 0
1 0 0 2 0 0 ... 0 0 0 0 0
2 0 0 0 0 0 ... 0 0 0 0 0
... ...
18 0 0 0 0 0 ... 0 0 0 0 0
19 0 0 0 1 0 ... 0 0 0 0 0
20 0 0 0 0 0 ... 0 0 0 0 0

Split up haplotype data into neutral/mutation

Here we divide haplotype data up into two, with an "EHH" set containing no singletons and only neutral biallelic variants, which we'll use for analysis of haplotype sharing, and a "mut" set containing singletons and putatively non-neutral bi-allelic variants that we will use for analysis of mutations on shared haplotypes.



In [12]:

    
# define types of variants to include in EHH analysis - should be mostly neutral
loc_type_neutral = ((ann['Annotation'] == b'intergenic_region') | 
                    (ann['Annotation'] == b'intron_variant') |
                    (ann['Annotation'] == b'downstream_gene_variant') |
                    (ann['Annotation'] == b'upstream_gene_variant') |
                    (ann['Annotation'] == b'synonymous_variant') |
                    (ann['Annotation'] == b'3_prime_UTR_variant') |
                    (ann['Annotation'] == b'5_prime_UTR_variant') 
                    )
np.count_nonzero(loc_type_neutral), loc_type_neutral.shape









    Out[12]:





(326952, (341998,))



In [13]:

    
# locate singletons - will exclude from EHH analysis
# NB the EHH analysis doesn't need the multiallelics 
loc_sgl_bi = (ac[:, :2].min(axis=1) == 1) & (ac.is_biallelic_01())
loc_nosgl_bi = (ac[:, :2].min(axis=1) > 1) & (ac.is_biallelic_01())
np.count_nonzero(loc_sgl_bi), np.count_nonzero(loc_nosgl_bi), loc_nosgl_bi.shape









    Out[13]:





(107220, 234555, (341998,))



In [14]:

    
# these are the variants to use for EHH 
loc_ehh = loc_type_neutral & loc_nosgl_bi
np.count_nonzero(loc_ehh), loc_ehh.shape









    Out[14]:





(225790, (341998,))



In [15]:

    
# these are the variants to use for mutational distance
# include non-neutral mutations
loc_mut = loc_sgl_bi | ~loc_type_neutral
np.count_nonzero(loc_mut), loc_mut.shape









    Out[15]:





(116002, (341998,))



In [16]:

    
haps_mut = haps[loc_mut]
pos_mut = pos[loc_mut]



In [17]:

    
haps_mut









    Out[17]:




<HaplotypeArray shape=(116002, 1530) dtype=int8> 0 1 2 3 4 ... 1525 1526 1527 1528 1529
0 0 0 0 0 0 ... 0 0 0 0 0
1 0 0 0 0 0 ... 0 0 0 0 0
2 0 0 0 0 0 ... 0 0 0 0 0
... ...
115999 0 0 0 0 0 ... 0 0 0 0 0
116000 0 0 0 0 0 ... 0 0 0 0 0
116001 0 0 0 0 0 ... 0 0 0 0 0



In [18]:

    
haps_ehh = haps[loc_ehh]
pos_ehh = pos[loc_ehh]



In [19]:

    
#check
pos_mut.locate_key(pos_kdr_s), pos_mut.locate_key(pos_kdr_f)









    Out[19]:





(10212, 10213)



In [20]:

    
# read in haplotype metadata to get population
df_haplotypes = phase1_ar31.df_haplotypes
df_haplotypes = df_haplotypes[df_haplotypes.population != 'colony']
df_haplotypes.head()









    Out[20]:







  
    
      
      label
      ox_code
      population
      label_aug
      country
      region
      sex
      m_s
      kt_2la
      kt_2rb
    
    
      index
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      0
      AB0085-Ca
      AB0085-C
      BFS
      AB0085-Ca [Burkina Faso, Pala, S, F]
      Burkina Faso
      Pala
      F
      S
      2.0
      2.0
    
    
      1
      AB0085-Cb
      AB0085-C
      BFS
      AB0085-Cb [Burkina Faso, Pala, S, F]
      Burkina Faso
      Pala
      F
      S
      2.0
      2.0
    
    
      2
      AB0087-Ca
      AB0087-C
      BFM
      AB0087-Ca [Burkina Faso, Bana, M, F]
      Burkina Faso
      Bana
      F
      M
      2.0
      1.0
    
    
      3
      AB0087-Cb
      AB0087-C
      BFM
      AB0087-Cb [Burkina Faso, Bana, M, F]
      Burkina Faso
      Bana
      F
      M
      2.0
      1.0
    
    
      4
      AB0088-Ca
      AB0088-C
      BFM
      AB0088-Ca [Burkina Faso, Bana, M, F]
      Burkina Faso
      Bana
      F
      M
      2.0
      0.0

Split flanks

Split haplotype data into L/R flanks on kdr position.



In [21]:

    
# set core SNP
core_pos = pos_kdr_f



In [22]:

    
# split the EHH dataset
dist_ehh_right, dist_ehh_left, haps_ehh_right, haps_ehh_left = hapclust.split_flanks(haps_ehh, pos_ehh, core_pos)



In [23]:

    
# these give the distance from the core position to each downstream variant, moving away from the core
dist_ehh_right









    Out[23]:





array([     45,     255,     258, ..., 3576600, 3576601, 3576602], dtype=int32)



In [24]:

    
dist_ehh_right.shape, dist_ehh_left.shape, dist_ehh_right.min(), dist_ehh_left.min()









    Out[24]:





((210036,), (15754,), 45, 108)



In [25]:

    
# split the mutations dataset
dist_mut_right, dist_mut_left, haps_mut_right, haps_mut_left = hapclust.split_flanks(haps_mut, pos_mut, core_pos)



In [26]:

    
dist_mut_right









    Out[26]:





array([      0,      82,     353, ..., 3576463, 3576534, 3576540], dtype=int32)



In [27]:

    
dist_mut_right.shape, dist_mut_left.shape, dist_mut_right.min(), dist_mut_left.min()









    Out[27]:





((105789,), (10213,), 0, 1)



In [28]:

    
haps_ehh_left.shape, haps_ehh_right.shape, haps_mut_left.shape, haps_mut_right.shape









    Out[28]:





((15754, 1530), (210036, 1530), (10213, 1530), (105789, 1530))



In [29]:

    
is_accessible = phase1_ar3.accessibility['2L/is_accessible'][:]

nn_spl = neighbour sharing prefix length
nn_spd = neighbour shared physical distance



In [30]:

    
nn_idx_sorted_right, nn_spl_right, nn_spd_right, nn_muts_right = hapclust.neighbour_haplotype_sharing(haps_ehh_right, haps_mut_right, dist_ehh_right, dist_mut_right)



In [31]:

    
nn_idx_sorted_left, nn_spl_left, nn_spd_left, nn_muts_left = hapclust.neighbour_haplotype_sharing(haps_ehh_left, haps_mut_left, dist_ehh_left, dist_mut_left)



In [32]:

    
nn_spl_right.min(), nn_spl_right.max()









    Out[32]:





(0, 89166)



In [33]:

    
nn_spd_right.min(), nn_spd_right.max(), nn_spd_left.min(), nn_spd_left.max()









    Out[33]:





(45, 1544571, 108, 2377961)



In [34]:

    
nn_muts_right.min(), nn_muts_right.max(), nn_muts_left.min(), nn_muts_left.max()









    Out[34]:





(0, 14, 0, 15)



In [35]:

    
# compute accessible lengths - needed for mutation analysis
nn_spd_right_accessible = hapclust.haplotype_accessible_length(nn_spd_right, core_pos=core_pos, is_accessible=is_accessible, flank='right')
nn_spd_left_accessible = hapclust.haplotype_accessible_length(nn_spd_left, core_pos=core_pos, is_accessible=is_accessible, flank='left')



In [36]:

    
def nn_accessibility_diagnostics():

    # get some diagnostics on accessibility on left versus right flanks
    fig, ax = plt.subplots(figsize=(8, 8))
    x = nn_spd_right
    y = nn_spd_right_accessible
    ax.plot(x, y, linestyle=' ', marker='o', markersize=4, label='Right flank')
    x = nn_spd_left
    y = nn_spd_left_accessible
    ax.plot(x, y, linestyle=' ', marker='o', markersize=4, label='Left flank')
    ax.set_xlabel('Physical length')
    ax.set_ylabel('Accessible length')
    lim = 0, x.max()
    ax.set_xlim(*lim)
    ax.set_ylim(*lim)
    ax.plot(lim, lim, 'k--')
    ax.legend()
    ax.set_title('Neighbour shared haplotype length (bp)')
    fig.tight_layout()
    

nn_accessibility_diagnostics()



In [37]:

    
# 1 cM/Mb convert to M/bp
1 / (1e2 * 1e6)









    Out[37]:





1e-08



In [38]:

    
# assume constant recombination rate
rr_right = 1.5e-8
# adjust recombination rate on left flank (factor derived from pairwise analysis below)
rr_left = rr_right * 0.37

# assumed mutation rate
mu_right = 3.5e-9
# adjust mutation rate on left flank (factor derived from pairwise analysis below)
mu_left = mu_right * 0.61



In [39]:

    
def plot_nn_right_flank():

    pops_right = df_haplotypes.population[nn_idx_sorted_right]
    pop_colors_right = [phase1_ar3.pop_colors[p] for p in pops_right]

    fig = plt.figure(figsize=(20, 10))
    hapclust.fig_neighbour_haplotype_sharing(nspd=nn_spd_right, 
                                             nspd_accessible=nn_spd_right_accessible,
                                             muts=nn_muts_right, 
                                             haps_display=haps_vgsc_missense[:, nn_idx_sorted_right],
                                             haps_display_vlbl=lbl_vgsc_missense,
                                             pop_colors=pop_colors_right,
                                             nspd_cut=2e4,
                                             nspd_ylim=(1e3, 3e6),
                                             that_ylim=(2e1, 5e5),
                                             muts_ylim=(0, 25),
                                             mu=mu_right, rr=rr_right,
                                             fig=fig)
    fig.suptitle('Right flank')
    
    
plot_nn_right_flank()



In [40]:

    
def plot_nn_left_flank():

    pops_left = df_haplotypes.population[nn_idx_sorted_left]
    pop_colors_left = [phase1_ar3.pop_colors[p] for p in pops_left]

    fig = plt.figure(figsize=(20, 10))
    hapclust.fig_neighbour_haplotype_sharing(nspd=nn_spd_left, 
                                             muts=nn_muts_left, 
                                             nspd_accessible=nn_spd_left_accessible,
                                             haps_display=haps_vgsc_missense[:, nn_idx_sorted_left],
                                             haps_display_vlbl=lbl_vgsc_missense,
                                             pop_colors=pop_colors_left,
                                             nspd_cut=2e4,
                                             nspd_ylim=(1e3, 3e6),
                                             that_ylim=(2e1, 5e5),
                                             muts_ylim=(0, 25),
                                             mu=mu_left, rr=rr_left,
                                             fig=fig)
    fig.suptitle('Left flank')
    
    
plot_nn_left_flank()



In [41]:

    
def nn_t_hat_diagnostics():

    # get some diagnostics on estimates of t hat on left versus right flanks

    fig = plt.figure()
    ax = fig.add_subplot(111)
    x = nn_spd_left
    t_hat = (1 + nn_muts_left) / (2 * (nn_spd_left * rr_left + nn_spd_left_accessible * mu_left))
    ax.hist(t_hat[(nn_spd_left > 0)], bins=np.logspace(1, 6, 50), histtype='step', lw=2, label='Upstream', color='b')
    ax.axvline(np.median(t_hat), linestyle='--', color='b', lw=2)
    t_hat = (1 + nn_muts_right) / (2 * (nn_spd_right * rr_right + nn_spd_right_accessible * mu_right))
    ax.hist(t_hat[(nn_spd_right > 0)], bins=np.logspace(1, 6, 50), histtype='step', lw=2, label='Downstream', color='g')
    ax.axvline(np.median(t_hat), linestyle='--', color='g', lw=2)
    ax.set_xscale('log')
    ax.set_xlabel('$\hat{t}$')
    ax.set_ylabel('Frequency (no. pairs)')
    ax.legend()
    
    
nn_t_hat_diagnostics()

pw_spl = pairwise shared prefix length (number of SNPs)
pw_spd = pairwise shared physical distance (bp)

Setup, diagnostics and tuning



In [42]:

    
pw_spl_right, pw_spd_right, pw_muts_right = hapclust.pairwise_haplotype_sharing(haps_ehh_right, haps_mut_right, dist_ehh_right, dist_mut_right, jitter=False)



In [43]:

    
pw_spl_left, pw_spd_left, pw_muts_left = hapclust.pairwise_haplotype_sharing(haps_ehh_left, haps_mut_left, dist_ehh_left, dist_mut_left, jitter=False)



In [44]:

    
pw_spl_right.shape, pw_spl_left.shape









    Out[44]:





((1169685,), (1169685,))



In [45]:

    
pw_spd_right.min(), pw_spd_right.max(), pw_spd_left.min(), pw_spd_left.max()









    Out[45]:





(45, 1544571, 108, 2377961)



In [46]:

    
pw_spd_right_accessible = hapclust.haplotype_accessible_length(pw_spd_right, core_pos=core_pos, is_accessible=is_accessible, flank='right')
pw_spd_left_accessible = hapclust.haplotype_accessible_length(pw_spd_left, core_pos=core_pos, is_accessible=is_accessible, flank='left')



In [47]:

    
def pw_accessibility_diagnostics():


    # check again accessibility

    fig, ax = plt.subplots(figsize=(8, 8))
    x = pw_spd_right
    y = pw_spd_right_accessible
    ax.plot(x, y, linestyle=' ', marker='o', markersize=4, label='Right flank')
    x = pw_spd_left
    y = pw_spd_left_accessible
    ax.plot(x, y, linestyle=' ', marker='o', markersize=4, label='Left flank')
    ax.set_xlabel('Physical length')
    ax.set_ylabel('Accessible length')
    lim = 0, x.max()
    ax.set_xlim(*lim)
    ax.set_ylim(*lim)
    ax.plot(lim, lim, 'k--')
    ax.legend()
    ax.set_title('Pairwise shared haplotype length')
    fig.tight_layout()
    
    
pw_accessibility_diagnostics()

check recombination rate on left vs. right flank



In [48]:

    
palette = sns.color_palette()
sns.palplot(palette);



In [49]:

    
def pw_haplength_diagnostics():

    # compare shared haplotype length on left versus right flanks

    x = pw_spd_left
    y = pw_spd_right

    fig, ax = plt.subplots(figsize=(7, 7))
    ax.plot(x, y, marker='o', markersize=1, mfc='none', mec='k', linestyle=' ', alpha=.1)
    lim = 1e1, 1e7

    df = pandas.DataFrame({'x': x, 'y': y})
    # linear regression
    r = sfa.ols('y ~ x + 0', data=df).fit()
    # plot the regression line
    ax.plot(lim, [l * r.params['x'] for l in lim], linestyle='--', color=palette[2], lw=2)

    ax.set_xlim(*lim)
    ax.set_ylim(*lim)
    ax.plot(lim, lim, 'k--')
    ax.set_xlabel('Left flank')
    ax.set_ylabel('Right flank')
    ax.set_title('Shared haplotype length (bp)')
    ax.set_xscale('log')
    ax.set_yscale('log')
    
    return r.params


pw_haplength_diagnostics()









    Out[49]:





x    0.369112
dtype: float64



In [50]:

    
def pw_haplength_diagnostics_adjusted():


    # compare shared haplotype length after applying adjustment for different 
    # recombination rates on left and right flanks

    x = pw_spd_left * rr_left
    y = pw_spd_right * rr_right

    fig, ax = plt.subplots(figsize=(7, 7))
    ax.plot(x, y, marker='o', markersize=1, mfc='none', mec='k', linestyle=' ', alpha=.1)
    lim = 1e-7, 1e-1

    df = pandas.DataFrame({'x': x, 'y': y})
    r = sfa.ols('y ~ x + 0', data=df).fit()
    ax.plot(lim, [l * r.params['x'] for l in lim], linestyle='--', color='r', lw=2)

    ax.set_xlim(*lim)
    ax.set_ylim(*lim)
    ax.plot(lim, lim, 'k--')
    ax.set_xlabel('Left flank')
    ax.set_ylabel('Right flank')
    ax.set_title('Shared haplotype length (Morgan)')
    ax.set_xscale('log')
    ax.set_yscale('log')
    
    return r.params


pw_haplength_diagnostics_adjusted()









    Out[50]:





x    0.997601
dtype: float64



In [51]:

    
# check where the heterochromatin boundary is supposed to be
phase1_ar3.tbl_chromatin









    Out[51]:







0|name
1|chrom
2|start
3|stop




CHX
X
20009764
24393108


CH2R
2R
58984778
61545105


CH2L
2L
1
2431617


PEU2L
2L
2487770
5042389


IH2L
2L
5078962
5788875



...



In [52]:

    
def pw_mutation_diagnostics():

    # check mutation rate on left versus right flank.

    x = pw_muts_left / pw_spd_left_accessible
    y = pw_muts_right / pw_spd_right_accessible
    tst = (x > 0) & (y > 0)
    x = x[tst]
    y = y[tst]

    lim = 1e-6, 1e-1
    fig, ax = plt.subplots(figsize=(7, 7))
    sns.despine(ax=ax, offset=5)
    ax.plot(x, y, marker='o', markersize=2, mfc='none', mec='k', linestyle=' ')

    df = pandas.DataFrame({'x': x, 'y': y})
    r = sfa.ols('y ~ x + 0', data=df).fit()
    ax.plot(lim, [l * r.params['x'] for l in lim], linestyle='--', color=palette[2], lw=2)

    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlim(*lim)
    ax.set_ylim(*lim)
    ax.plot(lim, lim, linestyle='--', color='k')
    ax.set_title('Mutations ($bp^{-1}$)')
    ax.set_xlabel('Left flank')
    ax.set_ylabel('Reft flank')

    return r.params


pw_mutation_diagnostics()









    Out[52]:





x    1.630348
dtype: float64



In [53]:

    
def pw_mutation_diagnostics_adjusted():


    # check mutations on left versus right flank after correcting mutation rate

    x = pw_muts_left / (mu_left * pw_spd_left_accessible)
    y = pw_muts_right / (mu_right * pw_spd_right_accessible)
    tst = (x > 0) & (y > 0)
    x = x[tst]
    y = y[tst]

    lim = 1e2, 1e7
    fig, ax = plt.subplots(figsize=(7, 7))
    sns.despine(ax=ax, offset=5)
    ax.plot(x, y, marker='o', markersize=2, mfc='none', mec='k', linestyle=' ')

    df = pandas.DataFrame({'x': x, 'y': y})
    r = sfa.ols('y ~ x + 0', data=df).fit()
    ax.plot(lim, [l * r.params['x'] for l in lim], linestyle='--', color=palette[2], lw=2)

    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlim(*lim)
    ax.set_ylim(*lim)
    ax.plot(lim, lim, linestyle='--', color='k')
    ax.set_title('Mutations (TODO units?)')
    ax.set_xlabel('Left flank')
    ax.set_ylabel('Reft flank')
    return r.params


pw_mutation_diagnostics_adjusted()









    Out[53]:





x    0.994513
dtype: float64



In [54]:

    
def pw_t_hat_diagnostics():

    # now compare estimats of t_hat on left and right flanks, using all adjustments

    pw_t_hat_left = (1 + pw_muts_left) / (2 * (pw_spd_left * rr_left + pw_spd_left_accessible * mu_left))
    pw_t_hat_right = (1 + pw_muts_right) / (2 * (pw_spd_right * rr_right + pw_spd_right_accessible * mu_right))
    x = np.log10(pw_t_hat_left)
    y = np.log10(pw_t_hat_right)

    fig, ax = plt.subplots(figsize=(7, 7))
    ax.plot(x, y, marker='o', markersize=1, mfc='none', mec='k', linestyle=' ', alpha=.1)
    lim = 1, 6.5

    df = pandas.DataFrame({'x': x, 'y': y})
    r = sfa.ols('y ~ x + 0', data=df).fit()
    ax.plot(lim, [l * r.params['x'] for l in lim], linestyle='--', color='r', lw=2)

    ax.set_xlim(*lim)
    ax.set_ylim(*lim)
    ax.plot(lim, lim, 'k--')
    ax.set_xlabel('Left flank')
    ax.set_ylabel('Right flank')
    ax.set_title('$\hat{t}$')
    ticks = np.arange(lim[0], int(lim[1]) + 1)
    ticklabels = ['$10^{%s}$' % t for t in ticks]
    ax.set_xticks(ticks)
    ax.set_xticklabels(ticklabels)
    ax.set_yticks(ticks)
    ax.set_yticklabels(ticklabels)
    
    return r.params


pw_t_hat_diagnostics()









    Out[54]:





x    1.001761
dtype: float64



In [55]:

    
def compute_combined_t_hat():
    global pw_t_hat_both

    # combined both flanks
    pw_muts_both = pw_muts_left + pw_muts_right
    pw_spd_both = pw_spd_left + pw_spd_right
    pw_sgl_both = pw_spd_left * rr_left + pw_spd_right * rr_right
    pw_sml_both = pw_spd_left_accessible * mu_left + pw_spd_right_accessible * mu_right
    pw_t_hat_both = (1 + pw_muts_both) / (2 * (pw_sgl_both + pw_sml_both))
    
    
compute_combined_t_hat()



In [56]:

    
np.savez_compressed('../data/pairwise_haplotype_age.npz', 
                    t_hat=pw_t_hat_both)



In [57]:

    
!ls -lh ../data/pairwise_haplotype_age.npz









    



-rw-r--r-- 1 aliman aliman 972K Sep 11 12:52 ../data/pairwise_haplotype_age.npz

Dendrogram visualisation and clustering



In [58]:

    
def plot_dendrogram(dist, cut_height=1e3, yscale='log', ylim=(10, 1e6), linkage_method='average',
                    n_clusters=14):
    """This function plots a dendrogram using scipy and provides some utilities for annotating clusters."""
    
    z = scipy.cluster.hierarchy.linkage(dist, method=linkage_method)

    fig = plt.figure(figsize=(16, 8), )
    gs = mpl.gridspec.GridSpec(nrows=3, ncols=1, height_ratios=[6, .5, 4], hspace=0)

    ax = fig.add_subplot(gs[0])
    sns.despine(ax=ax, offset=3, bottom=True, top=False)
    r = scipy.cluster.hierarchy.dendrogram(
        z, no_labels=True, count_sort=True, 
        color_threshold=0, 
        above_threshold_color='k',
        ax=ax)
    ax.set_ylim(*ylim)
    ax.set_yscale(yscale)
    # ax.set_ylim(bottom=-1000)
    xmin, xmax = ax.xaxis.get_data_interval()
    xticklabels = np.array(list(range(0, len(df_haplotypes), 200)) + [len(df_haplotypes)])
    xticks = xticklabels / len(df_haplotypes)
    xticks = (xticks * (xmax - xmin)) + xmin
    ax.set_xticks(xticks)
    ax.set_xticklabels(xticklabels)
    ax.set_xlabel('Haplotypes')
    ax.xaxis.set_label_position('top')
    ax.set_ylabel('$\hat{t}$', rotation=0, ha='right')

    cluster_palette = sns.color_palette('Set3', n_colors=12)
    if cut_height:
        ax.axhline(cut_height, linestyle='--', color='k')
        # find clusters
        f = scipy.cluster.hierarchy.fcluster(z, cut_height, criterion='distance')
        # compute cluster sizes
        fsz = np.bincount(f)
        # sort largest first
        fsort = np.argsort(fsz)[::-1]
        # take largest n
        fsort = fsort[:n_clusters]
        # get haplotype indices for each cluster
        clusters = [set(np.nonzero(f == i)[0]) for i in fsort]
        clusters_leaves = [sorted([r['leaves'].index(i) for i in cluster])
                           for cluster in clusters]
        ixs = np.argsort([min(cl) for cl in clusters_leaves])
        clusters = [clusters[i] for i in ixs]
        clusters_leaves = [clusters_leaves[i] for i in ixs]
        for i, cluster_leaves in enumerate(clusters_leaves):
            color = cluster_palette[i % len(cluster_palette)]
            x1, x2 = min(cluster_leaves), max(cluster_leaves)
#             ax.axvline(x1*10, color='k', linestyle='--')
#             ax.axvline(x2*10, color='k', linestyle='--')
            ax.fill_between([x1*10, x2*10], 0, cut_height, color=color, alpha=.4, zorder=20)
#             ax.axvspan(x1*10, x2*10, color=color, zorder=-20, alpha=.5)
            ax.text((x1*10 + x2*10) / 2, ylim[0], str(i), ha='center', va='top')
        
    ax = fig.add_subplot(gs[1], )
    sns.despine(ax=ax, left=True, bottom=True)
    pops = df_haplotypes.population[r['leaves']]
    pop_colors = [phase1_ar3.pop_colors[p] for p in pops]
    ax.broken_barh([(i, 1) for i in range(len(df_haplotypes))], yrange=(0, 1), color=pop_colors)
    ax.set_xlim(0, len(df_haplotypes))
    ax.set_yticks([])
    ax.set_ylabel('Population', rotation=0, ha='right', va='center')
    ax.set_xticks([])
    if cut_height:
        for i, cluster_leaves in enumerate(clusters_leaves):
            color = cluster_palette[i % len(cluster_palette)]
            x1, x2 = min(cluster_leaves), max(cluster_leaves)
            ax.axvline(x1, color='k', linestyle='--', zorder=20)
            ax.axvline(x2, color='k', linestyle='--', zorder=20)

    ax = fig.add_subplot(gs[2])
    hapclust.plot_haplotypes(ax, haps_vgsc_missense[:, r['leaves']], lbl_vgsc_missense)
    ax.set_xlim(0, len(df_haplotypes))
    ax.set_xlabel('Haplotypes')
    if cut_height:
        for i, cluster_leaves in enumerate(clusters_leaves):
            color = cluster_palette[i % len(cluster_palette)]
            x1, x2 = min(cluster_leaves), max(cluster_leaves)
            ax.axvline(x1, color='k', linestyle='--', zorder=20)
            ax.axvline(x2, color='k', linestyle='--', zorder=20)
            ax.axvspan(x1, x2, color=color, zorder=20, alpha=.4)

    gs.tight_layout(fig, h_pad=0)
    return clusters



In [59]:

    
# canonical dendrogram we'll use for cluster analyses
clusters = plot_dendrogram(pw_t_hat_both, cut_height=2e3)



In [60]:

    
len(clusters)









    Out[60]:





14



In [61]:

    
cluster_names = {
    1 : 'F4',
    2 : 'F3',
    3 : 'F1',
    4 : 'L2',
    5 : 'L1',
    7 : 'S4/5',
    8 : 'F2',
    9 : 'F5',
    10 : 'S2',
    12 : 'S1',
    13 : 'S3'
}

Output the haplogroup indices for use elsewhere



In [62]:

    
clust_dict = {
    n: clusters[k] for k, n in cluster_names.items()
}
sorted(clust_dict)









    Out[62]:





['F1', 'F2', 'F3', 'F4', 'F5', 'L1', 'L2', 'S1', 'S2', 'S3', 'S4/5']



In [63]:

    
# save as pickle
import pickle
with open('../data/clust_dict.pickle', 'wb') as handle:
    pickle.dump(clust_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)



In [64]:

    
# check pickle
with open('../data/clust_dict.pickle', 'rb') as handle:
    b = pickle.load(handle)

print(clust_dict == b)









    



True



In [65]:

    
# make list of all indices (values) and their cluster names (keys) in clusters_good 
all_idx = []
all_clu = []
for n, s in clust_dict.items():
    for i in sorted(s):
        all_clu.append(n)
        all_idx.append(i)
        
len(all_idx), len(all_clu)









    Out[65]:





(1220, 1220)



In [66]:

    
# write out text file
clus_df = pandas.DataFrame()
clus_df['cluster'] = all_clu
clus_df['haplotype'] = all_idx
clus_df.to_csv('../data/clusters.csv', index=False)

0\|CHROM	1\|POS	2\|num_alleles	3\|REF	4\|ALT	5\|AC	6\|ALTIX	7\|FILTER_PASS	9\|LowCoverage	10\|HighCoverage	13\|RepeatDUST	14\|RepeatMasker	15\|RepeatTRF	16\|FS	17\|HRun	18\|QD	19\|ReadPosRankSum	20\|SNPEFF_Allele	21\|SNPEFF_Annotation	22\|SNPEFF_HGVS_c	23\|SNPEFF_HGVS_p	24\|SNPEFF_Feature_ID	25\|SNPEFF_CDS_pos	27\|AF_BFM	31\|AF_CMS	32\|AF_GAS	34\|AF_KES	35\|check_allele	36\|exon_start	37\|exon_end	38\|exon	39\|AGAP004707-RA	40\|AGAP004707-RB	41\|AGAP004707-RC	42\|Davies-C1N2	43\|Davies-C3N2	44\|Davies-C5N2	45\|Davies-C7N2	46\|Davies-C8N2	47\|Davies-C10N2	48\|Davies-C11N2	49\|Davies-C1N9	50\|Davies-C8N9	51\|Davies-C1N9ck
2L	2390177	2	G	A	198	0	True	3	8	False	False	False	0.12695	1	18.625	0.83496	A	missense_variant	n.761G>A	p.Arg254Lys	AGAP004707-RA	761	0.0	0.316363636364	0.214285714286	0.0	True	2390129	2390341	7	R254K	R254K	R254K	R260K	R217K	R260K	R260K	R271K	R260K	R260K	R260K	R271K	R260K
2L	2391228	3	G	C	10	0	True	0	12	False	False	False	2.0352	0	14.867	-1.1777	C	missense_variant	n.1204G>C	p.Val402Leu	AGAP004707-RA	1204	0.0724637681159	0.0	0.0	0.0	True	2391156	2391320	10	V402L	V402L	V402L	V408L	V365L		V408L	V419L	V408L	V408L	V408L	V419L	V408L
2L	2391228	3	G	T	9	1	True	0	12	False	False	False	2.0352	0	14.867	-1.1777	None	None	None	None	None	None	0.0652173913043	0.0	0.0	0.0	True	2391156	2391320	10	V402L	V402L	V402L	V408L	V365L		V408L	V419L	V408L	V408L	V408L	V419L	V408L
2L	2399997	2	G	C	38	0	True	1	7	False	False	False	13.359	0	15.688	0.11798	C	missense_variant	n.1396G>C	p.Asp466His	AGAP004707-RA	1396	0.0	0.0690909090909	0.0	0.0	True	2399898	2400173	11i+	D466H	D466H	D466H	D472H	D429H	D417H	D472H	D483H	D472H	D472H	D472H	D483H	D472H
2L	2400071	3	G	A	16	0	True	0	8	False	False	False	5.6875	0	16.969	1.3232	A	missense_variant	n.1470G>A	p.Met490Ile	AGAP004707-RA	1470	0.0	0.0	0.0	0.181818181818	True	2399898	2400173	11i+	M490I	M490I	M490I	M496I	M453I	M441I	M496I	M507I	M496I	M496I	M496I	M507I	M496I

0\|POS	1\|REF	2\|ALT	3\|AGAP004707-RA	4\|label
2390177	'G'	'A'	'R254K'	'2390177:G>A R254K'
2391228	'G'	'C,T'	'V402L'	'2391228:G>C,T V402L'
2399997	'G'	'C'	'D466H'	'2399997:G>C D466H'
2400071	'G'	'A,T'	'M490I'	'2400071:G>A,T M490I'
2416980	'C'	'T'	'T791M'	'2416980:C>T T791M'

	0	1	2	3	4	...	1525	1526	1527	1528	1529
0	0	0	0	0	0	...	0	0	0	0	0
1	0	0	2	0	0	...	0	0	0	0	0
2	0	0	0	0	0	...	0	0	0	0	0
...	...
18	0	0	0	0	0	...	0	0	0	0	0
19	0	0	0	1	0	...	0	0	0	0	0
20	0	0	0	0	0	...	0	0	0	0	0

	0	1	2	3	4	...	1525	1526	1527	1528	1529
0	0	0	0	0	0	...	0	0	0	0	0
1	0	0	0	0	0	...	0	0	0	0	0
2	0	0	0	0	0	...	0	0	0	0	0
...	...
115999	0	0	0	0	0	...	0	0	0	0	0
116000	0	0	0	0	0	...	0	0	0	0	0
116001	0	0	0	0	0	...	0	0	0	0	0

	label	ox_code	population	label_aug	country	region	sex	m_s	kt_2la	kt_2rb
index
0	AB0085-Ca	AB0085-C	BFS	AB0085-Ca [Burkina Faso, Pala, S, F]	Burkina Faso	Pala	F	S	2.0	2.0
1	AB0085-Cb	AB0085-C	BFS	AB0085-Cb [Burkina Faso, Pala, S, F]	Burkina Faso	Pala	F	S	2.0	2.0
2	AB0087-Ca	AB0087-C	BFM	AB0087-Ca [Burkina Faso, Bana, M, F]	Burkina Faso	Bana	F	M	2.0	1.0
3	AB0087-Cb	AB0087-C	BFM	AB0087-Cb [Burkina Faso, Bana, M, F]	Burkina Faso	Bana	F	M	2.0	1.0
4	AB0088-Ca	AB0088-C	BFM	AB0088-Ca [Burkina Faso, Bana, M, F]	Burkina Faso	Bana	F	M	2.0	0.0

0\|name	1\|chrom	2\|start	3\|stop
CHX	X	20009764	24393108
CH2R	2R	58984778	61545105
CH2L	2L	1	2431617
PEU2L	2L	2487770	5042389
IH2L	2L	5078962	5788875

	0	1	2	3	4	...	1525	1526	1527	1528	1529
0	0	0	0	0	0	...	0	0	0	0	0
1	0	0	2	0	0	...	0	0	0	0	0
2	0	0	0	0	0	...	0	0	0	0	0
...	...
18	0	0	0	0	0	...	0	0	0	0	0
19	0	0	0	1	0	...	0	0	0	0	0
20	0	0	0	0	0	...	0	0	0	0	0