Supplementary tables

Setup


In [1]:
%run setup.ipynb
%matplotlib inline

In [2]:
# load haplotypes
callset_haps = np.load('../data/haps_phase1.npz')
haps = allel.HaplotypeArray(callset_haps['haplotypes'])
pos = allel.SortedIndex(callset_haps['POS'])
n_variants = haps.shape[0]
n_haps = haps.shape[1]
n_variants, n_haps


Out[2]:
(341998, 1530)

In [3]:
list(callset_haps)


Out[3]:
['haplotypes', 'POS', 'ANN']

In [4]:
callset_haps['ANN']


Out[4]:
array([(b'intergenic_region', b'.', b'.'),
       (b'intergenic_region', b'.', b'.'),
       (b'intergenic_region', b'.', b'.'), ...,
       (b'upstream_gene_variant', b'.', b'n.-9G>A'),
       (b'upstream_gene_variant', b'.', b'n.-9T>G'),
       (b'upstream_gene_variant', b'.', b'n.-9G>T')],
      dtype=[('Annotation', 'S34'), ('HGVS_p', 'S14'), ('HGVS_c', 'S12')])

In [5]:
callset_phased = phase1_ar31.callset_phased
sorted(callset_phased['2L/variants'])


Out[5]:
['ALT', 'ID', 'POS', 'REF']

In [6]:
# load up haplotype group assignments from hierarchical clustering
hierarchical_group_membership = np.load('../data/hierarchical_cluster_membership.npy')
np.unique(hierarchical_group_membership)


Out[6]:
array([b'', b'F1', b'F2', b'F3', b'F4', b'F5', b'S1', b'S2', b'S3', b'S4',
       b'S5'],
      dtype='|S2')

In [7]:
# load up haplotype group assignments from network analysis
network_group_membership = np.load('../data/median_joining_network_membership.npy')
network_group_membership[network_group_membership == ''] = 'WT'
np.unique(network_group_membership)


Out[7]:
array(['F1', 'F2', 'F3', 'F4', 'F5', 'FX', 'L1', 'L2', 'S1', 'S2', 'S3',
       'S4', 'S5', 'SX', 'WT'],
      dtype='<U2')

In [8]:
# load up core haplotypes
with open('../data/core_haps.pkl', mode='rb') as f:
    core_haps = pickle.load(f)

Table of non-synonymous variants

One row per alternate allele.


In [9]:
tbl_variants = etl.frompickle('../data/tbl_variants_phase1.pkl')
tbl_variants.head()


Out[9]:
0|CHROM 1|POS 2|num_alleles 3|REF 4|ALT 5|AC 6|ALTIX 7|FILTER_PASS 8|NoCoverage 9|LowCoverage 10|HighCoverage 11|LowMQ 12|HighMQ0 13|RepeatDUST 14|RepeatMasker 15|RepeatTRF 16|FS 17|HRun 18|QD 19|ReadPosRankSum 20|SNPEFF_Allele 21|SNPEFF_Annotation 22|SNPEFF_HGVS_c 23|SNPEFF_HGVS_p 24|SNPEFF_Feature_ID 25|SNPEFF_CDS_pos 26|AF_AOM 27|AF_BFM 28|AF_GWA 29|AF_GNS 30|AF_BFS 31|AF_CMS 32|AF_GAS 33|AF_UGS 34|AF_KES 35|check_allele 36|exon_start 37|exon_end 38|exon 39|AGAP004707-RA 40|AGAP004707-RB 41|AGAP004707-RC 42|Davies-C1N2 43|Davies-C3N2 44|Davies-C5N2 45|Davies-C7N2 46|Davies-C8N2 47|Davies-C10N2 48|Davies-C11N2 49|Davies-C1N9 50|Davies-C8N9 51|Davies-C1N9ck
2L 2358254 2 G A 1 0 True 0 0 10 0 0 False False False 9.8672 1 17.547 -0.049988 A missense_variant n.97G>A p.Asp33Asn AGAP004707-RA 97 0.0 0.0 0.0 0.0 0.0 0.00181818181818 0.0 0.0 0.0 True 2358158 2358304 1 ('NON_SYNONYMOUS_CODING', 'D33N') ('NON_SYNONYMOUS_CODING', 'D33N') ('NON_SYNONYMOUS_CODING', 'D33N') ('NON_SYNONYMOUS_CODING', 'D33N') ('NON_SYNONYMOUS_CODING', 'D33N') ('NON_SYNONYMOUS_CODING', 'D33N') ('NON_SYNONYMOUS_CODING', 'D33N') ('NON_SYNONYMOUS_CODING', 'D33N') ('NON_SYNONYMOUS_CODING', 'D33N') ('NON_SYNONYMOUS_CODING', 'D33N') ('NON_SYNONYMOUS_CODING', 'D33N') ('NON_SYNONYMOUS_CODING', 'D33N') ('NON_SYNONYMOUS_CODING', 'D33N')
2L 2358316 2 T G 73 0 True 0 0 15 0 0 False False False 2.4844 0 16.438 1.4219 G intron_variant n.147+12T>G None AGAP004707-RA -1 0.0 0.0 0.0 0.0 0.0 0.132727272727 0.0 0.0 0.0 True None None None ('INTRONIC', 'AGAP004707-PA', 12, 'AGAP004707-PA', -3691) ('INTRONIC', 'AGAP004707-PB', 12, 'AGAP004707-PB', -3691) ('INTRONIC', 'AGAP004707-PC', 12, 'AGAP004707-PC', -3691) ('INTRONIC', '1', 12, '3', -3673) ('INTRONIC', '1', 12, '3', -3673) ('INTRONIC', '1', 12, '3', -3673) ('INTRONIC', '1', 12, '3', -3673) ('INTRONIC', '1', 12, '2j', -1324) ('INTRONIC', '1', 12, '3', -3673) ('INTRONIC', '1', 12, '3', -3673) ('INTRONIC', '1', 12, '3', -3673) ('INTRONIC', '1', 12, '2j', -1324) ('INTRONIC', '1', 12, '3', -3673)
2L 2358328 2 T C 2 0 True 0 0 14 0 0 False False False 2.7363 0 16.062 -0.646 C intron_variant n.147+24T>C None AGAP004707-RA -1 0.0 0.00724637681159 0.0108695652174 0.0 0.0 0.0 0.0 0.0 0.0 True None None None ('INTRONIC', 'AGAP004707-PA', 24, 'AGAP004707-PA', -3679) ('INTRONIC', 'AGAP004707-PB', 24, 'AGAP004707-PB', -3679) ('INTRONIC', 'AGAP004707-PC', 24, 'AGAP004707-PC', -3679) ('INTRONIC', '1', 24, '3', -3661) ('INTRONIC', '1', 24, '3', -3661) ('INTRONIC', '1', 24, '3', -3661) ('INTRONIC', '1', 24, '3', -3661) ('INTRONIC', '1', 24, '2j', -1312) ('INTRONIC', '1', 24, '3', -3661) ('INTRONIC', '1', 24, '3', -3661) ('INTRONIC', '1', 24, '3', -3661) ('INTRONIC', '1', 24, '2j', -1312) ('INTRONIC', '1', 24, '3', -3661)
2L 2358353 2 C T 1 0 True 0 1 15 0 0 False False False 1.9512 0 9.8594 1.1582 T intron_variant n.147+49C>T None AGAP004707-RA -1 0.0 0.0 0.0108695652174 0.0 0.0 0.0 0.0 0.0 0.0 True None None None ('INTRONIC', 'AGAP004707-PA', 49, 'AGAP004707-PA', -3654) ('INTRONIC', 'AGAP004707-PB', 49, 'AGAP004707-PB', -3654) ('INTRONIC', 'AGAP004707-PC', 49, 'AGAP004707-PC', -3654) ('INTRONIC', '1', 49, '3', -3636) ('INTRONIC', '1', 49, '3', -3636) ('INTRONIC', '1', 49, '3', -3636) ('INTRONIC', '1', 49, '3', -3636) ('INTRONIC', '1', 49, '2j', -1287) ('INTRONIC', '1', 49, '3', -3636) ('INTRONIC', '1', 49, '3', -3636) ('INTRONIC', '1', 49, '3', -3636) ('INTRONIC', '1', 49, '2j', -1287) ('INTRONIC', '1', 49, '3', -3636)
2L 2358405 2 T A 1 0 True 0 6 14 0 0 False False False 20.844 1 10.859 1.1562 A intron_variant n.147+101T>A None AGAP004707-RA -1 0.0 0.0 0.0108695652174 0.0 0.0 0.0 0.0 0.0 0.0 True None None None ('INTRONIC', 'AGAP004707-PA', 101, 'AGAP004707-PA', -3602) ('INTRONIC', 'AGAP004707-PB', 101, 'AGAP004707-PB', -3602) ('INTRONIC', 'AGAP004707-PC', 101, 'AGAP004707-PC', -3602) ('INTRONIC', '1', 101, '3', -3584) ('INTRONIC', '1', 101, '3', -3584) ('INTRONIC', '1', 101, '3', -3584) ('INTRONIC', '1', 101, '3', -3584) ('INTRONIC', '1', 101, '2j', -1235) ('INTRONIC', '1', 101, '3', -3584) ('INTRONIC', '1', 101, '3', -3584) ('INTRONIC', '1', 101, '3', -3584) ('INTRONIC', '1', 101, '2j', -1235) ('INTRONIC', '1', 101, '3', -3584)

In [10]:
transcript_ids = [
    'AGAP004707-RA',
    'AGAP004707-RB',
    'AGAP004707-RC',
    'Davies-C1N2',
    'Davies-C3N2',
    'Davies-C5N2',
    'Davies-C7N2',
    'Davies-C8N2',
    'Davies-C10N2',
    'Davies-C11N2',
    'Davies-C1N9',
    'Davies-C8N9',
    'Davies-C1N9ck'
]

In [11]:
#load the codon map from the blog post (with the header info removed)
md_tbl = etl.fromtsv('../data/domestica_gambiae_map.txt')
md_tbl


Out[11]:
0|domestica_codon 1|gambiae_codon
1 1
2 2
3 3
4 4
5 5

...


In [12]:
dom_fs = pyfasta.Fasta('../data/domestica_gambiae_PROT_MEGA.fas')

#grab the right sample
dom = dom_fs.get('domestica_vgsc')

#remove the '-' from the aligned fasta so the numbering makes sense
dom_fix = [p for p in dom if p != '-']
#check
dom_fix[261-1],dom_fix[1945-1]


Out[12]:
('R', 'I')

In [13]:
# keep only the missense variants


def simplify_missense_effect(v):
    if v and v[0] == 'NON_SYNONYMOUS_CODING':
        return v[1]
    else:
        return ''
    
    
tbl_variants_missense = (
    tbl_variants
    .select(lambda row: any(row[t] and row[t][0] == 'NON_SYNONYMOUS_CODING' for t in transcript_ids))
    .convert(transcript_ids, simplify_missense_effect)
    .capture('AGAP004707-RA', pattern="([0-9]+)", newfields=['gambiae_codon'], include_original=True, fill=[''])
    .hashleftjoin(md_tbl, key='gambiae_codon', missing='')
    .replace('domestica_codon', '.', '')
    .convert('domestica_codon', lambda v: dom_fix[int(v) - 1] + v if v else v)
    .cut('CHROM', 'POS', 'REF', 'ALT', 'AC', 'exon', 'domestica_codon',
         'AGAP004707-RA', 'AGAP004707-RB', 'AGAP004707-RC', 'Davies-C1N2', 'Davies-C3N2', 'Davies-C5N2', 
         'Davies-C7N2', 'Davies-C8N2', 'Davies-C10N2', 'Davies-C11N2', 'Davies-C1N9', 'Davies-C8N9', 'Davies-C1N9ck',
         'AF_AOM', 'AF_BFM', 'AF_GWA', 'AF_GNS', 'AF_BFS', 'AF_CMS', 'AF_GAS', 'AF_UGS', 'AF_KES',
         'FILTER_PASS', 'NoCoverage', 'LowCoverage', 'HighCoverage', 'LowMQ', 'HighMQ0', 'RepeatDUST', 'RepeatMasker', 'RepeatTRF', 'FS', 'HRun', 'QD', 'ReadPosRankSum',
         )
    

)
tbl_variants_missense.displayall()


0|CHROM 1|POS 2|REF 3|ALT 4|AC 5|exon 6|domestica_codon 7|AGAP004707-RA 8|AGAP004707-RB 9|AGAP004707-RC 10|Davies-C1N2 11|Davies-C3N2 12|Davies-C5N2 13|Davies-C7N2 14|Davies-C8N2 15|Davies-C10N2 16|Davies-C11N2 17|Davies-C1N9 18|Davies-C8N9 19|Davies-C1N9ck 20|AF_AOM 21|AF_BFM 22|AF_GWA 23|AF_GNS 24|AF_BFS 25|AF_CMS 26|AF_GAS 27|AF_UGS 28|AF_KES 29|FILTER_PASS 30|NoCoverage 31|LowCoverage 32|HighCoverage 33|LowMQ 34|HighMQ0 35|RepeatDUST 36|RepeatMasker 37|RepeatTRF 38|FS 39|HRun 40|QD 41|ReadPosRankSum
2L 2358254 G A 1 1 E33 D33N D33N D33N D33N D33N D33N D33N D33N D33N D33N D33N D33N D33N 0.0 0.0 0.0 0.0 0.0 0.00181818181818 0.0 0.0 0.0 True 0 0 10 0 0 False False False 9.8672 1 17.547 -0.049988
2L 2359670 G A 7 2j E60K E60K 0.0 0.0 0.0 0.0 0.0 0.0109090909091 0.0 0.0 0.0113636363636 False 1 171 1 1 0 False False False 8.6641 6 14.406 -0.029007
2L 2362002 A T 2 3 D54V D54V D54V D54V D65V D54V D54V D54V D65V D54V 0.0 0.0144927536232 0.0 0.0 0.0 0.0 0.0 0.0 0.0 True 0 1 3 0 0 False False False 0.5459 0 12.531 -0.55322
2L 2362019 G T 2 3 G61 G54C G54C G54C G60C G60C G60C G60C G71C G60C G60C G60C G71C G60C 0.0 0.0144927536232 0.0 0.0 0.0 0.0 0.0 0.0 0.0 True 0 0 6 0 0 False False False 3.9824 0 13.641 0.7749
2L 2362023 C T 1 3 P62 P55L P55L P55L P61L P61L P61L P61L P72L P61L P61L P61L P72L P61L 0.0 0.0 0.0 0.0 0.00617283950617 0.0 0.0 0.0 0.0 True 0 1 3 0 0 False False False 0.0 0 13.477 -1.1611
2L 2390168 A G 2 7 K258 K251R K251R K251R K257R K214R K257R K257R K268R K257R K257R K257R K268R K257R 0.0 0.0 0.0 0.0 0.0 0.0 0.0178571428571 0.0 0.0 True 0 2 10 0 0 False False False 0.56982 1 15.219 -0.026001
2L 2390177 G A 198 7 R261 R254K R254K R254K R260K R217K R260K R260K R271K R260K R260K R260K R271K R260K 0.0 0.0 0.0 0.0 0.0 0.316363636364 0.214285714286 0.0 0.0 True 0 3 8 0 0 False False False 0.12695 1 18.625 0.83496
2L 2390311 G A 1 7 E306 E299K E299K E299K E305K E262K E305K E305K E316K E305K E305K E305K E316K E305K 0.0 0.0 0.0 0.0 0.0 0.00181818181818 0.0 0.0 0.0 True 0 0 10 0 0 False False False 0.0 3 14.07 -0.70996
2L 2390448 G A 6 8 G324 G317S G317S G317S G323S G280S G323S G323S G334S G323S G323S G323S G334S G323S 0.0 0.0 0.0 0.0 0.0 0.0109090909091 0.0 0.0 0.0 True 0 0 15 0 0 False False False 0.71094 0 16.125 -0.65918
2L 2391228 G C 10 10 V410 V402L V402L V402L V408L V365L V408L V419L V408L V408L V408L V419L V408L 0.0 0.0724637681159 0.0 0.0 0.0 0.0 0.0 0.0 0.0 True 0 0 12 0 0 False False False 2.0352 0 14.867 -1.1777
2L 2391228 G T 9 10 V410 V402L V402L V402L V408L V365L V408L V419L V408L V408L V408L V419L V408L 0.0 0.0652173913043 0.0 0.0 0.0 0.0 0.0 0.0 0.0 True 0 0 12 0 0 False False False 2.0352 0 14.867 -1.1777
2L 2399997 G C 38 11i+ D466H D466H D466H D472H D429H D417H D472H D483H D472H D472H D472H D483H D472H 0.0 0.0 0.0 0.0 0.0 0.0690909090909 0.0 0.0 0.0 True 0 1 7 0 0 False False False 13.359 0 15.688 0.11798
2L 2400071 G A 16 11i+ M508 M490I M490I M490I M496I M453I M441I M496I M507I M496I M496I M496I M507I M496I 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.181818181818 True 0 0 8 0 0 False False False 5.6875 0 16.969 1.3232
2L 2400071 G T 2 11i+ M508 M490I M490I M490I M496I M453I M441I M496I M507I M496I M496I M496I M507I M496I 0.0 0.0 0.0 0.0 0.0 0.00363636363636 0.0 0.0 0.0 True 0 0 8 0 0 False False False 5.6875 0 16.969 1.3232
2L 2400076 T A 2 11i+ I510 I492N I492N I492N I498N I455N I443N I498N I509N I498N I498N I498N I509N I498N 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.00970873786408 0.0 True 0 0 9 0 0 False False False 1.1055 1 10.469 -0.54199
2L 2402484 G T 1 13a R555 R537L R537L R537L R550L R550L R550L 0.0 0.0 0.0 0.0 0.0 0.0 0.00892857142857 0.0 0.0 True 0 4 13 0 0 False False False 1.8857 1 14.719 0.075989
2L 2402508 A T 1 13a K563 Q545L Q545L Q545L Q558L Q558L Q558L 0.0 0.0 0.0 0.0 0.00617283950617 0.0 0.0 0.0 0.0 True 1 3 15 0 0 False False False 3.0098 0 11.156 -0.041992
2L 2403206 G A 1 14 A603 A586T A586T A586T A599T A528T A516T A571T A582T A571T A571T A599T A582T A599T 0.0 0.0 0.0 0.0 0.0 0.00181818181818 0.0 0.0 0.0 False 0 1 16 0 0 False False False 4.8047 0 15.477 -0.20398
2L 2407811 G A 1 15 R697 R670Q R670Q R670Q R683Q R612Q R600Q R655Q R666Q R655Q R655Q R683Q R666Q R683Q 0.0 0.0 0.0108695652174 0.0 0.0 0.0 0.0 0.0 0.0 True 0 2 14 0 0 False False False 0.0 1 10.789 -0.86914
2L 2407912 A T 1 16 T706 T679S T679S T679S T692S T621S T609S T664S T675S T664S T664S T692S T675S T692S 0.0 0.0 0.0 0.0 0.0 0.00181818181818 0.0 0.0 0.0 True 0 0 6 0 0 False False False 0.0 1 22.172 0.5332
2L 2407987 A G 1 16 M731 M704V M704V M704V M717V M646V M634V M689V M700V M689V M689V M717V M700V M717V 0.0 0.00724637681159 0.0 0.0 0.0 0.0 0.0 0.0 0.0 True 0 2 7 0 0 False False False 0.0 0 14.102 1.0859
2L 2408125 C A 1 17 A751 A724E A724E A724E A737E A666E A654E A709E A720E A709E A709E A737E A720E A737E 0.0 0.0 0.0 0.0 0.0 0.00181818181818 0.0 0.0 0.0 False 0 252 2 1 0 False False False 2.0879 2 7.0586 0.5918
2L 2408136 G A 1 17 G755 G728R G728R G728R G741R G670R G658R G713R G724R G713R G713R G741R G724R G741R 0.00833333333333 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 False 0 318 2 1 0 False False False 2.0879 0 1.8799 -2.0742
2L 2416980 C T 32 18b+ T810 T791M T791M T791M T804M T733M T721M T776M T787M T776M T776M T804M T787M T804M 0.0 0.0144927536232 0.0 0.129032258065 0.135802469136 0.0 0.0 0.0 0.0 True 0 2 11 0 0 False False False 2.0449 0 17.703 0.37305
2L 2417206 A C 1 19 I848 I829L I829L I829L I842L I771L I759L I814L I825L I814L I814L I842L I825L I842L 0.0 0.0 0.0 0.0 0.0 0.00181818181818 0.0 0.0 0.0 True 0 15 3 0 0 False False False 0.0 2 9.4297 0.31299
2L 2417231 C T 1 19 A856 A837V A837V A837V A850V A779V A767V A822V A833V A822V A822V A850V A833V A850V 0.0 0.0 0.0 0.0 0.0 0.00181818181818 0.0 0.0 0.0 True 0 38 1 0 0 False False False 3.6992 0 13.688 -0.46802
2L 2417772 A T 1 20c M944 M925L M925L M938L 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.00485436893204 0.0 True 0 2 10 0 0 False False False 7.707 1 16.156 -0.035004
2L 2421454 A G 1 20d M903V M916V M845V M833V M888V M899V M888V M888V M916V M899V 0.0 0.00724637681159 0.0 0.0 0.0 0.0 0.0 0.0 0.0 True 0 0 13 0 0 False False False 3.8008 1 14.094 0.88916
2L 2422486 C A 1 21 K959 P940H P940H P940H P953H P882H P870H P925H P936H P925H P925H P953H P936H P953H 0.0 0.0 0.0 0.0 0.00617283950617 0.0 0.0 0.0 0.0 True 0 1 6 0 0 False False False 4.168 0 12.102 -0.45605
2L 2422651 T C 430 21 L1014 L995S L995S L995S L1008S L937S L925S L980S L991S L980S L980S L1008S L991S L1008S 0.0 0.0 0.0 0.0 0.0 0.154545454545 0.642857142857 1.0 0.761363636364 True 0 2 10 0 0 False False False 0.95117 0 27.203 -0.92822
2L 2422652 A T 775 21 L1014 L995F L995F L995F L1008F L937F L925F L980F L991F L980F L980F L1008F L991F L1008F 0.858333333333 0.847826086957 0.0 1.0 1.0 0.529090909091 0.357142857143 0.0 0.0 True 0 2 9 0 0 False False False 0.73291 3 29.047 0.19104
2L 2422875 G A 1 22 I1070 V1051I V1051I V1051I V1064I V993I V981I V1036I V1047I V1036I V1036I V1064I V1047I V1064I 0.0 0.0 0.0 0.0 0.0 0.00181818181818 0.0 0.0 0.0 False 0 3 17 0 0 False False False 0.71289 0 14.867 -1.6621
2L 2424383 G T 1 23f+ K1133 A1125S A1125S A1125S A1138S A1067S A1045S A1100S A1121S A1110S A1110S A1138S A1121S A1138S 0.0 0.0 0.0108695652174 0.0 0.0 0.0 0.0 0.0 0.0 True 0 1 14 0 0 False False False 2.8906 0 18.156 -0.31494
2L 2424384 C T 11 23f+ K1133 A1125V A1125V A1125V A1138V A1067V A1045V A1100V A1121V A1110V A1110V A1138V A1121V A1138V 0.0916666666667 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 True 0 1 14 0 0 False False False 5.4141 0 14.172 -2.3242
2L 2424401 A G 2 23f+ F1139 I1131V I1131V I1131V I1144V I1073V I1051V I1106V I1127V I1116V I1116V I1144V I1127V I1144V 0.0 0.0 0.0 0.0 0.0 0.00181818181818 0.00892857142857 0.0 0.0 False 0 0 16 0 0 False False False 4.1133 0 14.742 -1.7324
2L 2424720 T G 11 24h+ S1167 S1160A S1173A S1156A S1173A 0.00833333333333 0.0144927536232 0.0108695652174 0.0 0.0 0.0109090909091 0.00892857142857 0.0 0.0 False 0 2 8 0 0 False False False 1953.0 1 0.090027 1.6211
2L 2425077 G A 5 25 I1262 V1254I V1228I V1228I V1241I V1170I V1148I V1203I V1224I V1213I V1213I V1267I V1250I V1267I 0.0 0.0 0.054347826087 0.0 0.0 0.0 0.0 0.0 0.0 True 0 0 14 0 0 False False False 0.66992 0 14.891 -0.61816
2L 2425291 T C 1 26 V1311 V1303A V1277A V1277A V1290A V1219A V1197A V1252A V1273A V1262A V1262A V1316A V1299A V1316A 0.0 0.0 0.0 0.0 0.00617283950617 0.0 0.0 0.0 0.0 True 0 0 10 0 0 False False False 4.4258 0 15.32 -0.38403
2L 2425417 A T 3 26 N1353 N1345I N1319I N1319I N1332I N1261I N1239I N1294I N1315I N1304I N1304I N1358I N1341I N1358I 0.0 0.0217391304348 0.0 0.0 0.0 0.0 0.0 0.0 0.0 True 0 0 13 0 0 False False False 0.39111 1 14.141 0.104
2L 2428015 C T 1 27l W1374 L1366F L1340F L1340F L1353F L1282F L1260F L1315F L1336F L1325F L1325F L1379F L1362F 0.0 0.0 0.0 0.0 0.00617283950617 0.0 0.0 0.0 0.0 True 0 3 12 0 0 False False False 0.66895 3 13.711 -0.55322
2L 2429617 T C 19 30 I1532 I1527T I1501T I1501T I1511T I1440T I1418T I1473T I1494T I1483T I1483T I1537T I1520T I1537T 0.0 0.13768115942 0.0 0.0 0.0 0.0 0.0 0.0 0.0 True 0 2 12 0 0 False False False 1.4502 0 15.531 -0.98291
2L 2429622 T C 1 30 F1534 F1529L F1503L F1503L F1513L F1442L F1420L F1475L F1496L F1485L F1485L F1539L F1522L F1539L 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.00485436893204 0.0 True 0 1 15 0 0 False False False 10.922 1 14.977 -0.098022
2L 2429745 A T 110 30 N1575 N1570Y N1544Y N1544Y N1554Y N1483Y N1461Y N1516Y N1537Y N1526Y N1526Y N1580Y N1563Y N1580Y 0.0 0.260869565217 0.0 0.0967741935484 0.216049382716 0.06 0.0 0.0 0.0 False 0 0 17 0 0 False False False 0.4541 1 16.484 1.9131
2L 2429788 T C 2 30 I1589 I1584T I1558T I1558T I1568T I1497T I1475T I1530T I1551T I1540T I1540T I1594T I1577T I1594T 0.0 0.0 0.0 0.0 0.0123456790123 0.0 0.0 0.0 0.0 False 0 0 24 0 0 False False False 6.2383 0 19.406 -0.46509
2L 2429897 A G 11 31 E1602 E1597G E1571G E1571G E1581G E1510G E1488G E1543G E1564G E1553G E1553G E1607G E1590G E1607G 0.0 0.0 0.0 0.0645161290323 0.0432098765432 0.0 0.0 0.0 0.0 True 0 1 13 0 0 False False False 3.5234 1 16.906 -0.56006
2L 2429915 A C 7 31 K1608 K1603T K1577T K1577T K1587T K1516T K1494T K1549T K1570T K1559T K1559T K1613T K1596T K1613T 0.0 0.0507246376812 0.0 0.0 0.0 0.0 0.0 0.0 0.0 True 0 1 13 0 0 False False False 5.2461 0 16.406 -1.6729
2L 2429958 A C 1 31 F1622 L1617F L1591F L1591F L1601F L1530F L1508F L1563F L1584F L1573F L1573F L1627F L1610F L1627F 0.0 0.0 0.0 0.0 0.0 0.00181818181818 0.0 0.0 0.0 True 0 1 14 0 0 False False False 1.0488 0 12.539 -1.8672
2L 2430106 T A 3 31 L1672 L1667M L1641M L1641M L1651M L1580M L1558M L1613M L1634M L1623M L1623M L1677M L1660M L1677M 0.0 0.0217391304348 0.0 0.0 0.0 0.0 0.0 0.0 0.0 False 0 0 18 0 0 False False False 1.6719 0 15.719 -1.3877
2L 2430229 G A 4 32 V1686 V1681I V1655I V1655I V1665I V1594I V1572I V1627I V1648I V1637I V1637I V1691I V1674I V1691I 0.0 0.0 0.0 0.0 0.0 0.00727272727273 0.0 0.0 0.0 False 0 1 18 0 0 False False False 20.531 0 14.383 0.22498
2L 2430236 G T 7 32 S1688 S1683I S1657I S1657I S1667I S1596I S1574I S1629I S1650I S1639I S1639I S1693I S1676I S1693I 0.0 0.0 0.0 0.0 0.0 0.0127272727273 0.0 0.0 0.0 False 0 1 17 0 0 False False False 1.0479 1 14.352 -0.62988
2L 2430424 G T 28 32 A1751 A1746S A1720S A1720S A1730S A1659S A1637S A1692S A1713S A1702S A1702S A1756S A1739S A1756S 0.0 0.0 0.0 0.112903225806 0.12962962963 0.0 0.0 0.0 0.0 True 0 1 10 0 0 False False False 4.4297 3 17.094 -0.71924
2L 2430451 C T 1 32 H1760 H1755Y H1729Y H1729Y H1739Y H1668Y H1646Y H1701Y H1722Y H1711Y H1711Y H1765Y H1748Y H1765Y 0.0 0.0 0.0 0.0 0.0 0.0 0.00892857142857 0.0 0.0 True 0 2 10 0 0 False False False 0.56689 0 16.047 0.098022
2L 2430817 G A 13 33 V1858 V1853I V1827I V1827I V1837I V1766I V1744I V1799I V1820I V1809I V1809I V1863I V1846I V1863I 0.0 0.0 0.0 0.0806451612903 0.0493827160494 0.0 0.0 0.0 0.0 True 0 0 9 0 0 False False False 1.3701 0 14.648 -0.66504
2L 2430863 T C 52 33 I1873 I1868T I1842T I1842T I1852T I1781T I1759T I1814T I1835T I1824T I1824T I1878T I1861T I1878T 0.0 0.0 0.0 0.177419354839 0.253086419753 0.0 0.0 0.0 0.0 True 0 1 8 0 0 False False False 1.4785 0 15.812 0.78809
2L 2430880 C T 29 33 P1879 P1874S P1848S P1848S P1858S P1787S P1765S P1820S P1841S P1830S P1830S P1884S P1867S P1884S 0.0 0.210144927536 0.0 0.0 0.0 0.0 0.0 0.0 0.0 True 0 1 10 0 0 False False False 19.312 1 16.547 -1.8408
2L 2430881 C T 80 33 P1879 P1874L P1848L P1848L P1858L P1787L P1765L P1820L P1841L P1830L P1830L P1884L P1867L P1884L 0.0 0.0724637681159 0.0 0.451612903226 0.259259259259 0.0 0.0 0.0 0.0 True 0 1 9 0 0 False False False 2.4453 1 18.562 -1.4189
2L 2431019 T C 12 33 Y1925 F1920S F1894S F1894S F1904S F1833S F1811S F1866S F1887S F1876S F1876S F1930S F1913S F1930S 0.0 0.0 0.0 0.0 0.0 0.0145454545455 0.0357142857143 0.0 0.0 True 0 0 5 0 0 False False False 0.85107 1 13.523 -1.6211
2L 2431061 C T 16 33 A1939 A1934V A1908V A1908V A1918V A1847V A1825V A1880V A1901V A1890V A1890V A1944V A1927V A1944V 0.0 0.115942028986 0.0 0.0 0.0 0.0 0.0 0.0 0.0 True 0 0 8 0 0 False False False 1.5 1 16.688 -0.46802
2L 2431079 T C 44 33 I1945 I1940T I1914T I1914T I1924T I1853T I1831T I1886T I1907T I1896T I1896T I1950T I1933T I1950T 0.0 0.0434782608696 0.0 0.0 0.0 0.0690909090909 0.0 0.0 0.0 True 0 0 6 0 0 False False False 3.4004 0 15.953 2.1738
2L 2431232 G A 1 33 G1992 G1991E G1965E G1965E G1975E G1904E G1882E G1937E G1958E G1947E G1947E G2001E G1984E G2001E 0.0 0.0 0.0108695652174 0.0 0.0 0.0 0.0 0.0 0.0 True 0 0 7 0 0 False False False 3.1719 2 12.328 -0.80713
2L 2431331 C A 1 33 S2018 T2024K T1998K T1998K T2008K T1937K T1915K T1970K T1991K T1980K T1980K T2034K T2017K T2034K 0.0 0.0 0.0 0.0 0.0 0.00181818181818 0.0 0.0 0.0 True 0 1 5 0 0 False False False 4.1406 2 12.797 -0.031006
2L 2431417 A G 2 33 D2046 I2053V I2027V I2027V I2037V I1966V I1944V I1999V I2020V I2009V I2009V I2063V I2046V I2063V 0.0 0.0 0.0 0.0 0.0 0.00363636363636 0.0 0.0 0.0 True 0 0 5 0 0 False False False 10.93 0 13.789 -1.9297
2L 2431487 G C 5 33 S2076T S2050T S2050T S2060T S1989T S1967T S2022T S2043T S2032T S2032T S2086T S2069T S2086T 0.0 0.0 0.0 0.0 0.0 0.00909090909091 0.0 0.0 0.0 True 0 1 6 0 0 False False False 0.69678 1 14.32 -0.78418

In [14]:
tbl_variants_missense.nrows()


Out[14]:
63

In [15]:
tbl_variants_missense.tocsv('../data/supp_table_variants_missense.csv')

Table of haplotype tracking variants

One row per variant. Only biallelic obviously because haplotype tagging.

Columns:

  • CHROM
  • POS
  • REF
  • ALT
  • AC
  • AF
  • AF_F1
  • AF_F2
  • AF_F3
  • AF_F4
  • AF_F5
  • AF_S1
  • AF_S2
  • AF_S3
  • AF_S4
  • AF_S5
  • AF_L1
  • AF_L2
  • AF_WT

In [16]:
list(callset_phased['2L/variants'])


Out[16]:
['ALT', 'ID', 'POS', 'REF']

In [17]:
region_vgsc


Out[17]:
<SeqFeature 'Vgsc' 2L:2358158-2431617>

In [18]:
pos = allel.SortedIndex(callset_phased['2L/variants/POS'])
pos


Out[18]:
<SortedIndex shape=(8296600,) dtype=int32>
01234...82965958296596829659782965988296599
4468844691447324473644756...4935642449356425493564264935642949356435

In [19]:
# "_tr" means tracking region, i.e., genome region we'll report data for tracking SNPs
loc_tr = pos.locate_range(region_vgsc.start - 10000, region_vgsc.end + 10000)
loc_tr


Out[19]:
slice(24261, 26528, None)

In [20]:
pos_tr = pos[loc_tr]
pos_tr


Out[20]:
<SortedIndex shape=(2267,) dtype=int32>
01234...22622263226422652266
23487652348777234881523488162348824...24415872441589244159324416162441617

In [21]:
haps_tr = allel.GenotypeArray(callset_phased['2L/calldata/genotype'][loc_tr, :-8]).to_haplotypes()
haps_tr


Out[21]:
<HaplotypeArray shape=(2267, 1530) dtype=int8>
01234...15251526152715281529
000000...00000
100000...00000
211111...11111
......
226400000...00000
226500000...00000
226600000...00000

In [22]:
ac_tr = haps_tr.count_alleles(max_allele=1)
ac_tr


Out[22]:
<AlleleCountsArray shape=(2267, 2) dtype=int32>
01
01529 1
11529 1
2 441486
......
22641529 1
22651528 2
22661529 1

In [23]:
af_tr = ac_tr.to_frequencies()
af_tr


Out[23]:
array([[  9.99346405e-01,   6.53594771e-04],
       [  9.99346405e-01,   6.53594771e-04],
       [  2.87581699e-02,   9.71241830e-01],
       ..., 
       [  9.99346405e-01,   6.53594771e-04],
       [  9.98692810e-01,   1.30718954e-03],
       [  9.99346405e-01,   6.53594771e-04]])

In [24]:
subpops = dict()
for p in 'F1 F2 F3 F4 F5 S1 S2 S3 S4 S5 L1 L2 WT'.split():
    subpops[p] = np.nonzero(network_group_membership == p)[0]
sorted((k, len(v)) for k, v in subpops.items())


Out[24]:
[('F1', 456),
 ('F2', 13),
 ('F3', 50),
 ('F4', 35),
 ('F5', 187),
 ('L1', 19),
 ('L2', 16),
 ('S1', 107),
 ('S2', 78),
 ('S3', 165),
 ('S4', 36),
 ('S5', 34),
 ('WT', 291)]

In [25]:
ac_subpops_tr = haps_tr.count_alleles_subpops(subpops, max_allele=1)

In [26]:
af_subpops_tr = {k: ac.to_frequencies()[:, 1] for k, ac in ac_subpops_tr.items()}

In [27]:
columns = [
    ('CHROM', np.asarray(['2L'] * haps_tr.shape[0], dtype=object)),
    ('POS', callset_phased['2L/variants/POS'][loc_tr]),
    ('REF', callset_phased['2L/variants/REF'][loc_tr].astype('U')),
    ('ALT', callset_phased['2L/variants/ALT'][loc_tr].astype('U')),
    ('AC', ac_tr[:, 1]),
    ('AF', af_tr[:, 1]),
    ('MAC', ac_tr.min(axis=1)),
    ('MAF', af_tr.min(axis=1)),
] + sorted(('AF_' + k, v) for k, v in af_subpops_tr.items())
df_tr = pandas.DataFrame.from_items(columns)
df_tr.head()


Out[27]:
CHROM POS REF ALT AC AF MAC MAF AF_F1 AF_F2 ... AF_F4 AF_F5 AF_L1 AF_L2 AF_S1 AF_S2 AF_S3 AF_S4 AF_S5 AF_WT
0 2L 2348765 C A 1 0.000654 1 0.000654 0.000000 0.0 ... 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.003436
1 2L 2348777 G T 1 0.000654 1 0.000654 0.000000 0.0 ... 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.003436
2 2L 2348815 T C 1486 0.971242 44 0.028758 0.995614 1.0 ... 1.0 1.0 0.684211 1.0 1.0 1.0 1.0 1.0 1.0 0.914089
3 2L 2348816 C G 1 0.000654 1 0.000654 0.000000 0.0 ... 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.003436
4 2L 2348824 T A 39 0.025490 39 0.025490 0.000000 0.0 ... 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.134021

5 rows × 21 columns


In [28]:
df_tr_mac = df_tr[df_tr.MAC > 14].reset_index(drop=True)
df_tr_mac.head()


Out[28]:
CHROM POS REF ALT AC AF MAC MAF AF_F1 AF_F2 ... AF_F4 AF_F5 AF_L1 AF_L2 AF_S1 AF_S2 AF_S3 AF_S4 AF_S5 AF_WT
0 2L 2348815 T C 1486 0.971242 44 0.028758 0.995614 1.0 ... 1.0 1.000000 0.684211 1.0 1.0 1.0 1.0 1.0 1.0 0.914089
1 2L 2348824 T A 39 0.025490 39 0.025490 0.000000 0.0 ... 0.0 0.000000 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.134021
2 2L 2348869 T A 329 0.215033 329 0.215033 0.000000 0.0 ... 1.0 0.994652 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.168385
3 2L 2348877 G T 23 0.015033 23 0.015033 0.000000 0.0 ... 0.0 0.000000 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.079038
4 2L 2348929 A G 100 0.065359 100 0.065359 0.000000 0.0 ... 1.0 0.000000 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.041237

5 rows × 21 columns


In [29]:
# check how many potentially diagnostics SNPs separating each pair of F haplotype groups
for x, y in itertools.combinations('F1 F2 F3 F4 F5'.split(), 2):
    print(x, y, np.count_nonzero(np.abs(df_tr_mac['AF_' + x] - df_tr_mac['AF_' + y]) > 0.98))


F1 F2 68
F1 F3 61
F1 F4 62
F1 F5 62
F2 F3 49
F2 F4 50
F2 F5 48
F3 F4 7
F3 F5 19
F4 F5 20

In [30]:
# check how many potentially diagnostics SNPs separating each pair of S haplotype groups
for x, y in itertools.combinations('S1 S2 S3 S4 S5'.split(), 2):
    print(x, y, np.count_nonzero(np.abs(df_tr_mac['AF_' + x] - df_tr_mac['AF_' + y]) > 0.98))


S1 S2 91
S1 S3 99
S1 S4 94
S1 S5 92
S2 S3 47
S2 S4 39
S2 S5 47
S3 S4 55
S3 S5 55
S4 S5 8

In [31]:
df_tr_mac.to_csv('../data/supp_table_variants_tracking.csv', index=False, float_format='%.3f')

Table of haplotypes panel


In [32]:
df_haps = pandas.read_csv('../data/ag1000g.phase1.AR3.1.haplotypes.meta.txt', sep='\t', index_col='index')[:-16]
df_haps.head()


Out[32]:
label ox_code population label_aug country region sex m_s kt_2la kt_2rb
index
0 AB0085-Ca AB0085-C BFS AB0085-Ca [Burkina Faso, Pala, S, F] Burkina Faso Pala F S 2.0 2.0
1 AB0085-Cb AB0085-C BFS AB0085-Cb [Burkina Faso, Pala, S, F] Burkina Faso Pala F S 2.0 2.0
2 AB0087-Ca AB0087-C BFM AB0087-Ca [Burkina Faso, Bana, M, F] Burkina Faso Bana F M 2.0 1.0
3 AB0087-Cb AB0087-C BFM AB0087-Cb [Burkina Faso, Bana, M, F] Burkina Faso Bana F M 2.0 1.0
4 AB0088-Ca AB0088-C BFM AB0088-Ca [Burkina Faso, Bana, M, F] Burkina Faso Bana F M 2.0 0.0

In [33]:
n_haps


Out[33]:
1530

In [34]:
core_hap_col = np.empty(n_haps, dtype=object)
for k, s in core_haps.items():
    core_hap_col[sorted(s)] = k
core_hap_col


Out[34]:
array(['F1', 'F1', 'L1', ..., 'F1', 'F1', 'F1'], dtype=object)

In [35]:
df_haps_out = (
    df_haps[['label', 'ox_code', 'population', 'country', 'region', 'sex']]
    .rename(columns={'label': 'haplotype_id', 'ox_code': 'sample_id'})
    .copy()
)
df_haps_out['core_haplotype'] = core_hap_col
df_haps_out['network_haplotype_group'] = network_group_membership
df_haps_out['hierarchy_haplotype_group'] = hierarchical_group_membership.astype('U')
df_haps_out


Out[35]:
haplotype_id sample_id population country region sex core_haplotype network_haplotype_group hierarchy_haplotype_group
index
0 AB0085-Ca AB0085-C BFS Burkina Faso Pala F F1 F1 F1
1 AB0085-Cb AB0085-C BFS Burkina Faso Pala F F1 F1 F1
2 AB0087-Ca AB0087-C BFM Burkina Faso Bana F L1 L1
3 AB0087-Cb AB0087-C BFM Burkina Faso Bana F F1 F1 F1
4 AB0088-Ca AB0088-C BFM Burkina Faso Bana F F1 F1 F1
5 AB0088-Cb AB0088-C BFM Burkina Faso Bana F F1 F1 F1
6 AB0089-Ca AB0089-C BFM Burkina Faso Bana F F1 F1 F1
7 AB0089-Cb AB0089-C BFM Burkina Faso Bana F F1 F1 F1
8 AB0090-Ca AB0090-C BFM Burkina Faso Bana F F1 F1 F1
9 AB0090-Cb AB0090-C BFM Burkina Faso Bana F L1 L1
10 AB0091-Ca AB0091-C BFM Burkina Faso Bana F F1 F1 F1
11 AB0091-Cb AB0091-C BFM Burkina Faso Bana F L1 L1
12 AB0092-Ca AB0092-C BFM Burkina Faso Bana F F1 F1 F1
13 AB0092-Cb AB0092-C BFM Burkina Faso Bana F F1 F1 F1
14 AB0094-Ca AB0094-C BFM Burkina Faso Bana F F1 F1 F1
15 AB0094-Cb AB0094-C BFM Burkina Faso Bana F F1 F1 F1
16 AB0095-Ca AB0095-C BFM Burkina Faso Bana F F1 F1 F1
17 AB0095-Cb AB0095-C BFM Burkina Faso Bana F F1 F1 F1
18 AB0097-Ca AB0097-C BFM Burkina Faso Bana F F1 F1 F1
19 AB0097-Cb AB0097-C BFM Burkina Faso Bana F F1 F1 F1
20 AB0098-Ca AB0098-C BFM Burkina Faso Bana F F1 F1 F1
21 AB0098-Cb AB0098-C BFM Burkina Faso Bana F F1 F1 F1
22 AB0099-Ca AB0099-C BFM Burkina Faso Bana F F1 F1 F1
23 AB0099-Cb AB0099-C BFM Burkina Faso Bana F F1 F1 F1
24 AB0100-Ca AB0100-C BFM Burkina Faso Bana F F1 F1 F1
25 AB0100-Cb AB0100-C BFM Burkina Faso Bana F F1 F1 F1
26 AB0101-Ca AB0101-C BFM Burkina Faso Bana F F1 F1 F1
27 AB0101-Cb AB0101-C BFM Burkina Faso Bana F F1 F1 F1
28 AB0103-Ca AB0103-C BFS Burkina Faso Bana F F1 F1 F1
29 AB0103-Cb AB0103-C BFS Burkina Faso Bana F F1 F1 F1
... ... ... ... ... ... ... ... ... ...
1500 AV0026-Ca AV0026-C GNS Guinea Koundara F F1 F1 F1
1501 AV0026-Cb AV0026-C GNS Guinea Koundara F F1 F1 F1
1502 AV0027-Ca AV0027-C GNS Guinea Koundara F F1 F1 F1
1503 AV0027-Cb AV0027-C GNS Guinea Koundara F F1 F1 F1
1504 AV0029-Ca AV0029-C GNS Guinea Koundara F F1 F1 F1
1505 AV0029-Cb AV0029-C GNS Guinea Koundara F F1 F1 F1
1506 AV0030-Ca AV0030-C GNS Guinea Koundara F F1 F1 F1
1507 AV0030-Cb AV0030-C GNS Guinea Koundara F F1 F1 F1
1508 AV0031-Ca AV0031-C GNS Guinea Koundara F F1 F1 F1
1509 AV0031-Cb AV0031-C GNS Guinea Koundara F F1 F1 F1
1510 AV0032-Ca AV0032-C GNS Guinea Koundara F F1 F1 F1
1511 AV0032-Cb AV0032-C GNS Guinea Koundara F F1 F1 F1
1512 AV0033-Ca AV0033-C GNS Guinea Koundara F F1 F1 F1
1513 AV0033-Cb AV0033-C GNS Guinea Koundara F F1 F1 F1
1514 AV0034-Ca AV0034-C GNS Guinea Koundara F F1 F1 F1
1515 AV0034-Cb AV0034-C GNS Guinea Koundara F F1 F1 F1
1516 AV0035-Ca AV0035-C GNS Guinea Koundara F F1 F1 F1
1517 AV0035-Cb AV0035-C GNS Guinea Koundara F F1 F1 F1
1518 AV0036-Ca AV0036-C GNS Guinea Koundara F F1 F1 F1
1519 AV0036-Cb AV0036-C GNS Guinea Koundara F F1 F1 F1
1520 AV0039-Ca AV0039-C GNS Guinea Koundara F F1 F1 F1
1521 AV0039-Cb AV0039-C GNS Guinea Koundara F F1 F1 F1
1522 AV0041-Ca AV0041-C GNS Guinea Koundara F F1 F1 F1
1523 AV0041-Cb AV0041-C GNS Guinea Koundara F F1 F1 F1
1524 AV0044-Ca AV0044-C GNS Guinea Koundara F F1 F1 F1
1525 AV0044-Cb AV0044-C GNS Guinea Koundara F F1 F1 F1
1526 AV0045-Ca AV0045-C GNS Guinea Koundara F F1 F1 F1
1527 AV0045-Cb AV0045-C GNS Guinea Koundara F F1 F1 F1
1528 AV0047-Ca AV0047-C GNS Guinea Koundara F F1 F1 F1
1529 AV0047-Cb AV0047-C GNS Guinea Koundara F F1 F1 F1

1530 rows × 9 columns


In [36]:
df_haps_out[df_haps_out.core_haplotype == 'L1']


Out[36]:
haplotype_id sample_id population country region sex core_haplotype network_haplotype_group hierarchy_haplotype_group
index
2 AB0087-Ca AB0087-C BFM Burkina Faso Bana F L1 L1
9 AB0090-Cb AB0090-C BFM Burkina Faso Bana F L1 L1
11 AB0091-Cb AB0091-C BFM Burkina Faso Bana F L1 L1
34 AB0110-Ca AB0110-C BFM Burkina Faso Bana F L1 L1
38 AB0112-Ca AB0112-C BFM Burkina Faso Bana F L1 L1
75 AB0138-Cb AB0138-C BFM Burkina Faso Sourukoudinga F L1 L1
135 AB0181-Cb AB0181-C BFM Burkina Faso Bana F L1 L1
149 AB0188-Cb AB0188-C BFM Burkina Faso Bana F L1 L1
157 AB0192-Cb AB0192-C BFM Burkina Faso Bana F L1 L1
170 AB0204-Ca AB0204-C BFM Burkina Faso Pala F L1 L1
186 AB0212-Ca AB0212-C BFM Burkina Faso Bana F L1 L1
208 AB0229-Ca AB0229-C BFM Burkina Faso Sourukoudinga F L1 L1
226 AB0240-Ca AB0240-C BFM Burkina Faso Sourukoudinga F L1 L1
231 AB0242-Cb AB0242-C BFM Burkina Faso Sourukoudinga F L1 WT
237 AB0246-Cb AB0246-C BFM Burkina Faso Sourukoudinga F L1 L1
251 AB0257-Cb AB0257-C BFM Burkina Faso Pala F L1 L1
261 AB0263-Cb AB0263-C BFM Burkina Faso Pala F L1 L1
266 AB0266-Ca AB0266-C BFM Burkina Faso Pala F L1 L1
269 AB0267-Cb AB0267-C BFM Burkina Faso Pala F L1 L1
527 AJ0051-Cb AJ0051-C GWA Guinea-Bissau Antula F L1 WT

In [37]:
df_haps_out[df_haps_out.core_haplotype == 'L2']


Out[37]:
haplotype_id sample_id population country region sex core_haplotype network_haplotype_group hierarchy_haplotype_group
index
599 AK0065-Cb AK0065-C KES Kenya Kilifi-Mbogolo F L2 L2
607 AK0069-Cb AK0069-C KES Kenya Kilifi-Mbogolo F L2 L2
611 AK0072-Cb AK0072-C KES Kenya Kilifi-Mbogolo F L2 L2
613 AK0073-Cb AK0073-C KES Kenya Kilifi-Mbogolo F L2 L2
615 AK0074-Cb AK0074-C KES Kenya Kilifi-Mbogolo F L2 L2
617 AK0075-Cb AK0075-C KES Kenya Kilifi-Mbogolo F L2 L2
618 AK0076-Ca AK0076-C KES Kenya Kilifi-Mbogolo F L2 L2
619 AK0076-Cb AK0076-C KES Kenya Kilifi-Mbogolo F L2 L2
621 AK0077-Cb AK0077-C KES Kenya Kilifi-Mbogolo F L2 L2
624 AK0079-Ca AK0079-C KES Kenya Kilifi-Mbogolo F L2 L2
625 AK0079-Cb AK0079-C KES Kenya Kilifi-Mbogolo F L2 L2
627 AK0080-Cb AK0080-C KES Kenya Kilifi-Mbogolo F L2 L2
659 AK0099-Cb AK0099-C KES Kenya Kilifi-Junju F L2 L2
667 AK0103-Cb AK0103-C KES Kenya Kilifi-Junju F L2 L2
677 AK0109-Cb AK0109-C KES Kenya Kilifi-Junju F L2 L2
683 AK0119-Cb AK0119-C KES Kenya Kilifi-Mbogolo F L2 L2

In [38]:
pandas.set_option('display.max_rows', 9999)
df_haps_out.groupby(by=('population', 'network_haplotype_group')).count()[['haplotype_id']]


Out[38]:
haplotype_id
population network_haplotype_group
AOM F1 89
FX 14
WT 17
BFM F1 110
FX 6
L1 19
WT 3
BFS F1 161
FX 1
CMS F1 34
F2 13
F3 50
F4 19
F5 163
FX 12
S2 8
S4 36
S5 34
SX 7
WT 174
GAS F4 16
F5 24
S2 70
SX 2
GNS F1 62
GWA WT 92
KES L2 16
S3 67
WT 5
UGS S1 107
S3 98
SX 1

In [39]:
df_haps_out.to_csv('../data/supp_table_haplotype_panel.csv', index=False)

Table of haplotype data


In [40]:
haps_tr_mac = haps_tr[df_tr.MAC > 14]
haps_tr_mac


Out[40]:
<HaplotypeArray shape=(771, 1530) dtype=int8>
01234...15251526152715281529
011111...11111
100000...00000
200000...00000
......
76800000...00000
76911011...11111
77000000...00000

In [41]:
df_hap_data = df_tr_mac[['CHROM', 'POS', 'REF', 'ALT']].merge(
    pandas.DataFrame(np.asarray(haps_tr_mac), columns=df_haps.label), 
    left_index=True, right_index=True)
df_hap_data.head()


Out[41]:
CHROM POS REF ALT AB0085-Ca AB0085-Cb AB0087-Ca AB0087-Cb AB0088-Ca AB0088-Cb ... AV0039-Ca AV0039-Cb AV0041-Ca AV0041-Cb AV0044-Ca AV0044-Cb AV0045-Ca AV0045-Cb AV0047-Ca AV0047-Cb
0 2L 2348815 T C 1 1 1 1 1 1 ... 1 1 1 1 1 1 1 1 1 1
1 2L 2348824 T A 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 2L 2348869 T A 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 2L 2348877 G T 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 2L 2348929 A G 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 1534 columns


In [42]:
df_hap_data.to_csv('../data/supp_table_haplotypes_tracking.csv', index=False)

All variation, for PCR flanks


In [43]:
callset = phase1_ar31.callset
callset


Out[43]:
<zarr.hierarchy.Group '/' read-only>

In [44]:
pos = allel.SortedIndex(callset['2L/variants/POS'])
pos


Out[44]:
<SortedIndex shape=(19453287,) dtype=int32>
01234...1945328219453283194532841945328519453286
103163192317343...4936171149361766493617914936181549362329

In [45]:
loc_cs_tr = pos.locate_range(region_vgsc.start - 10000, region_vgsc.end + 10000)
pos_cs_tr = pos[loc_cs_tr]
pos_cs_tr


Out[45]:
<SortedIndex shape=(8297,) dtype=int32>
01234...82928293829482958296
23481642348173234818923482192348253...24415872441589244159324416162441617

In [46]:
gt = allel.GenotypeArray(callset['2L/calldata/genotype'][loc_cs_tr])
gt


Out[46]:
<GenotypeArray shape=(8297, 765, 2) dtype=int8>
01234...760761762763764
00/0./../../../...../../../../../.
11/1./../../../...../../../../../.
21/1./../../../...../../../../../.
......
82940/00/00/00/00/0...0/00/00/00/00/0
82950/00/00/00/00/0...0/00/00/00/00/0
82960/00/00/00/00/0...0/00/00/00/00/0

In [47]:
df_samples = phase1_ar3.df_samples
df_samples.head()


Out[47]:
ox_code src_code sra_sample_accession population country region contributor contact year m_s ... pca_3L_free_pc3 pca_3L_free_pc4 pca_2La_pc1 pca_2La_pc2 pca_2La_pc3 pca_2La_pc4 pca_2Rb_pc1 pca_2Rb_pc2 pca_2Rb_pc3 pca_2Rb_pc4
index
0 AB0085-C BF2-4 ERS223996 BFS Burkina Faso Pala Austin Burt Sam O'Loughlin 2012 S ... -8.290940 -18.542768 -55.511389 32.682143 -1.833739 -0.381984 -38.934301 31.939383 19.345606 7.362004
1 AB0087-C BF3-3 ERS224013 BFM Burkina Faso Bana Austin Burt Sam O'Loughlin 2012 M ... -38.603076 76.915617 -51.856633 27.401249 2.586488 0.643100 -10.072879 29.007266 -21.736087 -30.309562
2 AB0088-C BF3-5 ERS223991 BFM Burkina Faso Bana Austin Burt Sam O'Loughlin 2012 M ... -35.553340 73.985488 -50.942456 28.572207 3.072583 -0.643137 12.281744 22.288417 -43.661301 -51.557140
3 AB0089-C BF3-8 ERS224031 BFM Burkina Faso Bana Austin Burt Sam O'Loughlin 2012 M ... -36.621568 76.453993 -51.169247 29.414975 2.198434 0.013562 -10.664072 29.601333 -16.831559 -25.452854
4 AB0090-C BF3-10 ERS223936 BFM Burkina Faso Bana Austin Burt Sam O'Loughlin 2012 M ... -36.097756 72.036309 -48.607416 26.407532 1.643226 0.102582 12.897113 22.194444 -48.882378 -52.420487

5 rows × 38 columns


In [48]:
subpops = {p: sorted(df_samples[df_samples.population == p].index.values)
           for p in df_samples.population.unique()}
acs = gt.count_alleles_subpops(subpops, max_allele=3)
acs['BFS']


Out[48]:
<AlleleCountsArray shape=(8297, 4) dtype=int32>
0123
022 0 0 0
1 022 0 0
2 032 0 0
......
8294162 0 0 0
8295162 0 0 0
8296162 0 0 0

In [49]:
afs = {p: acs[p].to_frequencies()[:, 1:].sum(axis=1) for p in df_samples.population.unique()}
afs


Out[49]:
{'BFS': array([ 0.,  1.,  1., ...,  0.,  0.,  0.]),
 'BFM': array([ 0.,  1.,  1., ...,  0.,  0.,  0.]),
 'UGS': array([ 0.,  1.,  0., ...,  0.,  0.,  0.]),
 'GWA': array([ 0.3       ,  0.9       ,  0.0625    , ...,  0.        ,
         0.02173913,  0.        ]),
 'KES': array([ 0.,  1.,  0., ...,  0.,  0.,  0.]),
 'CMS': array([ 0.        ,  1.        ,  0.06451613, ...,  0.        ,
         0.        ,  0.        ]),
 'AOM': array([ 0.        ,  1.        ,  0.8       , ...,  0.00833333,
         0.        ,  0.        ]),
 'GAS': array([ 0.        ,  1.        ,  0.        , ...,  0.        ,
         0.        ,  0.00892857]),
 'GNS': array([ 0.,  1.,  1., ...,  0.,  0.,  0.])}

In [50]:
columns = ([
    ('CHROM', callset['2L/variants/CHROM'][loc_cs_tr].astype('U')),
    ('POS', callset['2L/variants/POS'][loc_cs_tr]),
    ('REF', callset['2L/variants/REF'][loc_cs_tr].astype('U')),
    ('ALT_1', callset['2L/variants/ALT'][loc_cs_tr, 0].astype('U')),
    ('ALT_2', callset['2L/variants/ALT'][loc_cs_tr, 1].astype('U')),
    ('ALT_3', callset['2L/variants/ALT'][loc_cs_tr, 2].astype('U')),
    ('AC_1', callset['2L/variants/AC'][loc_cs_tr, 0]),
    ('AC_2', callset['2L/variants/AC'][loc_cs_tr, 1]),
    ('AC_3', callset['2L/variants/AC'][loc_cs_tr, 2]),
    ('AF_1', callset['2L/variants/AF'][loc_cs_tr, 0]),
    ('AF_2', callset['2L/variants/AF'][loc_cs_tr, 1]),
    ('AF_3', callset['2L/variants/AF'][loc_cs_tr, 2]),
    ('AF_total', callset['2L/variants/AF'][loc_cs_tr, :].sum(axis=1)),
    ('AN', callset['2L/variants/AN'][loc_cs_tr]),
    ('FILTER_PASS', callset['2L/variants/FILTER_PASS'][loc_cs_tr]),
] + [('AF_' + p, afs[p]) for p in df_samples.population.unique()] +
    [('MAX_POP_AF', np.column_stack([afs[p] for p in df_samples.population.unique()]).max(axis=1))]
)
df_cs_tr = pandas.DataFrame.from_items(columns)
df_cs_tr.head()


Out[50]:
CHROM POS REF ALT_1 ALT_2 ALT_3 AC_1 AC_2 AC_3 AF_1 ... AF_BFS AF_BFM AF_UGS AF_GWA AF_KES AF_CMS AF_AOM AF_GAS AF_GNS MAX_POP_AF
0 2L 2348164 G A 6 0 0 0.033997 ... 0.0 0.0 0.000000 0.300000 0.000000 0.000000 0.0 0.0 0.0 0.300000
1 2L 2348173 A T 188 0 0 0.988770 ... 1.0 1.0 1.000000 0.900000 1.000000 1.000000 1.0 1.0 1.0 1.000000
2 2L 2348189 T C 80 0 0 0.335938 ... 1.0 1.0 0.000000 0.062500 0.000000 0.064516 0.8 0.0 1.0 1.000000
3 2L 2348219 T C 36 0 0 0.098022 ... 0.0 0.0 0.522727 0.108696 0.090909 0.061224 0.0 0.0 0.0 0.522727
4 2L 2348253 C T 8 0 0 0.014000 ... 0.0 0.0 0.075000 0.035714 0.000000 0.000000 0.0 0.0 0.0 0.075000

5 rows × 25 columns


In [51]:
df_cs_tr.FILTER_PASS.describe()


Out[51]:
count      8297
unique        2
top       False
freq       5961
Name: FILTER_PASS, dtype: object

In [52]:
df_cs_tr.to_csv('../data/supp_table_variants_all.csv')

In [ ]: