In [1]:
%run setup.ipynb
%matplotlib inline
In [2]:
# load haplotypes
callset_haps = np.load('../data/haps_phase1.npz')
haps = allel.HaplotypeArray(callset_haps['haplotypes'])
pos = allel.SortedIndex(callset_haps['POS'])
n_variants = haps.shape[0]
n_haps = haps.shape[1]
n_variants, n_haps
Out[2]:
In [3]:
list(callset_haps)
Out[3]:
In [4]:
callset_haps['ANN']
Out[4]:
In [5]:
callset_phased = phase1_ar31.callset_phased
sorted(callset_phased['2L/variants'])
Out[5]:
In [6]:
# load up haplotype group assignments from hierarchical clustering
hierarchical_group_membership = np.load('../data/hierarchical_cluster_membership.npy')
np.unique(hierarchical_group_membership)
Out[6]:
In [7]:
# load up haplotype group assignments from network analysis
network_group_membership = np.load('../data/median_joining_network_membership.npy')
network_group_membership[network_group_membership == ''] = 'WT'
np.unique(network_group_membership)
Out[7]:
In [8]:
# load up core haplotypes
with open('../data/core_haps.pkl', mode='rb') as f:
core_haps = pickle.load(f)
In [9]:
tbl_variants = etl.frompickle('../data/tbl_variants_phase1.pkl')
tbl_variants.head()
Out[9]:
In [10]:
transcript_ids = [
'AGAP004707-RA',
'AGAP004707-RB',
'AGAP004707-RC',
'Davies-C1N2',
'Davies-C3N2',
'Davies-C5N2',
'Davies-C7N2',
'Davies-C8N2',
'Davies-C10N2',
'Davies-C11N2',
'Davies-C1N9',
'Davies-C8N9',
'Davies-C1N9ck'
]
In [11]:
#load the codon map from the blog post (with the header info removed)
md_tbl = etl.fromtsv('../data/domestica_gambiae_map.txt')
md_tbl
Out[11]:
In [12]:
dom_fs = pyfasta.Fasta('../data/domestica_gambiae_PROT_MEGA.fas')
#grab the right sample
dom = dom_fs.get('domestica_vgsc')
#remove the '-' from the aligned fasta so the numbering makes sense
dom_fix = [p for p in dom if p != '-']
#check
dom_fix[261-1],dom_fix[1945-1]
Out[12]:
In [13]:
# keep only the missense variants
def simplify_missense_effect(v):
if v and v[0] == 'NON_SYNONYMOUS_CODING':
return v[1]
else:
return ''
tbl_variants_missense = (
tbl_variants
.select(lambda row: any(row[t] and row[t][0] == 'NON_SYNONYMOUS_CODING' for t in transcript_ids))
.convert(transcript_ids, simplify_missense_effect)
.capture('AGAP004707-RA', pattern="([0-9]+)", newfields=['gambiae_codon'], include_original=True, fill=[''])
.hashleftjoin(md_tbl, key='gambiae_codon', missing='')
.replace('domestica_codon', '.', '')
.convert('domestica_codon', lambda v: dom_fix[int(v) - 1] + v if v else v)
.cut('CHROM', 'POS', 'REF', 'ALT', 'AC', 'exon', 'domestica_codon',
'AGAP004707-RA', 'AGAP004707-RB', 'AGAP004707-RC', 'Davies-C1N2', 'Davies-C3N2', 'Davies-C5N2',
'Davies-C7N2', 'Davies-C8N2', 'Davies-C10N2', 'Davies-C11N2', 'Davies-C1N9', 'Davies-C8N9', 'Davies-C1N9ck',
'AF_AOM', 'AF_BFM', 'AF_GWA', 'AF_GNS', 'AF_BFS', 'AF_CMS', 'AF_GAS', 'AF_UGS', 'AF_KES',
'FILTER_PASS', 'NoCoverage', 'LowCoverage', 'HighCoverage', 'LowMQ', 'HighMQ0', 'RepeatDUST', 'RepeatMasker', 'RepeatTRF', 'FS', 'HRun', 'QD', 'ReadPosRankSum',
)
)
tbl_variants_missense.displayall()
In [14]:
tbl_variants_missense.nrows()
Out[14]:
In [15]:
tbl_variants_missense.tocsv('../data/supp_table_variants_missense.csv')
One row per variant. Only biallelic obviously because haplotype tagging.
Columns:
In [16]:
list(callset_phased['2L/variants'])
Out[16]:
In [17]:
region_vgsc
Out[17]:
In [18]:
pos = allel.SortedIndex(callset_phased['2L/variants/POS'])
pos
Out[18]:
In [19]:
# "_tr" means tracking region, i.e., genome region we'll report data for tracking SNPs
loc_tr = pos.locate_range(region_vgsc.start - 10000, region_vgsc.end + 10000)
loc_tr
Out[19]:
In [20]:
pos_tr = pos[loc_tr]
pos_tr
Out[20]:
In [21]:
haps_tr = allel.GenotypeArray(callset_phased['2L/calldata/genotype'][loc_tr, :-8]).to_haplotypes()
haps_tr
Out[21]:
In [22]:
ac_tr = haps_tr.count_alleles(max_allele=1)
ac_tr
Out[22]:
In [23]:
af_tr = ac_tr.to_frequencies()
af_tr
Out[23]:
In [24]:
subpops = dict()
for p in 'F1 F2 F3 F4 F5 S1 S2 S3 S4 S5 L1 L2 WT'.split():
subpops[p] = np.nonzero(network_group_membership == p)[0]
sorted((k, len(v)) for k, v in subpops.items())
Out[24]:
In [25]:
ac_subpops_tr = haps_tr.count_alleles_subpops(subpops, max_allele=1)
In [26]:
af_subpops_tr = {k: ac.to_frequencies()[:, 1] for k, ac in ac_subpops_tr.items()}
In [27]:
columns = [
('CHROM', np.asarray(['2L'] * haps_tr.shape[0], dtype=object)),
('POS', callset_phased['2L/variants/POS'][loc_tr]),
('REF', callset_phased['2L/variants/REF'][loc_tr].astype('U')),
('ALT', callset_phased['2L/variants/ALT'][loc_tr].astype('U')),
('AC', ac_tr[:, 1]),
('AF', af_tr[:, 1]),
('MAC', ac_tr.min(axis=1)),
('MAF', af_tr.min(axis=1)),
] + sorted(('AF_' + k, v) for k, v in af_subpops_tr.items())
df_tr = pandas.DataFrame.from_items(columns)
df_tr.head()
Out[27]:
In [28]:
df_tr_mac = df_tr[df_tr.MAC > 14].reset_index(drop=True)
df_tr_mac.head()
Out[28]:
In [29]:
# check how many potentially diagnostics SNPs separating each pair of F haplotype groups
for x, y in itertools.combinations('F1 F2 F3 F4 F5'.split(), 2):
print(x, y, np.count_nonzero(np.abs(df_tr_mac['AF_' + x] - df_tr_mac['AF_' + y]) > 0.98))
In [30]:
# check how many potentially diagnostics SNPs separating each pair of S haplotype groups
for x, y in itertools.combinations('S1 S2 S3 S4 S5'.split(), 2):
print(x, y, np.count_nonzero(np.abs(df_tr_mac['AF_' + x] - df_tr_mac['AF_' + y]) > 0.98))
In [31]:
df_tr_mac.to_csv('../data/supp_table_variants_tracking.csv', index=False, float_format='%.3f')
In [32]:
df_haps = pandas.read_csv('../data/ag1000g.phase1.AR3.1.haplotypes.meta.txt', sep='\t', index_col='index')[:-16]
df_haps.head()
Out[32]:
In [33]:
n_haps
Out[33]:
In [34]:
core_hap_col = np.empty(n_haps, dtype=object)
for k, s in core_haps.items():
core_hap_col[sorted(s)] = k
core_hap_col
Out[34]:
In [35]:
df_haps_out = (
df_haps[['label', 'ox_code', 'population', 'country', 'region', 'sex']]
.rename(columns={'label': 'haplotype_id', 'ox_code': 'sample_id'})
.copy()
)
df_haps_out['core_haplotype'] = core_hap_col
df_haps_out['network_haplotype_group'] = network_group_membership
df_haps_out['hierarchy_haplotype_group'] = hierarchical_group_membership.astype('U')
df_haps_out
Out[35]:
In [36]:
df_haps_out[df_haps_out.core_haplotype == 'L1']
Out[36]:
In [37]:
df_haps_out[df_haps_out.core_haplotype == 'L2']
Out[37]:
In [38]:
pandas.set_option('display.max_rows', 9999)
df_haps_out.groupby(by=('population', 'network_haplotype_group')).count()[['haplotype_id']]
Out[38]:
In [39]:
df_haps_out.to_csv('../data/supp_table_haplotype_panel.csv', index=False)
In [40]:
haps_tr_mac = haps_tr[df_tr.MAC > 14]
haps_tr_mac
Out[40]:
In [41]:
df_hap_data = df_tr_mac[['CHROM', 'POS', 'REF', 'ALT']].merge(
pandas.DataFrame(np.asarray(haps_tr_mac), columns=df_haps.label),
left_index=True, right_index=True)
df_hap_data.head()
Out[41]:
In [42]:
df_hap_data.to_csv('../data/supp_table_haplotypes_tracking.csv', index=False)
In [43]:
callset = phase1_ar31.callset
callset
Out[43]:
In [44]:
pos = allel.SortedIndex(callset['2L/variants/POS'])
pos
Out[44]:
In [45]:
loc_cs_tr = pos.locate_range(region_vgsc.start - 10000, region_vgsc.end + 10000)
pos_cs_tr = pos[loc_cs_tr]
pos_cs_tr
Out[45]:
In [46]:
gt = allel.GenotypeArray(callset['2L/calldata/genotype'][loc_cs_tr])
gt
Out[46]:
In [47]:
df_samples = phase1_ar3.df_samples
df_samples.head()
Out[47]:
In [48]:
subpops = {p: sorted(df_samples[df_samples.population == p].index.values)
for p in df_samples.population.unique()}
acs = gt.count_alleles_subpops(subpops, max_allele=3)
acs['BFS']
Out[48]:
In [49]:
afs = {p: acs[p].to_frequencies()[:, 1:].sum(axis=1) for p in df_samples.population.unique()}
afs
Out[49]:
In [50]:
columns = ([
('CHROM', callset['2L/variants/CHROM'][loc_cs_tr].astype('U')),
('POS', callset['2L/variants/POS'][loc_cs_tr]),
('REF', callset['2L/variants/REF'][loc_cs_tr].astype('U')),
('ALT_1', callset['2L/variants/ALT'][loc_cs_tr, 0].astype('U')),
('ALT_2', callset['2L/variants/ALT'][loc_cs_tr, 1].astype('U')),
('ALT_3', callset['2L/variants/ALT'][loc_cs_tr, 2].astype('U')),
('AC_1', callset['2L/variants/AC'][loc_cs_tr, 0]),
('AC_2', callset['2L/variants/AC'][loc_cs_tr, 1]),
('AC_3', callset['2L/variants/AC'][loc_cs_tr, 2]),
('AF_1', callset['2L/variants/AF'][loc_cs_tr, 0]),
('AF_2', callset['2L/variants/AF'][loc_cs_tr, 1]),
('AF_3', callset['2L/variants/AF'][loc_cs_tr, 2]),
('AF_total', callset['2L/variants/AF'][loc_cs_tr, :].sum(axis=1)),
('AN', callset['2L/variants/AN'][loc_cs_tr]),
('FILTER_PASS', callset['2L/variants/FILTER_PASS'][loc_cs_tr]),
] + [('AF_' + p, afs[p]) for p in df_samples.population.unique()] +
[('MAX_POP_AF', np.column_stack([afs[p] for p in df_samples.population.unique()]).max(axis=1))]
)
df_cs_tr = pandas.DataFrame.from_items(columns)
df_cs_tr.head()
Out[50]:
In [51]:
df_cs_tr.FILTER_PASS.describe()
Out[51]:
In [52]:
df_cs_tr.to_csv('../data/supp_table_variants_all.csv')
In [ ]: