This notebook weaves together the haplotype scaffold at biallelic sites phased via shapeit2 with the multiallelic and other extra (e.g., non-PASS) variants phased via mvncall.
In [1]:
%run setup.ipynb
In [2]:
# haplotype scaffold
callset_phased = phase1_ar31.callset_phased
gt_phased = allel.GenotypeDaskArray(callset_phased['2L/calldata/genotype'])
pos_phased = allel.SortedIndex(callset_phased['2L/variants/POS'])
pos_phased.shape, gt_phased.shape
Out[2]:
In [3]:
# define region we're going to analyse
loc_region = pos_phased.locate_range(0, 6000000)
loc_region
Out[3]:
In [4]:
# extract data for region, remove colony parents (8 samples)
pos_phased_region = pos_phased[loc_region]
gt_phased_region = gt_phased[loc_region][:, :-8].compute()
pos_phased_region.shape, gt_phased_region.shape
Out[4]:
In [5]:
# load mvn haplotypes
callset_extras = np.load('../data/phasing_extra_phase1.mvncall.200.npz')
pos_extras = callset_extras['variants']['POS']
gt_extras = callset_extras['calldata']['genotype']
pos_extras.shape, gt_extras.shape
Out[5]:
In [6]:
# concatenate
gt_combined = np.concatenate([gt_phased_region, gt_extras], axis=0)
pos_combined = np.concatenate([pos_phased_region, pos_extras], axis=0)
# sort by position
idx_sorted = np.argsort(pos_combined)
gt_combined = gt_combined[idx_sorted]
pos_combined = pos_combined[idx_sorted]
In [7]:
# obtain data from unphased callset - only needed for variant annotations
callset = phase1_ar31.callset
pos_all = allel.SortedIndex(callset['2L/variants/POS'])
ann_all = callset['2L/variants/ANN'][:][['Annotation', 'HGVS_p', 'HGVS_c']]
ann_all
Out[7]:
In [8]:
# locate the intersection with unphased callset - needed to tie in annotations
loc1, _ = pos_all.locate_intersection(pos_combined)
np.count_nonzero(loc1)
Out[8]:
In [9]:
# extract annotations for the phased variants
ann_combined = ann_all[loc1]
ann_combined
Out[9]:
In [10]:
# save
haps_combined = allel.GenotypeArray(gt_combined).to_haplotypes()
np.savez_compressed('../data/haps_phase1.npz', haplotypes=haps_combined, POS=pos_combined, ANN=ann_combined)
In [11]:
haps_combined.nbytes
Out[11]:
In [12]:
!ls -lh ../data/haps_phase1.npz
In [ ]: