This notebook weaves together the haplotype scaffold at biallelic sites phased via shapeit2 with the multiallelic and other extra (e.g., non-PASS) variants phased via mvncall.


In [1]:
%run setup.ipynb



In [2]:
# haplotype scaffold
callset_phased = phase1_ar31.callset_phased
gt_phased = allel.GenotypeDaskArray(callset_phased['2L/calldata/genotype'])
pos_phased = allel.SortedIndex(callset_phased['2L/variants/POS'])
pos_phased.shape, gt_phased.shape


Out[2]:
((8296600,), (8296600, 773, 2))

In [3]:
# define region we're going to analyse
loc_region = pos_phased.locate_range(0, 6000000)
loc_region


Out[3]:
slice(0, 341995, None)

In [4]:
# extract data for region, remove colony parents (8 samples) 
pos_phased_region = pos_phased[loc_region]
gt_phased_region = gt_phased[loc_region][:, :-8].compute()
pos_phased_region.shape, gt_phased_region.shape


Out[4]:
((341995,), (341995, 765, 2))

In [5]:
# load mvn haplotypes
callset_extras = np.load('../data/phasing_extra_phase1.mvncall.200.npz')
pos_extras = callset_extras['variants']['POS']
gt_extras = callset_extras['calldata']['genotype']
pos_extras.shape, gt_extras.shape


Out[5]:
((3,), (3, 765, 2))

In [6]:
# concatenate
gt_combined = np.concatenate([gt_phased_region, gt_extras], axis=0)
pos_combined = np.concatenate([pos_phased_region, pos_extras], axis=0)

# sort by position
idx_sorted = np.argsort(pos_combined)
gt_combined = gt_combined[idx_sorted]
pos_combined = pos_combined[idx_sorted]

In [7]:
# obtain data from unphased callset - only needed for variant annotations
callset = phase1_ar31.callset
pos_all = allel.SortedIndex(callset['2L/variants/POS'])
ann_all = callset['2L/variants/ANN'][:][['Annotation', 'HGVS_p', 'HGVS_c']]
ann_all


Out[7]:
array([(b'intergenic_region', b'.', b'.'),
       (b'intergenic_region', b'.', b'.'),
       (b'intergenic_region', b'.', b'.'), ...,
       (b'intergenic_region', b'.', b'.'),
       (b'intergenic_region', b'.', b'.'),
       (b'intergenic_region', b'.', b'.')], 
      dtype=[('Annotation', 'S34'), ('HGVS_p', 'S14'), ('HGVS_c', 'S12')])

In [8]:
# locate the intersection with unphased callset - needed to tie in annotations
loc1, _ = pos_all.locate_intersection(pos_combined)
np.count_nonzero(loc1)


Out[8]:
341998

In [9]:
# extract annotations for the phased variants
ann_combined = ann_all[loc1]
ann_combined


Out[9]:
array([(b'intergenic_region', b'.', b'.'),
       (b'intergenic_region', b'.', b'.'),
       (b'intergenic_region', b'.', b'.'), ...,
       (b'upstream_gene_variant', b'.', b'n.-9G>A'),
       (b'upstream_gene_variant', b'.', b'n.-9T>G'),
       (b'upstream_gene_variant', b'.', b'n.-9G>T')], 
      dtype=[('Annotation', 'S34'), ('HGVS_p', 'S14'), ('HGVS_c', 'S12')])

In [10]:
# save
haps_combined = allel.GenotypeArray(gt_combined).to_haplotypes()
np.savez_compressed('../data/haps_phase1.npz', haplotypes=haps_combined, POS=pos_combined, ANN=ann_combined)

In [11]:
haps_combined.nbytes


Out[11]:
523256940

In [12]:
!ls -lh ../data/haps_phase1.npz


-rw-rw-r-- 1 chris chris 12M Dec  1 14:26 ../data/haps_phase1.npz

In [ ]: