In [1]:
%run ../../../shared_setup.ipynb
In [2]:
truth_dir = '/data/plasmodium/pfalciparum/pf-crosses/data/evaluation/truth'
In [3]:
gatk_callset_fn_template = '/data/plasmodium/pfalciparum/pf-crosses/data/public/20141022/{cross}.gatk.final.npz'
In [4]:
def trim_alleles(a, b):
# SNPs or already trimmed
if len(a) == 1 or len(b) == 1:
return a, b
# sort by length, shortest first
reverse = False
if len(a) > len(b):
a, b = b, a
reverse = True
# pick off reference base
ref_base = a[0]
assert a[0] == b[0], (a, b)
# pick off suffix
a_suffix = a[1:]
b_suffix = b[1:]
# trim
if b_suffix.endswith(a_suffix):
a = ref_base
b = ref_base + b_suffix[:-1*len(a_suffix)]
if reverse:
return b, a
else:
return a, b
In [5]:
def tabulate_variants_gatk(cross, parent):
callset_gatk = np.load(gatk_callset_fn_template.format(cross=cross))
variants = callset_gatk['variants']
# select variants
filter_condition = numexpr.evaluate('~FILTER_CNV & '
'~FILTER_DUP_SITE & '
'~FILTER_LOW_CONFIDENCE & '
'~FILTER_LOW_CONFIDENCE_PARENT & '
'~FILTER_MISSING_PARENT & '
'~FILTER_NON_CORE & '
'~FILTER_NON_MENDELIAN',
local_dict=variants)
log(callset_gatk['calldata'].dtype.names[parent])
c2d = vcfnp.view2d(callset_gatk['calldata'])
genotype = c2d['genotype']
genotype_parent = genotype[:, parent]
genotype_condition = genotype_parent > 0
condition = filter_condition & genotype_condition
log('n_variants', nnz(condition))
# apply selection
variants = np.compress(condition, variants)
genotype_parent = np.compress(condition, genotype_parent)
# construct table
tbl = (etl
.fromarray(variants)
.addcolumn('genotype', genotype_parent)
.cut('CHROM', 'POS', 'REF', 'ALT', 'genotype')
.convert('ALT', lambda v, row: v[row.genotype-1], pass_row=True)
.cutout('genotype')
.convert(['CHROM', 'REF', 'ALT'], lambda v: str(v, 'ascii'))
.addfield('svlen', lambda row: len(row.ALT) - len(row.REF))
.addfield('trim', lambda row: trim_alleles(row.REF, row.ALT))
.unpack('trim', ['REF_trim', 'ALT_trim'])
.addfield('svlen_trim', lambda row: len(row.ALT_trim) - len(row.REF_trim))
.cutout('REF', 'ALT', 'svlen')
.rename({'REF_trim': 'REF', 'ALT_trim': 'ALT', 'svlen_trim': 'svlen'})
.addfield('svtype', lambda row: 'SNP' if row.svlen == 0 else 'INS' if row.svlen > 0 else 'DEL')
.addfield('discovery', True)
)
return tbl
In [6]:
def load_coverage(alignment_method, assembly, rebuild=False):
bam_fn = os.path.join(truth_dir, alignment_method, 'alignment', assembly + '.bam')
cov_fn = bam_fn + '.coverage.npy'
if not os.path.exists(cov_fn) or rebuild:
cov = pysamstats.load_coverage(bam_fn,
pad=True,
fields=['chrom', 'pos', 'reads_all'],
one_based=True)
np.save(cov_fn, cov)
else:
cov = np.load(cov_fn).view(np.recarray)
return cov
In [7]:
def tabulate_variants_truth(alignment_method, calling_method, assembly):
truth_vcf_fn = os.path.join(truth_dir, alignment_method, 'calling', calling_method, assembly + '.leftaligned.vcf.gz')
# extract variants
truth_variants = vcfnp.variants(truth_vcf_fn,
fields=['CHROM', 'POS', 'REF', 'ALT'],
dtypes={'REF': 'S200', 'ALT': 'S200'},
arities={'ALT': 4},
cache=False)
# extract genotype
truth_genotype = vcfnp.calldata_2d(truth_vcf_fn,
fields=['genotype'],
ploidy=1,
cache=False)['genotype'][:, 0]
# tabulate
tbl = (etl
.fromarray(truth_variants)
.addcolumn('genotype', truth_genotype)
.convert('ALT', lambda v, row: v[row.genotype-1], pass_row=True)
.cutout('genotype')
.convert(['CHROM', 'REF', 'ALT'], lambda v: str(v, 'ascii'))
.addfield('svlen', lambda row: len(row.ALT) - len(row.REF))
.addfield('trim', lambda row: trim_alleles(row.REF, row.ALT))
.unpack('trim', ['REF_trim', 'ALT_trim'])
.addfield('svlen_trim', lambda row: len(row.ALT_trim) - len(row.REF_trim))
.cutout('REF', 'ALT', 'svlen')
.rename({'REF_trim': 'REF', 'ALT_trim': 'ALT', 'svlen_trim': 'svlen'})
.intervalleftjoin(tbl_regions_1b, lkey='CHROM', rkey='region_chrom',
lstart='POS', lstop='POS', rstart='region_start', rstop='region_stop',
include_stop=True)
.eq('region_type', 'Core')
.cutout('region_chrom', 'region_start', 'region_stop', 'region_type', 'region_size')
.addfield('svtype', lambda row: 'SNP' if row.svlen == 0 else 'INS' if row.svlen > 0 else 'DEL')
.addfield('truth', True)
)
return tbl
In [12]:
def join_truth(tbl_discovery, alignment_method, calling_method, assembly, key=('CHROM', 'POS')):
# tabulate truth variants
tbl_truth = tabulate_variants_truth(alignment_method, calling_method, assembly)
# load coverage from truth assembly
cov = load_coverage(alignment_method, assembly)
cov_idx = allel.SortedMultiIndex(cov['chrom'], cov['pos'], copy=False)
# tbl_cov = etl.fromarray(cov).convert('chrom', lambda v: str(v, 'ascii')).rename({'chrom': 'CHROM', 'pos': 'POS'})
# tabulate
tbl = (
tbl_discovery
.outerjoin(tbl_truth, key=key, rprefix='truth_')
.rename('truth_truth', 'truth')
.addfield('truth_coverage', lambda row: cov.reads_all[cov_idx.locate_key(row.CHROM.encode('ascii'), row.POS)])
.eq('truth_coverage', 1)
.addfield('status', lambda row: 'FP' if row.truth is None else 'FN' if row.discovery is None else 'TP')
.cutout('discovery', 'truth')
.intervaljoinvalues(tbl_genes, value='feature_id', lkey='CHROM', lstart='POS', lstop='POS',
rkey='feature_chrom', rstart='feature_start', rstop='feature_stop', include_stop=True)
.rename('feature_id', 'gene')
.convert('gene', lambda v: v[0] if v else None)
.intervaljoinvalues(tbl_exons, value='feature_id', lkey='CHROM', lstart='POS', lstop='POS',
rkey='feature_chrom', rstart='feature_start', rstop='feature_stop', include_stop=True)
.rename('feature_id', 'is_coding')
.convert('is_coding', lambda v: len(v) > 0)
.cache(100000)
)
return tbl
In [13]:
def analyse_confusion(tbl_joined):
df = tbl_joined.cut('svtype', 'truth_svtype', 'status', 'is_coding').todataframe()
status = df.status
is_coding = df.is_coding
is_snp = (df.svtype == 'SNP')
is_indel = (df.svtype == 'INS') | (df.svtype == 'DEL')
is_truth_snp = (df.truth_svtype == 'SNP')
is_truth_indel = (df.truth_svtype == 'INS') | (df.truth_svtype == 'DEL')
tbl_confusion = [['svtype', 'n', 'TP', 'FP', 'FN', 'FDR', 'sensitivity']]
fig = plt.figure(figsize=(8, 6))
# all variants analyse confusion
fp = nnz(status == 'FP')
fn = nnz(status == 'FN')
tp = nnz(status == 'TP')
n = fp + fn + tp
fdr = fp / (fp + tp) if (fp + tp) > 0 else 0
sens = tp / (fn + tp) if (fn + tp) > 0 else 0
tbl_confusion.append(['all', n, tp, fp, fn, fdr, sens])
# coding SNP analyse confusion
fp = nnz(is_coding & is_snp & (status == 'FP'))
fn = nnz(is_coding & is_truth_snp & (status == 'FN'))
tp = nnz(is_coding & is_snp & (status == 'TP'))
n = fp + fn + tp
fdr = fp / (fp + tp) if (fp + tp) > 0 else 0
sens = tp / (fn + tp) if (fn + tp) > 0 else 0
tbl_confusion.append(['SNP coding', n, tp, fp, fn, fdr, sens])
from matplotlib_venn import venn2
ax = fig.add_subplot(2, 2, 1)
v = venn2(subsets=[fp, fn, tp], set_labels=['discovery', 'truth'], ax=ax)
v.get_patch_by_id('10').set_color('#ff4444')
v.get_patch_by_id('01').set_color('#4444ff')
plt.gca().set_title('SNP coding')
# non-coding SNP analyse confusion
fp = nnz(~is_coding & is_snp & (status == 'FP'))
fn = nnz(~is_coding & is_truth_snp & (status == 'FN'))
tp = nnz(~is_coding & is_snp & (status == 'TP'))
n = fp + fn + tp
fdr = fp / (fp + tp) if (fp + tp) > 0 else 0
sens = tp / (fn + tp) if (fn + tp) > 0 else 0
tbl_confusion.append(['SNP non-coding', n, tp, fp, fn, fdr, sens])
from matplotlib_venn import venn2
ax = fig.add_subplot(2, 2, 2)
v = venn2(subsets=[fp, fn, tp], set_labels=['discovery', 'truth'], ax=ax)
v.get_patch_by_id('10').set_color('#ff4444')
v.get_patch_by_id('01').set_color('#4444ff')
plt.gca().set_title('SNP non-coding')
# coding INDEL analyse confusion
fp = nnz(is_coding & is_indel & (status == 'FP'))
fn = nnz(is_coding & is_truth_indel & (status == 'FN'))
tp = nnz(is_coding & is_indel & (status == 'TP'))
n = fp + fn + tp
fdr = fp / (fp + tp) if (fp + tp) > 0 else 0
sens = tp / (fn + tp) if (fn + tp) > 0 else 0
tbl_confusion.append(['INDEL coding', n, tp, fp, fn, fdr, sens])
from matplotlib_venn import venn2
ax = fig.add_subplot(2, 2, 3)
v = venn2(subsets=[fp, fn, tp], set_labels=['discovery', 'truth'], ax=ax)
v.get_patch_by_id('10').set_color('#ff4444')
v.get_patch_by_id('01').set_color('#4444ff')
plt.gca().set_title('INDEL coding')
# non-coding INDEL analyse confusion
fp = nnz(~is_coding & is_indel & (status == 'FP'))
fn = nnz(~is_coding & is_truth_indel & (status == 'FN'))
tp = nnz(~is_coding & is_indel & (status == 'TP'))
n = fp + fn + tp
fdr = fp / (fp + tp) if (fp + tp) > 0 else 0
sens = tp / (fn + tp) if (fn + tp) > 0 else 0
tbl_confusion.append(['INDEL non-coding', n, tp, fp, fn, fdr, sens])
from matplotlib_venn import venn2
ax = fig.add_subplot(2, 2, 4)
v = venn2(subsets=[fp, fn, tp], set_labels=['discovery', 'truth'], ax=ax)
v.get_patch_by_id('10').set_color('#ff4444')
v.get_patch_by_id('01').set_color('#4444ff')
plt.gca().set_title('INDEL non-coding')
tbl_confusion = etl.wrap(tbl_confusion)
tbl_confusion.displayall()
In [14]:
def confusion_tr_style(row):
style = 'background-color: %s' % ('#4f4' if row.status == 'TP'
else '#f44' if row.status == 'FP'
else '#44f' if row.status == 'FN'
else 'white')
return style
In [15]:
# bwamem_intractg
# gatk_ug vs bcftools_multiallelic
tbl_variants_check1 = tabulate_variants_truth(alignment_method='bwamem_intractg',
calling_method='gatk_ug',
assembly='garimella_3d7_ERR019061_contigs').cutout('truth').addfield('discovery', True)
tbl_variants_check2 = join_truth(tbl_variants_check1,
alignment_method='bwamem_intractg',
calling_method='bcftools_multiallelic',
assembly='garimella_3d7_ERR019061_contigs',
key=('CHROM', 'POS', 'REF', 'ALT'))
log(tbl_variants_check2.nrows())
tbl_variants_check2.eq('status', 'FP').display(5, tr_style=confusion_tr_style)
tbl_variants_check2.eq('status', 'FN').display(5, tr_style=confusion_tr_style)
analyse_confusion(tbl_variants_check2)
In [16]:
# bwamem_intractg
# gatk_ug vs bcftools_consensus
tbl_variants_check1 = tabulate_variants_truth(alignment_method='bwamem_intractg',
calling_method='gatk_ug',
assembly='garimella_3d7_ERR019061_contigs').cutout('truth').addfield('discovery', True)
tbl_variants_check2 = join_truth(tbl_variants_check1,
alignment_method='bwamem_intractg',
calling_method='bcftools_consensus',
assembly='garimella_3d7_ERR019061_contigs',
key=('CHROM', 'POS', 'REF', 'ALT'))
tbl_variants_check2.eq('status', 'FP').display(5, tr_style=confusion_tr_style)
tbl_variants_check2.eq('status', 'FN').display(5, tr_style=confusion_tr_style)
analyse_confusion(tbl_variants_check2)
In [17]:
# bwamem_intractg
# bctfools_consensus vs bcftools_multiallelic
tbl_variants_check1 = tabulate_variants_truth(alignment_method='bwamem_intractg',
calling_method='bcftools_consensus',
assembly='garimella_3d7_ERR019061_contigs').cutout('truth').addfield('discovery', True)
tbl_variants_check2 = join_truth(tbl_variants_check1,
alignment_method='bwamem_intractg',
calling_method='bcftools_multiallelic',
assembly='garimella_3d7_ERR019061_contigs',
key=('CHROM', 'POS', 'REF', 'ALT'))
tbl_variants_check2.eq('status', 'FP').display(5, tr_style=confusion_tr_style)
tbl_variants_check2.eq('status', 'FN').display(5, tr_style=confusion_tr_style)
analyse_confusion(tbl_variants_check2)
In [18]:
# bwamem_intractg
# gatk_ug vs bcftools_multiallelic
tbl_variants_check1 = tabulate_variants_truth(alignment_method='bwamem_intractg',
calling_method='gatk_ug',
assembly='genbank_hb3_coding_sequences').cutout('truth').addfield('discovery', True)
tbl_variants_check2 = join_truth(tbl_variants_check1,
alignment_method='bwamem_intractg',
calling_method='bcftools_multiallelic',
assembly='genbank_hb3_coding_sequences',
key=('CHROM', 'POS', 'REF', 'ALT'))
log(tbl_variants_check2.nrows())
tbl_variants_check2.valuecounts('gene').sort('gene').displayall()
tbl_variants_check2.eq('status', 'FP').display(5, tr_style=confusion_tr_style)
tbl_variants_check2.eq('status', 'FN').display(5, tr_style=confusion_tr_style)
analyse_confusion(tbl_variants_check2)
In [19]:
# bwamem_intractg
# bcftools_consensus vs bcftools_multiallelic
tbl_variants_check1 = tabulate_variants_truth(alignment_method='bwamem_intractg',
calling_method='bcftools_consensus',
assembly='genbank_hb3_coding_sequences').cutout('truth').addfield('discovery', True)
tbl_variants_check2 = join_truth(tbl_variants_check1,
alignment_method='bwamem_intractg',
calling_method='bcftools_multiallelic',
assembly='genbank_hb3_coding_sequences',
key=('CHROM', 'POS', 'REF', 'ALT'))
log(tbl_variants_check2.nrows())
tbl_variants_check2.eq('status', 'FP').display(5, tr_style=confusion_tr_style)
tbl_variants_check2.eq('status', 'FN').display(5, tr_style=confusion_tr_style)
analyse_confusion(tbl_variants_check2)
In [20]:
tbl_variants_3d7_gatk = tabulate_variants_gatk('3d7_hb3', 0)
tbl_variants_3d7_gatk
Out[20]:
In [21]:
tbl_variants_3d7_gatk_vs_garimella_allele = join_truth(
tbl_variants_3d7_gatk,
alignment_method='bwamem_intractg',
calling_method='bcftools_multiallelic',
assembly='garimella_3d7_ERR019061_contigs',
key=('CHROM', 'POS', 'REF', 'ALT')
)
log(tbl_variants_3d7_gatk_vs_garimella_allele.nrows())
tbl_variants_3d7_gatk_vs_garimella_allele.eq('status', 'FP').display(5, tr_style=confusion_tr_style)
tbl_variants_3d7_gatk_vs_garimella_allele.eq('status', 'FN').display(5, tr_style=confusion_tr_style)
analyse_confusion(tbl_variants_3d7_gatk_vs_garimella_allele)
In [22]:
tbl_variants_hb31_gatk = tabulate_variants_gatk('3d7_hb3', 1)
tbl_variants_hb31_gatk
Out[22]:
In [23]:
tbl_variants_hb31_gatk_vs_garimella_allele = join_truth(
tbl_variants_hb31_gatk,
alignment_method='bwamem_intractg',
calling_method='bcftools_multiallelic',
assembly='garimella_hb3_ERR019054_contigs',
key=('CHROM', 'POS', 'REF', 'ALT')
)
log(tbl_variants_hb31_gatk_vs_garimella_allele.nrows())
tbl_variants_hb31_gatk_vs_garimella_allele.eq('status', 'FP').display(5, tr_style=confusion_tr_style)
tbl_variants_hb31_gatk_vs_garimella_allele.eq('status', 'FN').display(5, tr_style=confusion_tr_style)
analyse_confusion(tbl_variants_hb31_gatk_vs_garimella_allele)
In [24]:
tbl_variants_hb31_gatk_vs_genbank_site = join_truth(tbl_variants_hb31_gatk,
alignment_method='bwamem_intractg',
calling_method='bcftools_consensus',
assembly='genbank_hb3_coding_sequences',
key=('CHROM', 'POS'))
tbl_variants_hb31_gatk_vs_genbank_site.display(20, tr_style=confusion_tr_style)
In [25]:
tbl_variants_hb31_gatk_vs_genbank_site.valuecounts('gene').displayall()
In [26]:
analyse_confusion(tbl_variants_hb31_gatk_vs_genbank_site)
In [27]:
tbl_variants_hb31_gatk_vs_genbank_site.select(lambda row: row.svtype in {'INS', 'DEL'} or row.truth_svtype in {'INS', 'DEL'}).displayall(tr_style=confusion_tr_style)
In [ ]: