In [2]:
%matplotlib inline
import re
import sys
import math
import matplotlib
import seaborn
import numpy
from matplotlib import pyplot as plt
from collections import defaultdict
from evalutils import IntervalForest, populate_index_from_simulation, compact
from evalutils import subset_variants, load_kevlar_vcf, load_gatk_mvf, load_triodenovo_vcf
import kevlar
seaborn.set_context({'figure.figsize': (24, 12)})
matplotlib.rcParams['axes.labelsize'] = 16
matplotlib.rcParams['xtick.labelsize'] = 14
matplotlib.rcParams['ytick.labelsize'] = 14
In [3]:
def roc(calls, index, delta=10, fmt='vcf'):
ncorrect = 0
num_true_calls_per_false_call = list()
for varcall in calls:
if fmt == 'vcf':
valid = index.query(varcall.seqid, varcall.position, delta=delta) != set()
elif fmt == 'mvf':
callindex, call = varcall
valid = index.query(call['CHROM'], call['POS'], delta=delta) != set()
else:
raise ValueError('unknown format "'+ fmt +'"')
if valid:
ncorrect += 1
continue
num_true_calls_per_false_call.append(ncorrect)
if len(num_true_calls_per_false_call) == 0 or ncorrect > num_true_calls_per_false_call[-1]:
num_true_calls_per_false_call.append(ncorrect)
return num_true_calls_per_false_call
In [4]:
def doplot(axis, data, color, label, linestyle, symbol):
if len(data) == 1:
axis.plot(range(len(data)), data, symbol, color=color, label=label)
else:
axis.plot(range(len(data)), data, color=color, linestyle=linestyle, label=label)
In [5]:
delta = 10
for cov1, cov2 in (('10', '20'), ('30', '50')):
categories = [
('SNV', None, None, 'SNVs'),
('INDEL', 1, 10, 'INDELs 1-10bp'),
('INDEL', 11, 100, 'INDELs 11-100bp'),
('INDEL', 101, 200, 'INDELs 101-200bp'),
('INDEL', 201, 300, 'INDELs 201-300bp'),
('INDEL', 301, 400, 'INDELs 301-400bp'),
]
fig, ((ax11, ax12, ax13), (ax21, ax22, ax23)) = plt.subplots(2, 3)
axes = (ax11, ax12, ax13, ax21, ax22, ax23)
seaborn.set_context({'figure.figsize': (24, 12)})
for i, (category, axis) in enumerate(zip(categories, axes)):
vartype, minlength, maxlength, label = category
with kevlar.open('SimulatedVariants_chr17_hg38.tsv.gz', 'r') as instream:
variants = subset_variants(instream, vartype, minlength, maxlength)
index = populate_index_from_simulation(variants, 'chr17')
for cov, linestyle in zip((cov1, cov2), (':', '-')):
kevlar_truecalls = roc(
load_kevlar_vcf(
'kevlar_calls_'+ cov +'x.vcf.gz', index, delta=delta,
vartype=vartype, minlength=minlength, maxlength=maxlength
),
index, delta=delta, fmt='vcf'
)
gatk_truecalls = roc(
load_gatk_mvf(
'GATK_calls_'+ cov +'x.mvf.gz',
vartype=vartype, minlength=minlength, maxlength=maxlength
).iterrows(),
index, delta=delta, fmt='mvf'
)
triodenovo_truecalls = roc(
load_triodenovo_vcf(
'triodenovo_calls_'+ cov +'x.vcf.gz', cov=cov,
vartype=vartype, minlength=minlength, maxlength=maxlength,
),
index, delta=delta, fmt='vcf'
)
doplot(axis, kevlar_truecalls, 'red', 'kevlar ({}x)'.format(cov), linestyle, 'o')
doplot(axis, gatk_truecalls, 'blue', 'GATK+PBT ({}x)'.format(cov), linestyle, 's')
doplot(axis, triodenovo_truecalls, 'green', 'triodenovo ({}x)'.format(cov), linestyle, 'p')
nvariants = len(index.trees['chr17'])
ticknums = [0, math.ceil(nvariants * 0.25), int(nvariants * 0.5), math.ceil(nvariants * 0.75), nvariants]
ticklabels = ['{:d}%\n({:d})'.format(int(tn / nvariants * 100), tn) for tn in ticknums]
_ = axis.set_xlabel('False calls', fontsize=16)
_ = axis.set_yticks(ticknums)
_ = axis.set_yticklabels(ticklabels)
_ = axis.set_ylabel('True calls', fontsize=16)
_ = axis.set_ylim((0, nvariants))
_ = axis.set_title(label, fontsize=18)
if i == 0:
_ = axis.legend(fontsize=14)
_ = plt.subplots_adjust(hspace=0.3)
_ = plt.savefig('three-callers-'+ cov +'x-combined.png', dpi=300)
_ = plt.show()
In [6]:
!./evaluate.py --do-all - -
In [7]:
delta = 10
categories = [
('SNV', None, None, 'SNVs'),
('INDEL', 1, 10, 'INDELs 1-10bp'),
('INDEL', 11, 100, 'INDELs 11-100bp'),
('INDEL', 101, 200, 'INDELs 101-200bp'),
('INDEL', 201, 300, 'INDELs 201-300bp'),
('INDEL', 301, 400, 'INDELs 301-400bp'),
]
coverages = ('10', '20', '30', '50')
linestyles = (':', '-.', '--', '-')
fig, ((ax11, ax12, ax13), (ax21, ax22, ax23)) = plt.subplots(2, 3)
axes = (ax11, ax12, ax13, ax21, ax22, ax23)
seaborn.set_context({'figure.figsize': (24, 12)})
for i, (category, axis) in enumerate(zip(categories, axes)):
vartype, minlength, maxlength, label = category
with kevlar.open('SimulatedVariants_chr17_hg38.tsv.gz', 'r') as instream:
variants = subset_variants(instream, vartype, minlength, maxlength)
index = populate_index_from_simulation(variants, 'chr17')
for cov, linestyle in zip(coverages, linestyles):
kevlar_truecalls = roc(
load_kevlar_vcf(
'kevlar_calls_'+ cov +'x.vcf.gz', index, delta=delta,
vartype=vartype, minlength=minlength, maxlength=maxlength
),
index, delta=delta, fmt='vcf'
)
gatk_truecalls = roc(
load_gatk_mvf(
'GATK_calls_'+ cov +'x.mvf.gz',
vartype=vartype, minlength=minlength, maxlength=maxlength
).iterrows(),
index, delta=delta, fmt='mvf'
)
triodenovo_truecalls = roc(
load_triodenovo_vcf(
'triodenovo_calls_'+ cov +'x.vcf.gz', cov=cov,
vartype=vartype, minlength=minlength, maxlength=maxlength,
),
index, delta=delta, fmt='vcf'
)
doplot(axis, kevlar_truecalls, 'red', 'kevlar ({}x)'.format(cov), linestyle, 'o')
doplot(axis, gatk_truecalls, 'blue', 'GATK+PBT ({}x)'.format(cov), linestyle, 's')
doplot(axis, triodenovo_truecalls, 'green', 'triodenovo ({}x)'.format(cov), linestyle, 'p')
nvariants = len(index.trees['chr17'])
ticknums = [0, math.ceil(nvariants * 0.25), int(nvariants * 0.5), math.ceil(nvariants * 0.75), nvariants]
ticklabels = ['{:d}%\n({:d})'.format(int(tn / nvariants * 100), tn) for tn in ticknums]
_ = axis.set_xlabel('False calls')
_ = axis.set_yticks(ticknums)
_ = axis.set_yticklabels(ticklabels)
_ = axis.set_ylabel('True calls')
_ = axis.set_ylim((0, nvariants))
_ = axis.set_title(label)
if i == 0:
_ = axis.legend()
# _ = fig.tight_layout()
_ = plt.savefig('three-callers-all.png', dpi=300)
_ = plt.show()