In [2]:
%matplotlib inline
import re
import sys
import math
import matplotlib
import seaborn
import numpy
from matplotlib import pyplot as plt
from collections import defaultdict
from evalutils import IntervalForest, populate_index_from_bed, compact
from evalutils import subset_variants_bed, load_kevlar_vcf, load_gatk_mvf, load_triodenovo_vcf
import kevlar
seaborn.set_context({'figure.figsize': (18, 12)})
matplotlib.rcParams['axes.labelsize'] = 16
matplotlib.rcParams['xtick.labelsize'] = 14
matplotlib.rcParams['ytick.labelsize'] = 14
In [3]:
def roc(calls, index, subset=None, delta=10, fmt='vcf', limit=120):
ncorrect = 0
num_true_calls_per_false_call = list()
counted = 0
for varcall in calls:
if fmt == 'vcf':
valid = index.query(varcall.seqid, varcall.position, delta=delta) != set()
if valid and subset:
if subset.query(varcall.seqid, varcall.position, delta=delta) == set():
continue
elif fmt == 'mvf':
callindex, call = varcall
valid = index.query(call['CHROM'], call['POS'], delta=delta) != set()
else:
raise ValueError('unknown format "'+ fmt +'"')
counted += 1
if counted > limit:
break
if valid:
ncorrect += 1
continue
num_true_calls_per_false_call.append(ncorrect)
if len(num_true_calls_per_false_call) == 0 or ncorrect > num_true_calls_per_false_call[-1]:
num_true_calls_per_false_call.append(ncorrect)
return num_true_calls_per_false_call
In [4]:
delta = 10
with kevlar.open('14153-refr-calls-denovodb-dedup.bed', 'r') as bedin:
index = populate_index_from_bed(bedin)
with kevlar.open('14153-refr-calls-denovodb-validated-dedup.bed', 'r') as bedin:
validated_index = populate_index_from_bed(bedin)
kevlarcalls = list(load_kevlar_vcf('calls.likescoremin50.varfilt.scored.vcf.gz', index, delta=delta))
In [5]:
all_limit = 105
truecalls = roc(kevlarcalls, index, delta=delta, fmt='vcf', limit=all_limit)
truecalls_validated = roc(kevlarcalls, index, subset=validated_index, delta=delta, fmt='vcf', limit=17)
fig, ax1 = plt.subplots()
ax1.plot(range(len(truecalls_validated)), truecalls_validated, color='blue')
ax1.plot(range(len(truecalls_validated)), truecalls_validated, '^', color='blue', markevery=1)
ax1.set_xlabel('Incongruent calls', fontsize=16)
ax1.set_ylim((0, 14))
yt = [0, 2, 4, 6, 8, 10, 12, 14]
ax1.set_yticks(yt)
ax1.set_yticklabels(['{:d}%\n({:d})'.format(round(t / max(yt) * 100), t) for t in yt])
ax1.set_ylabel('Congruent calls (validated, n=14)', fontsize=16, color='blue')
ax1.tick_params('y', colors='blue')
ax2 = ax1.twinx()
ax2.plot(range(len(truecalls)), truecalls, color='red')
ax2.plot(range(len(truecalls)), truecalls, 'o', color='red', markevery=1)
ax2.set_ylim((0, all_limit))
ax2.set_ylabel('Congruent calls (all)', fontsize=16, color='red')
ax2.tick_params('y', colors='red')
plt.savefig('kevlar-vs-denovodb-validated-and-all.pdf', dpi=300)
In [ ]: