In [2]:
%matplotlib inline
import re
import sys
import math
import matplotlib
import seaborn
import numpy
from matplotlib import pyplot as plt
from collections import defaultdict

from evalutils import IntervalForest, populate_index_from_bed, compact
from evalutils import subset_variants_bed, load_kevlar_vcf, load_gatk_mvf, load_triodenovo_vcf
import kevlar

seaborn.set_context({'figure.figsize': (18, 12)})
matplotlib.rcParams['axes.labelsize'] = 16
matplotlib.rcParams['xtick.labelsize'] = 14
matplotlib.rcParams['ytick.labelsize'] = 14

In [3]:
def roc(calls, index, subset=None, delta=10, fmt='vcf', limit=120):
    ncorrect = 0
    num_true_calls_per_false_call = list()
    counted = 0
    for varcall in calls:
        if fmt == 'vcf':
            valid = index.query(varcall.seqid, varcall.position, delta=delta) != set()
            if valid and subset:
                if subset.query(varcall.seqid, varcall.position, delta=delta) == set():
                    continue
        elif fmt == 'mvf':
            callindex, call = varcall
            valid = index.query(call['CHROM'], call['POS'], delta=delta) != set()
        else:
            raise ValueError('unknown format "'+ fmt +'"')
        counted += 1
        if counted > limit:
            break
        if valid:
            ncorrect += 1
            continue
        num_true_calls_per_false_call.append(ncorrect)
    if len(num_true_calls_per_false_call) == 0 or ncorrect > num_true_calls_per_false_call[-1]:
        num_true_calls_per_false_call.append(ncorrect)
    return num_true_calls_per_false_call

In [4]:
delta = 10
with kevlar.open('14153-refr-calls-denovodb-dedup.bed', 'r') as bedin:
    index = populate_index_from_bed(bedin)
with kevlar.open('14153-refr-calls-denovodb-validated-dedup.bed', 'r') as bedin:
    validated_index = populate_index_from_bed(bedin)
kevlarcalls = list(load_kevlar_vcf('calls.likescoremin50.varfilt.scored.vcf.gz', index, delta=delta))

In [5]:
all_limit = 105
truecalls = roc(kevlarcalls, index, delta=delta, fmt='vcf', limit=all_limit)
truecalls_validated = roc(kevlarcalls, index, subset=validated_index, delta=delta, fmt='vcf', limit=17)
fig, ax1 = plt.subplots()

ax1.plot(range(len(truecalls_validated)), truecalls_validated, color='blue')
ax1.plot(range(len(truecalls_validated)), truecalls_validated, '^', color='blue', markevery=1)
ax1.set_xlabel('Incongruent calls', fontsize=16)
ax1.set_ylim((0, 14))
yt = [0, 2, 4, 6, 8, 10, 12, 14]
ax1.set_yticks(yt)
ax1.set_yticklabels(['{:d}%\n({:d})'.format(round(t / max(yt) * 100), t) for t in yt])
ax1.set_ylabel('Congruent calls (validated, n=14)', fontsize=16, color='blue')
ax1.tick_params('y', colors='blue')

ax2 = ax1.twinx()
ax2.plot(range(len(truecalls)), truecalls, color='red')
ax2.plot(range(len(truecalls)), truecalls, 'o', color='red', markevery=1)
ax2.set_ylim((0, all_limit))
ax2.set_ylabel('Congruent calls (all)', fontsize=16, color='red')
ax2.tick_params('y', colors='red')

plt.savefig('kevlar-vs-denovodb-validated-and-all.pdf', dpi=300)



In [ ]: