In [2]:
%matplotlib inline
import re
import sys
import math
import matplotlib
import seaborn
import numpy
from matplotlib import pyplot as plt
from collections import defaultdict

from evalutils import IntervalForest, populate_index_from_simulation, compact
from evalutils import subset_variants, load_kevlar_vcf, load_gatk_mvf, load_triodenovo_vcf
import kevlar

seaborn.set_context({'figure.figsize': (24, 12)})
matplotlib.rcParams['axes.labelsize'] = 16
matplotlib.rcParams['xtick.labelsize'] = 14
matplotlib.rcParams['ytick.labelsize'] = 14

In [3]:
def roc(calls, index, delta=10, fmt='vcf'):
    ncorrect = 0
    num_true_calls_per_false_call = list()
    for varcall in calls:
        if fmt == 'vcf':
            valid = index.query(varcall.seqid, varcall.position, delta=delta) != set()
        elif fmt == 'mvf':
            callindex, call = varcall
            valid = index.query(call['CHROM'], call['POS'], delta=delta) != set()
        else:
            raise ValueError('unknown format "'+ fmt +'"')
        if valid:
            ncorrect += 1
            continue
        num_true_calls_per_false_call.append(ncorrect)
    if len(num_true_calls_per_false_call) == 0 or ncorrect > num_true_calls_per_false_call[-1]:
        num_true_calls_per_false_call.append(ncorrect)
    return num_true_calls_per_false_call

In [4]:
def doplot(axis, data, color, label, linestyle, symbol):
    if len(data) == 1:
        axis.plot(range(len(data)), data, symbol, color=color, label=label)
    else:
        axis.plot(range(len(data)), data, color=color, linestyle=linestyle, label=label)

In [5]:
delta = 10
for cov1, cov2 in (('10', '20'), ('30', '50')):
    categories = [
        ('SNV', None, None, 'SNVs'),
        ('INDEL', 1, 10, 'INDELs 1-10bp'),
        ('INDEL', 11, 100, 'INDELs 11-100bp'),
        ('INDEL', 101, 200, 'INDELs 101-200bp'),
        ('INDEL', 201, 300, 'INDELs 201-300bp'),
        ('INDEL', 301, 400, 'INDELs 301-400bp'),
    ]
    fig, ((ax11, ax12, ax13), (ax21, ax22, ax23)) = plt.subplots(2, 3)
    axes = (ax11, ax12, ax13, ax21, ax22, ax23)
    seaborn.set_context({'figure.figsize': (24, 12)})
    
    for i, (category, axis) in enumerate(zip(categories, axes)):
        vartype, minlength, maxlength, label = category
        with kevlar.open('SimulatedVariants_chr17_hg38.tsv.gz', 'r') as instream:
            variants = subset_variants(instream, vartype, minlength, maxlength)
            index = populate_index_from_simulation(variants, 'chr17')
        
            for cov, linestyle in zip((cov1, cov2), (':', '-')):
                kevlar_truecalls = roc(
                    load_kevlar_vcf(
                        'kevlar_calls_'+ cov +'x.vcf.gz', index, delta=delta,
                        vartype=vartype, minlength=minlength, maxlength=maxlength
                    ),
                    index, delta=delta, fmt='vcf'
                )
                gatk_truecalls = roc(
                    load_gatk_mvf(
                        'GATK_calls_'+ cov +'x.mvf.gz',
                        vartype=vartype, minlength=minlength, maxlength=maxlength
                    ).iterrows(),
                    index, delta=delta, fmt='mvf'
                )
                triodenovo_truecalls = roc(
                    load_triodenovo_vcf(
                        'triodenovo_calls_'+ cov +'x.vcf.gz', cov=cov,
                        vartype=vartype, minlength=minlength, maxlength=maxlength,
                    ),
                    index, delta=delta, fmt='vcf'
                )
                doplot(axis, kevlar_truecalls, 'red', 'kevlar ({}x)'.format(cov), linestyle, 'o')
                doplot(axis, gatk_truecalls, 'blue', 'GATK+PBT ({}x)'.format(cov), linestyle, 's')
                doplot(axis, triodenovo_truecalls, 'green', 'triodenovo ({}x)'.format(cov), linestyle, 'p')
            
            nvariants = len(index.trees['chr17'])
            ticknums = [0, math.ceil(nvariants * 0.25), int(nvariants * 0.5), math.ceil(nvariants * 0.75), nvariants]
            ticklabels = ['{:d}%\n({:d})'.format(int(tn / nvariants * 100), tn) for tn in ticknums]
            
            _ = axis.set_xlabel('False calls', fontsize=16)
            _ = axis.set_yticks(ticknums)
            _ = axis.set_yticklabels(ticklabels)
            _ = axis.set_ylabel('True calls', fontsize=16)
            _ = axis.set_ylim((0, nvariants))
            _ = axis.set_title(label, fontsize=18)
            if i == 0:
                _ = axis.legend(fontsize=14)
    
    _ = plt.subplots_adjust(hspace=0.3)
    _ = plt.savefig('three-callers-'+ cov +'x-combined.png', dpi=300)
    _ = plt.show()


WARNING: found 2 matches for CALLCLASS 1995
WARNING: found 2 matches for CALLCLASS 187
WARNING: found 2 matches for CALLCLASS 1182
WARNING: found 2 matches for CALLCLASS 1094

In [6]:
!./evaluate.py --do-all - -


WARNING: found 2 matches for CALLCLASS 1995
WARNING: found 2 matches for CALLCLASS 187
WARNING: found 2 matches for CALLCLASS 1182
WARNING: found 2 matches for CALLCLASS 1094
Caller Coverage          VarType Correct False Missing
    Kevlar       10              SNV      32   610     222
      GATK       10              SNV       1    41     253
TrioDenovo       10              SNV     122   103     132
    Kevlar       10     INDEL 1-10bp      35    21     243
      GATK       10     INDEL 1-10bp       0     4     278
TrioDenovo       10     INDEL 1-10bp     124    23     154
    Kevlar       10   INDEL 11-100bp      63    22     207
      GATK       10   INDEL 11-100bp       3     0     267
TrioDenovo       10   INDEL 11-100bp     117    26     153
    Kevlar       10  INDEL 101-200bp      52    13     220
      GATK       10  INDEL 101-200bp       1     0     271
TrioDenovo       10  INDEL 101-200bp      44     0     228
    Kevlar       10  INDEL 201-300bp      40     3     248
      GATK       10  INDEL 201-300bp       0     0     288
TrioDenovo       10  INDEL 201-300bp       1     0     287
    Kevlar       10  INDEL 301-400bp      38     0     246
      GATK       10  INDEL 301-400bp       0     0     284
TrioDenovo       10  INDEL 301-400bp       0     0     284
    Kevlar       20              SNV     177   201      77
      GATK       20              SNV     158    64      96
TrioDenovo       20              SNV     239    47      15
    Kevlar       20     INDEL 1-10bp     180     9      98
      GATK       20     INDEL 1-10bp     179    58      99
TrioDenovo       20     INDEL 1-10bp     257    55      21
    Kevlar       20   INDEL 11-100bp     212    17      58
      GATK       20   INDEL 11-100bp     179    76      91
TrioDenovo       20   INDEL 11-100bp     225    87      45
    Kevlar       20  INDEL 101-200bp     200     4      72
      GATK       20  INDEL 101-200bp      89     1     183
TrioDenovo       20  INDEL 101-200bp     117     1     155
    Kevlar       20  INDEL 201-300bp     209     6      79
      GATK       20  INDEL 201-300bp      12     0     276
TrioDenovo       20  INDEL 201-300bp      15     0     273
    Kevlar       20  INDEL 301-400bp     214     1      70
      GATK       20  INDEL 301-400bp       0     0     284
TrioDenovo       20  INDEL 301-400bp       0     0     284
    Kevlar       30              SNV     232    28      22
      GATK       30              SNV     247    86       7
TrioDenovo       30              SNV     250    48       4
    Kevlar       30     INDEL 1-10bp     240     3      38
      GATK       30     INDEL 1-10bp     256    62      22
TrioDenovo       30     INDEL 1-10bp     258    60      20
    Kevlar       30   INDEL 11-100bp     244     2      26
      GATK       30   INDEL 11-100bp     228   110      42
TrioDenovo       30   INDEL 11-100bp     228   109      42
    Kevlar       30  INDEL 101-200bp     252     1      20
      GATK       30  INDEL 101-200bp     129     8     143
TrioDenovo       30  INDEL 101-200bp     131     8     141
    Kevlar       30  INDEL 201-300bp     261     2      27
      GATK       30  INDEL 201-300bp      12     0     276
TrioDenovo       30  INDEL 201-300bp      13     0     275
    Kevlar       30  INDEL 301-400bp     253     4      31
      GATK       30  INDEL 301-400bp       0     0     284
TrioDenovo       30  INDEL 301-400bp       0     0     284
    Kevlar       50              SNV     242    16      12
      GATK       50              SNV     250   130       4
TrioDenovo       50              SNV     251    90       3
    Kevlar       50     INDEL 1-10bp     251     8      27
      GATK       50     INDEL 1-10bp     261    56      17
TrioDenovo       50     INDEL 1-10bp     261    62      17
    Kevlar       50   INDEL 11-100bp     247     3      23
      GATK       50   INDEL 11-100bp     240   108      30
TrioDenovo       50   INDEL 11-100bp     240   107      30
    Kevlar       50  INDEL 101-200bp     254     3      18
      GATK       50  INDEL 101-200bp     132     5     140
TrioDenovo       50  INDEL 101-200bp     132     5     140
    Kevlar       50  INDEL 201-300bp     262     3      26
      GATK       50  INDEL 201-300bp      21     0     267
TrioDenovo       50  INDEL 201-300bp      21     0     267
    Kevlar       50  INDEL 301-400bp     257     6      27
      GATK       50  INDEL 301-400bp       0     0     284
TrioDenovo       50  INDEL 301-400bp       0     0     284

In [7]:
delta = 10

categories = [
    ('SNV', None, None, 'SNVs'),
    ('INDEL', 1, 10, 'INDELs 1-10bp'),
    ('INDEL', 11, 100, 'INDELs 11-100bp'),
    ('INDEL', 101, 200, 'INDELs 101-200bp'),
    ('INDEL', 201, 300, 'INDELs 201-300bp'),
    ('INDEL', 301, 400, 'INDELs 301-400bp'),
]
coverages = ('10', '20', '30', '50')
linestyles = (':', '-.', '--', '-')

fig, ((ax11, ax12, ax13), (ax21, ax22, ax23)) = plt.subplots(2, 3)
axes = (ax11, ax12, ax13, ax21, ax22, ax23)
seaborn.set_context({'figure.figsize': (24, 12)})

for i, (category, axis) in enumerate(zip(categories, axes)):
    vartype, minlength, maxlength, label = category
    with kevlar.open('SimulatedVariants_chr17_hg38.tsv.gz', 'r') as instream:
        variants = subset_variants(instream, vartype, minlength, maxlength)
        index = populate_index_from_simulation(variants, 'chr17')
    
        for cov, linestyle in zip(coverages, linestyles):
            kevlar_truecalls = roc(
                load_kevlar_vcf(
                    'kevlar_calls_'+ cov +'x.vcf.gz', index, delta=delta,
                    vartype=vartype, minlength=minlength, maxlength=maxlength
                ),
                index, delta=delta, fmt='vcf'
            )
            gatk_truecalls = roc(
                load_gatk_mvf(
                    'GATK_calls_'+ cov +'x.mvf.gz',
                    vartype=vartype, minlength=minlength, maxlength=maxlength
                ).iterrows(),
                index, delta=delta, fmt='mvf'
            )
            triodenovo_truecalls = roc(
                load_triodenovo_vcf(
                    'triodenovo_calls_'+ cov +'x.vcf.gz', cov=cov,
                    vartype=vartype, minlength=minlength, maxlength=maxlength,
                ),
                index, delta=delta, fmt='vcf'
            )
            doplot(axis, kevlar_truecalls, 'red', 'kevlar ({}x)'.format(cov), linestyle, 'o')
            doplot(axis, gatk_truecalls, 'blue', 'GATK+PBT ({}x)'.format(cov), linestyle, 's')
            doplot(axis, triodenovo_truecalls, 'green', 'triodenovo ({}x)'.format(cov), linestyle, 'p')
        
        nvariants = len(index.trees['chr17'])
        ticknums = [0, math.ceil(nvariants * 0.25), int(nvariants * 0.5), math.ceil(nvariants * 0.75), nvariants]
        ticklabels = ['{:d}%\n({:d})'.format(int(tn / nvariants * 100), tn) for tn in ticknums]
        
        _ = axis.set_xlabel('False calls')
        _ = axis.set_yticks(ticknums)
        _ = axis.set_yticklabels(ticklabels)
        _ = axis.set_ylabel('True calls')
        _ = axis.set_ylim((0, nvariants))
        _ = axis.set_title(label)
        if i == 0:
            _ = axis.legend()
    
# _ = fig.tight_layout()
_ = plt.savefig('three-callers-all.png', dpi=300)
_ = plt.show()


WARNING: found 2 matches for CALLCLASS 1995
WARNING: found 2 matches for CALLCLASS 1182
WARNING: found 2 matches for CALLCLASS 1094
WARNING: found 2 matches for CALLCLASS 187