In [1]:
import numpy as np
import math
import csv
import glob
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from itertools import cycle
from skbio.stats.composition import clr
from scipy.spatial.distance import dice
from sklearn import svm
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from numpy import interp  # scipy.interp is deprecated; numpy.interp is the same function
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics.pairwise import pairwise_distances


sns.set_style("ticks")
sns.palplot(sns.color_palette("colorblind", 10))

%matplotlib inline

In [2]:
COLORS=dict(zip(['BURST 95', 'BURST 98', 'BURST 95 Taxonomy', 'BURST 98 Taxonomy',
       'UTree', 'Bowtie2 95', 'Bowtie2 98', "Kraken", "Centrifuge"], sns.color_palette("husl", 9)))
DEPTHS = ['0001', '001', '01', '1', 'fulldepth']
COLUMNS = ['aligner', 'depth', 'tp', 'fp', 'fn', 'precision', 'recall', 'f1', 'level', 'site', 'spearman', 'pearson', 'jaccard']

# DEPTHS_DICT (total reads per subsampling depth) is computed below
CLR = True
# CLASSIFIER = lambda: OneVsRestClassifier(RandomForestClassifier(n_estimators=12))
CLASSIFIER = lambda: OneVsRestClassifier(svm.SVC(kernel='linear', probability=True, random_state=1))
CLASSES = ["tongue_dorsum", "stool", "supragingival_plaque", "right_retroauricular_crease", 
           "left_retroauricular_crease", "subgingival_plaque"]
CLASSES_MAP = dict(zip(CLASSES, ("oral", "stool", "oral", "skin", "skin", "oral")))
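
The CLASSIFIER factory above appears to feed a later ROC analysis (note the roc_curve/label_binarize imports). As a minimal sketch with synthetic data (make_classification stand-ins, not the real taxa tables), this is how such a factory would be consumed:

In [ ]:
# Hypothetical smoke test of the one-vs-rest SVM factory above.
# _X/_y are synthetic stand-ins, not the CLR-normalized taxa tables.
_X, _y = datasets.make_classification(n_samples=60, n_features=5, n_informative=3,
                                      n_classes=3, random_state=1)
_Y = label_binarize(_y, classes=[0, 1, 2])
_clf = CLASSIFIER().fit(_X, _Y)
_clf.predict_proba(_X[:2])  # one column of probabilities per class, suitable for ROC curves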

In [3]:
def save_plot(fig, pltname, artists=()):
    fig.savefig(os.path.join("..", "figures", "simulations_" + pltname + ".png"), dpi=300, bbox_extra_artists=artists, bbox_inches='tight')

In [4]:
# Sum the read counts of each simulation at every subsampling depth
from collections import defaultdict
DEPTHS_DICT = defaultdict(int)
files = glob.glob("../results/simulations_all_95/*1/count.seqs.txt")
for file in files:
    sequence_depth = !cat {file}
    depth = file.split('/')[-2].split('_')[-2]
    DEPTHS_DICT[depth] += int(sequence_depth[0])
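
The `!cat` call above is an IPython shell magic; a plain-Python sketch of the same tally, assuming each count.seqs.txt holds a single integer, would be:

In [ ]:
# Portable alternative to the shell magic above; same file layout assumed.
DEPTHS_DICT_ALT = defaultdict(int)
for file in glob.glob("../results/simulations_all_95/*1/count.seqs.txt"):
    depth = file.split('/')[-2].split('_')[-2]
    with open(file) as fh:
        DEPTHS_DICT_ALT[depth] += int(fh.read().strip())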

In [5]:
DEPTHS_DICT


Out[5]:
defaultdict(int,
            {'0001': 3036,
             '001': 29978,
             '01': 300143,
             '1': 2999770,
             'fulldepth': 30000003})
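
These totals are roughly decade-spaced, and each one is divided over three sites in the metric functions below; the log10 binning used later for the plot x-axes looks like this:

In [ ]:
# Per-site read counts (one third of each total) and their log10 bins,
# mirroring the depth_log10 column computed later.
{k: round(math.log10(v/3.)) for k, v in DEPTHS_DICT.items()}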

In [6]:
# Load up the list of taxonomic profilers
taxonomic_profilers = ['UTree', 'BURST 95', 'BURST 98', 'BURST 95 Taxonomy', 'BURST 98 Taxonomy', 'Bowtie2 95', 'Bowtie2 98', 'Kraken', 'Centrifuge']
# Select BURST capitalist

In [7]:
# Load up the ground truth taxatable
ground_truth_species = pd.read_csv("../results/simulations/taxatable.strain.species.normalized.txt", sep="\t", index_col=0)

In [8]:
head2tax = dict()
with open("../data/references/rep82/rep82.tax") as tax_inf:
    for line in tax_inf:
        line = line.rstrip().split('\t')
        head2tax[line[0]] = line[1]

In [9]:
# Map each simulated read title to its source accession and taxonomy
title2acc = dict()
title2tax = dict()
with open("../results/simulations/combined_seqs.fna") as fh:
    for line in fh:
        if line.startswith(">"):
            row = line.split()
            title = row[0][1:]
            accession = '_'.join(row[1].split('_')[:2])
            title2acc[title] = accession
            title2tax[title] = head2tax[accession]

In [10]:
def normalize_taxatable(df):
    # Features X Samples
    # Redistribute to median depth
    df = df.div(df.sum(axis=0).div(df.sum(axis=0).median()), axis=1).round().astype(int)
    df_temp = df.apply(lambda x: x/x.sum(), axis=0)
    df = df[df_temp.apply(lambda x: x.mean(), axis=1) >= .002]
    df[df == 0] = .65
    df = df.apply(lambda x: x/x.sum(), axis=0)
    data = df.T.values
    data = clr(data)
    return pd.DataFrame(data, index=df.columns, columns=df.index)

def subset_taxatable(df):
    # Redistribute to median depth
    df = df.div(df.sum(axis=0).div(df.sum(axis=0).median()), axis=1).round().astype(int)
    df_temp = df.apply(lambda x: x/x.sum(), axis=0)
    df = df[df_temp.apply(lambda x: x.mean(), axis=1) >= .002]
    return df

def normalize_two_taxatable(df_found, df_gold):
    # Features X Samples
    df_found = subset_taxatable(df_found)
    df_gold = subset_taxatable(df_gold)
    df_found = df_found.T
    # Keep only the samples that also appear in the gold standard
    df_found = df_found[[_ in df_gold.columns for _ in df_found.index]].T
    df_found.columns = [_ + "_found" for _ in df_found.columns]
    df_joined = df_gold.join(df_found)
    df_joined = df_joined.fillna(0)
    df_joined[df_joined == 0] = .65
    df_joined = df_joined.apply(lambda x: x/x.sum(), axis=0)
    data = df_joined.T.values
    data = clr(data)
    df_clr =  pd.DataFrame(data, index=df_joined.columns, columns=df_joined.index)
    return df_clr[['found' in _ for _ in df_clr.index]], df_clr[['found' not in _ for _ in df_clr.index]]
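
To make the CLR step concrete, here is a toy taxa table (invented counts) pushed through the same zero replacement, closure, and transform used above:

In [ ]:
# Toy illustration of the CLR transform above; counts are invented.
# Zeros get a 0.65 pseudocount, columns are closed to sum to 1,
# then clr() subtracts each sample's mean log-abundance.
_toy = pd.DataFrame({'sampleA': [10., 0., 90.], 'sampleB': [5., 5., 90.]},
                    index=['taxon1', 'taxon2', 'taxon3'])
_toy[_toy == 0] = .65
_toy = _toy.apply(lambda x: x/x.sum(), axis=0)
pd.DataFrame(clr(_toy.T.values), index=_toy.columns, columns=_toy.index)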

In [11]:
GOLD_SPECIES = normalize_taxatable(ground_truth_species)

In [ ]:
# Compute per-site accuracy metrics for the BURST alignments
def fetch_burst_results():
    # Columns = [aligner, depth, tp, fp, fn, precision, recall, f1, level, site, spearman, pearson, jaccard]
    # Load up the tp/fp
    for depth in DEPTHS:
        for alignment in ['95', '98']:
            file_str = "../results/simulations_all_{alignment}/combined_seqs.fna_{depth}_*/alignment.burst.b6".format(depth=depth, alignment=alignment)
            infiles = glob.glob(file_str)
            for infile in infiles:
                f1_results = defaultdict(lambda: defaultdict(int))
                with open(infile) as fh:
                    for line in fh:
                        row = line.rstrip().split("\t")
                        accession_found = row[1]
                        accession_actual = title2acc[row[0]]
                        group = row[0].split('_')[0]
                        if accession_found == accession_actual:
                            f1_results[group]['tp'] += 1
                        else:
                            f1_results[group]['fp'] += 1
                folder = '/'.join(infile.split('/')[:-1])
                df = pd.read_csv(folder + "/taxatable.burst.species.normalized.txt", sep="\t", index_col=0)
                df_norm, df_gold = normalize_two_taxatable(df, ground_truth_species)
                for group in f1_results.keys():
                    taxahit_found = set(df[df[group] > 5].index)
                    taxahit_actual = set(ground_truth_species[ground_truth_species[group] > 5].index)
                    jaccard = len(taxahit_found.intersection(taxahit_actual))/float(len(taxahit_found.union(taxahit_actual)))
                    tp = f1_results[group]['tp']
                    fp = f1_results[group]['fp']
                    # Unaligned reads count as false negatives; the per-site read count is taken as one third of the total at this depth
                    fn = int(DEPTHS_DICT[depth]/3.)-(tp+fp)
                    precision = tp/float(tp+fp)
                    recall = tp/float(tp+fn)
                    f1_score = 2*precision*recall/(precision+recall) if (precision+recall) > 0 else 0.  # harmonic mean, not arithmetic
                    level = 'species'
                    site = group
                    series_found = df_norm.T[group + "_found"]
                    series_actual = df_gold.T[group]
                    pearson = series_actual.corr(series_found, method='pearson')
                    spearman = series_actual.corr(series_found, method='spearman')
                    yield ['BURST ' + alignment, tp+fp+fn, tp, fp, fn, precision, recall, f1_score, level, site, spearman, pearson, jaccard]
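
A quick sanity check of the precision/recall/F1 arithmetic above, on invented counts:

In [ ]:
# Worked example of the metric arithmetic used above; numbers are invented.
_tp, _fp, _fn = 80, 20, 100
_p = _tp/float(_tp+_fp)   # 0.80
_r = _tp/float(_tp+_fn)   # ~0.44
2*_p*_r/(_p+_r)           # harmonic-mean F1, ~0.57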

In [ ]:
df_burst =  pd.DataFrame(fetch_burst_results(), columns=COLUMNS)
df_burst.head()

In [ ]:
df_burst.tail()

In [ ]:
# Compute per-site accuracy metrics for the BURST taxonomy (LCA) assignments
def fetch_bursttaxa_results():
    # Columns = [aligner, depth, tp, fp, fn, precision, recall, f1, level, site, spearman, pearson, jaccard]
    # Load up the tp/fp
    for depth in DEPTHS:
        for alignment in ['95', '98']:
            file_str = "../results/simulations_all_{alignment}/combined_seqs.fna_{depth}_*/alignment.burst.b6".format(depth=depth, alignment=alignment)
            infiles = glob.glob(file_str)
            for infile in infiles:
                f1_results = defaultdict(lambda: defaultdict(int))
                with open(infile) as fh:
                    for line in fh:
                        row = line.rstrip().split("\t")
                        taxa_found = row[-1]
                        if 's__' in taxa_found:
                            taxa_found = ';'.join(taxa_found.split(';')[:7])
                            taxa_actual = title2tax[row[0]]
                            group = row[0].split('_')[0]
                            if taxa_actual.startswith(taxa_found):
                                f1_results[group]['tp'] += 1
                            else:
                                f1_results[group]['fp'] += 1
                folder = '/'.join(infile.split('/')[:-1])
                df = pd.read_csv(folder + "/taxatable.burst.taxonomy.species.txt", sep="\t", index_col=0)
                df_norm, df_gold = normalize_two_taxatable(df, ground_truth_species)
                for group in f1_results.keys():
                    taxahit_found = set(df[df[group] > 5].index)
                    taxahit_actual = set(ground_truth_species[ground_truth_species[group] > 5].index)
                    jaccard = len(taxahit_found.intersection(taxahit_actual))/float(len(taxahit_found.union(taxahit_actual)))
                    tp = f1_results[group]['tp']
                    fp = f1_results[group]['fp']
                    fn = int(DEPTHS_DICT[depth]/3.)-(tp+fp)
                    precision = tp/float(tp+fp)
                    recall = tp/float(tp+fn)
                    f1_score = 2*precision*recall/(precision+recall) if (precision+recall) > 0 else 0.
                    level = 'species'
                    site = group
                    series_found = df_norm.T[group + "_found"]
                    series_actual = df_gold.T[group]
                    pearson = series_actual.corr(series_found, method='pearson')
                    spearman = series_actual.corr(series_found, method='spearman')
                    yield ['BURST ' + alignment + " Taxonomy", tp+fp+fn, tp, fp, fn, precision, recall, f1_score, level, site, spearman, pearson, jaccard]

In [ ]:
df_burst_taxa =  pd.DataFrame(fetch_bursttaxa_results(), columns=COLUMNS)
df_burst_taxa.head()

In [ ]:
df_burst_taxa.tail()

In [ ]:
# Map taxonomy string -> accession from the tab-delimited rep82.tax
strain2accession = {}
with open("../data/references/rep82/rep82.tax") as inf:
    for line in inf:
        row = line.rstrip().split('\t')
        strain2accession[row[1]] = row[0]

In [ ]:
# Compute per-site accuracy metrics for UTree
def fetch_utree_results():
    # Columns = [aligner, depth, tp, fp, fn, precision, recall, f1, level, site, spearman, pearson, jaccard]
    # Load up the tp/fp
    for depth in DEPTHS:
        for alignment in ['95']:
            file_str = "../results/simulations_all_{alignment}/combined_seqs.fna_{depth}_*/alignment.utree.tsv".format(depth=depth, alignment=alignment)
            infiles = glob.glob(file_str)
            for infile in infiles:
                f1_results = defaultdict(lambda: defaultdict(int))
                with open(infile) as fh:
                    for line in fh:
                        row = line.rstrip().split("\t")
                        if row[1] in strain2accession:
                            accession_found = strain2accession[row[1]]
                            accession_actual = title2acc[row[0]]
                            group = row[0].split('_')[0]
                            if accession_found == accession_actual:
                                f1_results[group]['tp'] += 1
                            else:
                                f1_results[group]['fp'] += 1
                folder = '/'.join(infile.split('/')[:-1])
                df = pd.read_csv(folder + "/taxatable.utree.species.normalized.txt", sep="\t", index_col=0)
                df_norm, df_gold = normalize_two_taxatable(df, ground_truth_species)
                for group in f1_results.keys():
                    taxahit_found = set(df[df[group] > 5].index)
                    taxahit_actual = set(ground_truth_species[ground_truth_species[group] > 5].index)
                    jaccard = len(taxahit_found.intersection(taxahit_actual))/float(len(taxahit_found.union(taxahit_actual)))
                    tp = f1_results[group]['tp']
                    fp = f1_results[group]['fp']
                    fn = int(DEPTHS_DICT[depth]/3.)-(tp+fp)
                    precision = tp/float(tp+fp)
                    recall = tp/float(tp+fn)
                    f1_score = 2*precision*recall/(precision+recall) if (precision+recall) > 0 else 0.
                    level = 'species'
                    site = group
                    series_found = df_norm.T[group + "_found"]
                    series_actual = df_gold.T[group]
                    pearson = series_actual.corr(series_found, method='pearson')
                    spearman = series_actual.corr(series_found, method='spearman')
                    yield ['UTree', tp+fp+fn, tp, fp, fn, precision, recall, f1_score, level, site, spearman, pearson, jaccard]

In [ ]:
df_utree =  pd.DataFrame(fetch_utree_results(), columns=COLUMNS)
df_utree.head()

In [ ]:
df_utree.tail()

In [ ]:
from shogun.redistribute import Taxonomy
from shogun.utils import build_lca_map
from shogun.parsers import yield_alignments_from_sam_inf

tree = Taxonomy("../data/references/rep82/rep82.tax")

In [ ]:
# Compute per-site accuracy metrics for Bowtie2 via SHOGUN's LCA map
def fetch_bowtie2_results():
    # Columns = [aligner, depth, tp, fp, fn, precision, recall, f1, level, site, spearman, pearson, jaccard]
    # Load up the tp/fp
    for depth in DEPTHS:
        for alignment in ['95', '98']:
            file_str = "../results/simulations_all_{alignment}/combined_seqs.fna_{depth}_*/alignment.bowtie2.sam".format(depth=depth, alignment=alignment)
            infiles = glob.glob(file_str)
            for infile in infiles:
                f1_results = defaultdict(lambda: defaultdict(int))
                align_gen = yield_alignments_from_sam_inf(infile)
                lca_map = build_lca_map(align_gen, tree)
                for key, value in lca_map.items():
                    if value:
                        if 's__' in value:
                            taxa_found = ';'.join(value.split(';')[:7])
                            taxa_actual = title2tax[key]
                            group = key.split('_')[0]
                            if taxa_actual.startswith(taxa_found):
                                f1_results[group]['tp'] += 1
                            else:
                                f1_results[group]['fp'] += 1
                folder = '/'.join(infile.split('/')[:-1])
                df = pd.read_csv(folder + "/taxatable.bowtie2.species.normalized.txt", sep="\t", index_col=0)
                df_norm, df_gold = normalize_two_taxatable(df, ground_truth_species)
                for group in f1_results.keys():
                    taxahit_found = set(df[df[group] > 5].index)
                    taxahit_actual = set(ground_truth_species[ground_truth_species[group] > 5].index)
                    jaccard = len(taxahit_found.intersection(taxahit_actual))/float(len(taxahit_found.union(taxahit_actual)))
                    tp = f1_results[group]['tp']
                    fp = f1_results[group]['fp']
                    fn = int(DEPTHS_DICT[depth]/3.)-(tp+fp)
                    precision = tp/float(tp+fp)
                    recall = tp/float(tp+fn)
                    f1_score = 2*precision*recall/(precision+recall) if (precision+recall) > 0 else 0.
                    level = 'species'
                    site = group
                    series_found = df_norm.T[group + "_found"]
                    series_actual = df_gold.T[group]
                    pearson = series_actual.corr(series_found, method='pearson')
                    spearman = series_actual.corr(series_found, method='spearman')
                    yield ['Bowtie2 ' + alignment, tp+fp+fn, tp, fp, fn, precision, recall, f1_score, level, site, spearman, pearson, jaccard]

In [ ]:
df_bowtie2 =  pd.DataFrame(fetch_bowtie2_results(), columns=COLUMNS)
df_bowtie2.head()

In [ ]:
df_bowtie2.tail()

In [ ]:
df_all = pd.concat([df_burst, df_burst_taxa, df_utree, df_bowtie2])
df_all['depth_log10'] = np.round(np.log10(df_all['depth']))
df_all.to_csv("../figures/simulations_all.txt")

In [ ]:
# Compare the aligners' recall across sequencing depths
# y-axis: Recall
# x-axis: sequencing depth (log10)
g = sns.factorplot(x="depth_log10", y="recall", hue="aligner", col="site", data=df_all, palette=COLORS)
(g.set_ylabels("Recall").set_xlabels("Depth (log10)"))
pltname = "recall"
save_plot(g, pltname)

In [ ]:
# Compare the aligners' abundance estimates against the ground truth
# y-axis: Spearman correlation
# x-axis: sequencing depth (log10)
g = sns.factorplot(x="depth_log10", y="spearman", hue="aligner", col="site", data=df_all, palette=COLORS)
(g.set_ylabels("Spearman").set_xlabels("Depth (log10)"))
pltname = "spearman"
save_plot(g, pltname)

In [ ]:
df_all['aligner'].unique()

In [ ]:
# Compare the overlap of detected taxa with the ground truth
# y-axis: Jaccard index
# x-axis: sequencing depth (log10)
g = sns.factorplot(x="depth_log10", y="jaccard", hue="aligner", col="site", data=df_all, palette=COLORS)
(g.set_ylabels("Jaccard").set_xlabels("Depth (log10)"))
pltname = "jaccard"
save_plot(g, pltname)

In [ ]:
# Compare the aligners' F1 scores across sequencing depths
# y-axis: F1
# x-axis: sequencing depth (log10)
g = sns.factorplot(x="depth_log10", y="f1", hue="aligner", col="site", data=df_all, palette=COLORS)
(g.set_ylabels("F1 Micro").set_xlabels("Depth (log10)"))
pltname = "f1"
save_plot(g, pltname)

In [ ]:
# Compare the aligners' precision across sequencing depths
# y-axis: Precision
# x-axis: sequencing depth (log10)
g = sns.factorplot(x="depth_log10", y="precision", hue="aligner", col="site", data=df_all, palette=COLORS)
(g.set_ylabels("Precision").set_xlabels("Depth (log10)"))
pltname = "precision"
save_plot(g, pltname)

In [ ]:
# Compute per-site accuracy metrics for Centrifuge
def fetch_centrifuge_results():
    # Columns = [aligner, depth, tp, fp, fn, precision, recall, f1, level, site, spearman, pearson, jaccard]
    # Load up the tp/fp
    for depth in DEPTHS:
        for alignment in ['95']:
            file_str = "../results/simulations_all_{alignment}/combined_seqs.fna_{depth}_*/centrifuge.classification.txt".format(depth=depth, alignment=alignment)
            infiles = glob.glob(file_str)
            for infile in infiles:
                f1_results = defaultdict(lambda: defaultdict(int))
                with open(infile) as inf:
                    next(inf)
                    for line in inf:
                        row = line.rstrip().split("\t")
                        if row[-1] == "1":  # keep only uniquely assigned reads
                            accession_found = row[1]
                            accession_actual = title2acc[row[0]]
                            group = row[0].split('_')[0]
                            if accession_found == accession_actual:
                                f1_results[group]['tp'] += 1
                            else:
                                f1_results[group]['fp'] += 1
                for group in f1_results.keys():
                    tp = f1_results[group]['tp']
                    fp = f1_results[group]['fp']
                    fn = int(DEPTHS_DICT[depth]/3.)-(tp+fp)
                    precision = tp/float(tp+fp)
                    recall = tp/float(tp+fn)
                    f1_score = 2*precision*recall/(precision+recall) if (precision+recall) > 0 else 0.
                    level = 'species'
                    site = group
                    # No Centrifuge taxatable is parsed here, so spearman/pearson/jaccard are left at 0
                    yield ['Centrifuge', tp+fp+fn, tp, fp, fn, precision, recall, f1_score, level, site, 0, 0, 0]

In [ ]:
df_centrifuge = pd.DataFrame(fetch_centrifuge_results(), columns=COLUMNS)
df_centrifuge.head()

In [ ]:
# Map Centrifuge taxIDs back to reference accessions
taxid2accession = dict()
with open("../results/indices/centrifuge_rep82.map") as fh:
    for line in fh:
        row = line.rstrip().split('\t')
        taxid2accession[row[1]] = row[0]

In [ ]:
# Compute per-site accuracy metrics for Kraken
def fetch_kraken_results():
    # Columns = [aligner, depth, tp, fp, fn, precision, recall, f1, level, site, spearman, pearson, jaccard]
    # Load up the tp/fp
    for depth in DEPTHS:
        for alignment in ['95']:
            file_str = "../results/simulations_all_{alignment}/combined_seqs.fna_{depth}_*/kraken.report.txt".format(depth=depth, alignment=alignment)
            infiles = glob.glob(file_str)
            for infile in infiles:
                f1_results = defaultdict(lambda: defaultdict(int))
                with open(infile) as inf:
                    for line in inf:
                        row = line.rstrip().split("\t")
                        if row[2] in taxid2accession and row[0] == "C":  # classified reads only
                            accession_found = taxid2accession[row[2]]
                            accession_actual = title2acc[row[1]]
                            group = row[1].split('_')[0]
                            if accession_found == accession_actual:
                                f1_results[group]['tp'] += 1
                            else:
                                f1_results[group]['fp'] += 1
                for group in f1_results.keys():
                    tp = f1_results[group]['tp']
                    fp = f1_results[group]['fp']
                    fn = int(DEPTHS_DICT[depth]/3.)-(tp+fp)
                    precision = tp/float(tp+fp)
                    recall = tp/float(tp+fn)
                    f1_score = 2*precision*recall/(precision+recall) if (precision+recall) > 0 else 0.
                    level = 'species'
                    site = group
                    # No Kraken taxatable is parsed here, so spearman/pearson/jaccard are left at 0
                    yield ['Kraken', tp+fp+fn, tp, fp, fn, precision, recall, f1_score, level, site, 0, 0, 0]

In [ ]:
df_kraken = pd.DataFrame(fetch_kraken_results(), columns=COLUMNS)
df_kraken.head()

In [ ]:
df_all = pd.concat([df_burst, df_burst_taxa, df_utree, df_bowtie2, df_kraken, df_centrifuge])
df_all['depth_log10'] = np.round(np.log10(df_all['depth']))
df_all.to_csv("../figures/simulations_kraken_centrifuge.txt")

In [ ]:
df_kraken.head()

In [ ]:
# F1 across sequencing depths, now including Kraken and Centrifuge
# y-axis: F1
# x-axis: sequencing depth (log10)
g = sns.factorplot(x="depth_log10", y="f1", hue="aligner", col="site", data=df_all, palette=COLORS)
(g.set_ylabels("F1 Micro").set_xlabels("Depth (log10)"))
pltname = "f1_kraken_centrifuge"
save_plot(g, pltname)

In [ ]:
# Precision across sequencing depths, now including Kraken and Centrifuge
# y-axis: Precision
# x-axis: sequencing depth (log10)
g = sns.factorplot(x="depth_log10", y="precision", hue="aligner", col="site", data=df_all, palette=COLORS)
(g.set_ylabels("Precision").set_xlabels("Depth (log10)"))
pltname = "precision_kraken_centrifuge"
save_plot(g, pltname)

In [ ]:
# Recall across sequencing depths, now including Kraken and Centrifuge
# y-axis: Recall
# x-axis: sequencing depth (log10)
g = sns.factorplot(x="depth_log10", y="recall", hue="aligner", col="site", data=df_all, palette=COLORS)
(g.set_ylabels("Recall").set_xlabels("Depth (log10)"))
pltname = "recall_kraken_centrifuge"
save_plot(g, pltname)