In [1]:
import numpy as np
import math
import csv
import glob
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("ticks")
COLORS=dict(zip(["burst", "utree", "centrifuge", "kraken", "bowtie2", 'burst-capitalist', 'burst-lca', 3, 4, 5], sns.color_palette("colorblind", 10)))
sns.palplot(sns.color_palette("colorblind", 10))

%matplotlib inline

In [2]:
def save_plot(fig, pltname, artists=()):
    fig.savefig(os.path.join("..", "figures", "single_strains_shogun_" + pltname + ".png"), dpi=300, bbox_extra_artists=artists, bbox_inches='tight')

def remove_plasmids(df):
    level = [_ if not "Plasmid" in _ else _.replace("Plasmid", "") for _ in df.index]
    return df.groupby(level).sum(axis=0)

In [3]:
# load up the files
bowtie2_species_files = glob.glob("../results/single_strains_shogun/**/taxatable.bowtie2.species.txt")
utree_species_files = glob.glob("../results/single_strains_shogun/**/taxatable.utree.species.txt")
burst_species_files = glob.glob("../results/single_strains_shogun/**/taxatable.burst.species.txt")
burst_taxonomy_species_files = glob.glob("../results/single_strains_shogun/**/taxatable.burst.taxonomy.species.txt")

In [4]:
# tax = pd.read_csv("../results/single_strains_shogun/ecoli_001/taxatable.burst.taxonomy.txt", sep="\t", index_col=0)
# tax_redis = pd.read_csv("../results/single_strains_shogun/ecoli_001/burst_taxatable.taxonomy.species.txt", sep="\t", index_col=0)
# cap = pd.read_csv("../results/single_strains_shogun/ecoli_001/taxatable.burst.capitalist.txt", sep="\t", index_col=0)

In [5]:
# tax

In [6]:
# tax_redis

In [7]:
# cap

In [ ]:


In [8]:
# Get the fasta depths of each
strains = ['ecoli', 'kpneumoniae', 'saureus']
strain_depths = {}

for strain in strains:
    files = glob.glob("../results/single_strains_shogun/**/count.seqs.txt")
    for file in files:
        sequence_depth = !grep "^>" {file} | wc -l
        name = file.split('/')[-2]
#         print(name)
        strain_depths[name] =  int(sequence_depth[0])

# ecoli_depth = !wc -l ../data/single_strain/ecoli_analysis/ecoli_combined_seqs/combined_seqs_fulldepth.fna
# kpneumo_depth = !wc -l ../data/single_strain/kpneumoniae_analysis/kpneumo_combined_seqs/combined_seqs_fulldepth.fna
# saureus_depth = !wc -l ../data/single_strain/saureus_analysis/saureus_combined_seqs/combined_seqs_fulldepth.fna

In [9]:
species_tp = {}
species_tp['saureus'] = 'k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Staphylococcaceae;g__Staphylococcus;s__Staphylococcus_aureus'
species_tp['kpneumoniae'] = 'k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Klebsiella;s__Klebsiella_pneumoniae'
species_tp['ecoli'] = 'k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia_coli'

columns = ["strain", "fasta_depth", "log_fasta_depth", "taxa_hits", "tp", "fp", "aligner", "hit_rate", "precision", "recall"]
rows = []
for aligner, filelist in zip(("bowtie2", "utree", "burst-capitalist", "burst-lca"), (bowtie2_species_files, utree_species_files, burst_species_files, burst_taxonomy_species_files)):
    for file in filelist:
        strain = file.split('/')[-2].split('_')[0]
        name = file.split('/')[-2]
        fasta_depth = strain_depths[name]
#         if depth == 'fulldepth':
#             fasta_depth = int(strain_depths["{strain}_fulldepth".format(strain=strain)])
#         else:
#             fasta_depth = int((10**-len(depth))*strain_depths["{strain}_fulldepth".format(strain=strain)])
#         fasta_depth = strain_depths["{strain}_{depth}".format(strain=strain, depth=depth)]l
        df_tmp = pd.read_csv(file, sep="\t", index_col=0)
        if df_tmp.shape[1] > 1:
            df_2 = pd.DataFrame(df_tmp.sum(axis=1), index=df_tmp.index)
            df_tmp = df_2
        df_tmp = remove_plasmids(df_tmp)
        taxa_hits = df_tmp.sum()[0]
        correct_hit = species_tp[strain]
        tp = df_tmp.T[correct_hit].values[0]
        fp = taxa_hits - tp
        rows.append([strain, fasta_depth, int(np.log10(fasta_depth)), taxa_hits, tp, fp, aligner, taxa_hits/fasta_depth, tp/taxa_hits, tp/fasta_depth])

In [10]:
# # coverage_files = glob.glob("../results/single_strains_shogun/**/coverage.species.txt")

# # for file in coverage_files:
# #     df = pd.read_csv(file, sep='\t', header=0, index_col=0)
# #     df['normalized_hits'] = df['hits_in_clade']*(df['unique_counts_of_clade']/df['median_genome_size'])
# #     df['relative_abundance'] = df['normalized_hits']/(df['normalized_hits'].sum())
# #     names = df[df['relative_abundance']*df['percent_of_genome_covered'] > .0001]
# #     print(names)
# #     break

# species_tp = {}
# species_tp['saureus'] = 'k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Staphylococcaceae;g__Staphylococcus;s__Staphylococcus_aureus'
# species_tp['kpneumoniae'] = 'k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Klebsiella;s__Klebsiella_pneumoniae'
# species_tp['ecoli'] = 'k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia_coli'

# columns = ["strain", "fasta_depth", "log_fasta_depth", "taxa_hits", "tp", "fp", "aligner", "hit_rate", "precision", "recall"]
# rows = []
# for aligner, filelist in zip(("bowtie2", "utree", "burst-capitalist", "burst-lca"), (bowtie2_species_files, utree_species_files, burst_species_files, burst_taxonomy_species_files)):
#     for file in filelist:
#         strain = file.split('/')[-2].split('_')[0]
#         name = file.split('/')[-2]
#         fasta_depth = strain_depths[name]
# #         if depth == 'fulldepth':
# #             fasta_depth = int(strain_depths["{strain}_fulldepth".format(strain=strain)])
# #         else:
# #             fasta_depth = int((10**-len(depth))*strain_depths["{strain}_fulldepth".format(strain=strain)])
# #         fasta_depth = strain_depths["{strain}_{depth}".format(strain=strain, depth=depth)]l
#         df_tmp = pd.read_csv(file, sep="\t", index_col=0)
#         if df_tmp.shape[1] > 1:
#             df_2 = pd.DataFrame(df_tmp.sum(axis=1), index=df_tmp.index)
#             df_tmp = df_2
#         splits_file = file.split("/")
#         splits_file[-1] = "coverage.species.txt"
#         df = pd.read_csv("/".join(splits_file), sep='\t', header=0, index_col=0)
#         df['normalized_hits'] = df['hits_in_clade']
#         df['relative_abundance'] = df['normalized_hits']/(df['normalized_hits'].sum())
#         names = df[df['relative_abundance']*df['percent_of_genome_covered'] > .0001].index
#         df_tmp = df_tmp.loc[names]
# #         df_tmp = remove_plasmids(df_tmp)
#         taxa_hits = df_tmp.sum()[0]
#         correct_hit = species_tp[strain]
#         tp = df_tmp.T[correct_hit].values[0]
#         fp = taxa_hits - tp
#         rows.append([strain, fasta_depth, int(np.log10(fasta_depth)), taxa_hits, tp, fp, aligner, taxa_hits/fasta_depth, tp/taxa_hits, tp/fasta_depth])

In [11]:
df_summary = pd.DataFrame(rows, columns=columns)
# df_summary["log_fasta_depth"] = np.floor(np.log10(df_summary["fasta_depth"]))
# df_summary['log_fasta_depth']
df_summary.to_csv("../figures/single_strains_shogun_table.txt", sep="\t")
df_summary


Out[11]:
strain fasta_depth log_fasta_depth taxa_hits tp fp aligner hit_rate precision recall
0 kpneumoniae 316642 5 65786 48370 17416 bowtie2 0.207761 0.735263 0.152759
1 ecoli 91 1 89 89 0 bowtie2 0.978022 1.000000 0.978022
2 kpneumoniae 3199 3 721 548 173 bowtie2 0.225383 0.760055 0.171304
3 ecoli 105 2 104 104 0 bowtie2 0.990476 1.000000 0.990476
4 saureus 265531 5 187169 177358 9811 bowtie2 0.704886 0.947582 0.667937
5 kpneumoniae 3102 3 644 482 162 bowtie2 0.207608 0.748447 0.155384
6 ecoli 9109 3 8904 8892 12 bowtie2 0.977495 0.998652 0.976177
7 kpneumoniae 315638 5 65637 48388 17249 bowtie2 0.207950 0.737206 0.153302
8 ecoli 91918 4 89910 89636 274 bowtie2 0.978154 0.996953 0.975174
9 ecoli 91859 4 89809 89451 358 bowtie2 0.977683 0.996014 0.973786
10 kpneumoniae 316 2 67 57 10 bowtie2 0.212025 0.850746 0.180380
11 ecoli 9241 3 9043 8972 71 bowtie2 0.978574 0.992149 0.970891
12 saureus 282 2 192 180 12 bowtie2 0.680851 0.937500 0.638298
13 kpneumoniae 31771 4 6433 4756 1677 bowtie2 0.202480 0.739313 0.149696
14 saureus 26657 4 18901 17860 1041 bowtie2 0.709045 0.944924 0.669993
15 saureus 2651 3 1877 1756 121 bowtie2 0.708035 0.935535 0.662392
16 kpneumoniae 3138 3 660 506 154 bowtie2 0.210325 0.766667 0.161249
17 saureus 26378 4 18602 17579 1023 bowtie2 0.705209 0.945006 0.666427
18 ecoli 9206 3 9008 8992 16 bowtie2 0.978492 0.998224 0.976754
19 saureus 26621 4 18710 17694 1016 bowtie2 0.702829 0.945697 0.664663
20 ecoli 923 2 909 906 3 bowtie2 0.984832 0.996700 0.981582
21 ecoli 9111 3 8882 8861 21 bowtie2 0.974866 0.997636 0.972561
22 kpneumoniae 315865 5 65773 48330 17443 bowtie2 0.208231 0.734800 0.153008
23 ecoli 909 2 884 882 2 bowtie2 0.972497 0.997738 0.970297
24 ecoli 9158 3 8948 8931 17 bowtie2 0.977069 0.998100 0.975213
25 kpneumoniae 3160251 6 657772 484453 173319 bowtie2 0.208139 0.736506 0.153296
26 kpneumoniae 3043 3 637 486 151 bowtie2 0.209333 0.762951 0.159711
27 ecoli 91786 4 89725 89347 378 bowtie2 0.977546 0.995787 0.973427
28 saureus 269 2 190 179 11 bowtie2 0.706320 0.942105 0.665428
29 saureus 2551 3 1804 1704 100 bowtie2 0.707174 0.944568 0.667973
... ... ... ... ... ... ... ... ... ... ...
222 kpneumoniae 31613 4 6750 4999 1751 burst-lca 0.213520 0.740593 0.158131
223 saureus 266053 5 191138 177575 13563 burst-lca 0.718421 0.929041 0.667442
224 saureus 265094 5 190090 176504 13586 burst-lca 0.717066 0.928529 0.665817
225 ecoli 76 1 76 76 0 burst-lca 1.000000 1.000000 1.000000
226 ecoli 86 1 86 86 0 burst-lca 1.000000 1.000000 1.000000
227 saureus 266112 5 191322 177797 13525 burst-lca 0.718953 0.929308 0.668128
228 saureus 26760 4 19211 17820 1391 burst-lca 0.717900 0.927594 0.665919
229 saureus 2656 3 1918 1799 119 burst-lca 0.722139 0.937956 0.677334
230 kpneumoniae 31410 4 6951 5144 1807 burst-lca 0.221299 0.740037 0.163770
231 kpneumoniae 327 2 74 63 11 burst-lca 0.226300 0.851351 0.192661
232 ecoli 92073 4 90623 89051 1572 burst-lca 0.984252 0.982653 0.967178
233 kpneumoniae 3187 3 671 502 169 burst-lca 0.210543 0.748137 0.157515
234 ecoli 951 2 936 916 20 burst-lca 0.984227 0.978632 0.963197
235 saureus 250 2 175 162 13 burst-lca 0.700000 0.925714 0.648000
236 kpneumoniae 303 2 61 46 15 burst-lca 0.201320 0.754098 0.151815
237 ecoli 918660 5 903675 887571 16104 burst-lca 0.983688 0.982179 0.966158
238 kpneumoniae 31553 4 6744 5098 1646 burst-lca 0.213736 0.755931 0.161569
239 saureus 266266 5 191063 177495 13568 burst-lca 0.717564 0.928987 0.666608
240 saureus 26555 4 19052 17721 1331 burst-lca 0.717454 0.930139 0.667332
241 ecoli 84 1 83 83 0 burst-lca 0.988095 1.000000 0.988095
242 saureus 301 2 218 209 9 burst-lca 0.724252 0.958716 0.694352
243 kpneumoniae 31846 4 6939 5125 1814 burst-lca 0.217892 0.738579 0.160931
244 kpneumoniae 312 2 64 47 17 burst-lca 0.205128 0.734375 0.150641
245 kpneumoniae 329 2 83 63 20 burst-lca 0.252280 0.759036 0.191489
246 ecoli 952 2 938 928 10 burst-lca 0.985294 0.989339 0.974790
247 saureus 271 2 199 182 17 burst-lca 0.734317 0.914573 0.671587
248 ecoli 91939 4 90431 88874 1557 burst-lca 0.983598 0.982782 0.966663
249 kpneumoniae 315975 5 67531 49968 17563 burst-lca 0.213723 0.739927 0.158139
250 saureus 2659611 6 1911852 1775319 136533 burst-lca 0.718846 0.928586 0.667511
251 saureus 2576 3 1893 1789 104 burst-lca 0.734860 0.945061 0.694488

252 rows × 10 columns


In [18]:
# We are interested in the speed up and efficiency of each of the aligners
# y-axis: hit rate
# x-axis: sequencing depth

for strain in strains:
    fig, ax = plt.subplots()
    g = sns.pointplot(x="log_fasta_depth", y="hit_rate", hue="aligner", data=df_summary[df_summary['strain'] == strain], palette=COLORS, alpha=.6)

    plt.title("Hit Rate {strain}".format(strain=strain))
    plt.ylabel("Hit Rate")
    plt.xlabel("Sequencing Depth (log10)")
    g.set(xticks=np.arange(5), ylim=(0.,1.05))
    pltname = "hit_rate_{strain}".format(strain=strain)
    save_plot(fig, pltname)



In [13]:
# We are interested in the speed up and efficiency of each of the aligners
# y-axis: Recall
# x-axis: N (workers)
g = sns.factorplot(x="log_fasta_depth", y="recall", hue="aligner", col="strain", data=df_summary, palette=COLORS)
(g.set_ylabels("Recall").set_xlabels("Depth (log10)"))
pltname = "recall"
save_plot(g, pltname)



In [14]:
# We are interested in the speed up and efficiency of each of the aligners
# y-axis: Precision
# x-axis: N (workers)

g = sns.factorplot(x="log_fasta_depth", y="precision", hue="aligner", col="strain", data=df_summary, sharex=True, palette=COLORS)
(g.set_ylabels("Precision").set_xlabels("Depth (log10)"))
pltname = "precision"
save_plot(g, pltname)



In [15]:
# We are interested in the speed up and efficiency of each of the aligners
# y-axis: hit rate
# x-axis: N (workers)

g = sns.factorplot(x="log_fasta_depth", y="hit_rate", hue="aligner", col="strain", data=df_summary, sharex=True, palette=COLORS)
(g.set_ylabels("Hit Rate").set_xlabels("Depth (log10)"))
pltname = "hit_rate"
save_plot(g, pltname)



In [16]:
# We are interested in the speed up and efficiency of each of the aligners
# y-axis: True Positives
# x-axis: N (workers)

df_summary['log10_tp'] = np.log10(df_summary["tp"] + 1)

g = sns.factorplot(x="log_fasta_depth", y="log10_tp", hue="aligner", col="strain", data=df_summary, sharex=True, palette=COLORS)
(g.set_ylabels("True Positives (log10)").set_xlabels("Depth (log10)"))
pltname = "log10_tp"
save_plot(g, pltname)



In [17]:
# We are interested in the speed up and efficiency of each of the aligners
# y-axis: False Positives
# x-axis: N (workers)

df_summary['log10_fp'] = np.log10(df_summary["fp"] + 1)

g = sns.factorplot(x="log_fasta_depth", y="log10_fp", hue="aligner", col="strain", data=df_summary, sharex=True, palette=COLORS)
(g.set_ylabels("False Positives (log10)").set_xlabels("Depth (log10)"))
pltname = "log10_fp"
save_plot(g, pltname)



In [ ]:


In [ ]: