In [1]:
import numpy as np
import math
import csv
import glob
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from itertools import cycle
from skbio.stats.composition import clr
from scipy.spatial.distance import dice
from sklearn import svm
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from numpy import interp  # scipy.interp was an alias of numpy.interp and has since been removed
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics.pairwise import pairwise_distances
sns.set_style("ticks")
sns.palplot(sns.color_palette("colorblind", 10))
%matplotlib inline
In [2]:
COLORS = dict(zip(['BURST 95', 'BURST 98', 'BURST 95 Taxonomy', 'BURST 98 Taxonomy',
                   'UTree', 'Bowtie2 95', 'Bowtie2 98', "Kraken", "Centrifuge"], sns.color_palette("husl", 9)))
DEPTHS = ['0001', '001', '01', '1', 'fulldepth']
COLUMNS = ['aligner', 'depth', 'tp', 'fp', 'fn', 'precision', 'recall', 'f1', 'level', 'site', 'spearman', 'pearson', 'jaccard']
# DEPTHS_DICT is populated below from the per-sample count.seqs.txt files
CLR = True
# CLASSIFIER = lambda: OneVsRestClassifier(RandomForestClassifier(n_estimators=12))
CLASSIFIER = lambda: OneVsRestClassifier(svm.SVC(kernel='linear', probability=True, random_state=1))  # fixed integer seed for reproducibility
CLASSES = ["tongue_dorsum", "stool", "supragingival_plaque", "right_retroauricular_crease",
           "left_retroauricular_crease", "subgingival_plaque"]
CLASSES_MAP = dict(zip(CLASSES, ("oral", "stool", "oral", "skin", "skin", "oral")))
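CLASSIFIER, CLASSES, and CLASSES_MAP feed the body-site classification later in the analysis; a minimal sketch of the intended one-vs-rest ROC usage, with random placeholder features standing in for the CLR-normalized profiles:
In [ ]:
# Sketch only: the random X/y here are placeholders, not the notebook's data.
rs = np.random.RandomState(0)
X = rs.normal(size=(60, 20))
y = np.array(CLASSES * 10)  # ten samples per body site
y_bin = label_binarize(y, classes=CLASSES)
X_train, X_test, y_train, y_test = train_test_split(X, y_bin, stratify=y, random_state=0)
clf = CLASSIFIER().fit(X_train, y_train)
scores = clf.decision_function(X_test)
fpr, tpr, _ = roc_curve(y_test[:, 0], scores[:, 0])  # ROC for the first class
print("AUC (%s): %.2f" % (CLASSES[0], auc(fpr, tpr)))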
In [3]:
def save_plot(fig, pltname, artists=()):
    fig.savefig(os.path.join("..", "figures", "simulations_" + pltname + ".png"), dpi=300, bbox_extra_artists=artists, bbox_inches='tight')
In [4]:
# Sum the number of simulated reads (FASTA depth) at each sequencing depth
from collections import defaultdict
DEPTHS_DICT = defaultdict(int)
files = glob.glob("../results/simulations_all_95/*1/count.seqs.txt")
for file in files:
    sequence_depth = !cat {file}
    depth = file.split('/')[-2].split('_')[-2]
    DEPTHS_DICT[depth] += int(sequence_depth[0])
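The `!cat {file}` line above relies on IPython shell magic; a plain-Python equivalent for running this step outside a notebook:
In [ ]:
# Plain-Python equivalent of the `!cat {file}` shell magic above.
DEPTHS_DICT = defaultdict(int)
for file in glob.glob("../results/simulations_all_95/*1/count.seqs.txt"):
    depth = file.split('/')[-2].split('_')[-2]
    with open(file) as fh:
        DEPTHS_DICT[depth] += int(fh.readline().strip())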
In [5]:
DEPTHS_DICT
Out[5]:
In [6]:
# The list of taxonomic profilers under comparison
taxonomic_profilers = ['UTree', 'BURST 98', 'BURST 98 Taxonomy', 'Bowtie2 98', 'Kraken', 'Centrifuge', 'BURST 95', 'BURST 95 Taxonomy', 'Bowtie2 95']
# BURST is run in CAPITALIST mode
In [7]:
# Load up the ground truth taxatable
ground_truth_species = pd.read_csv("../results/simulations/taxatable.strain.species.normalized.txt", sep="\t", index_col=0)
In [8]:
head2tax = dict()
with open("../data/references/rep82/rep82.tax") as tax_inf:
    for line in tax_inf:
        line = line.rstrip().split('\t')
        head2tax[line[0]] = line[1]
In [9]:
# Map each simulated read title to its source accession and taxonomy
title2acc = dict()
title2tax = dict()
with open("../results/simulations/combined_seqs.fna") as fh:
    for line in fh:
        if line.startswith(">"):
            row = line.split()
            title = row[0][1:]
            accession = '_'.join(row[1].split('_')[:2])
            title2acc[title] = accession
            title2tax[title] = head2tax[accession]
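A quick sanity check of that header parsing, on a made-up record that mimics the assumed `>title accession_…` layout:
In [ ]:
# Made-up header, not a real record; shows what the parse extracts.
row = ">stool_read1 NC_000913_3_simulated".split()
print(row[0][1:])                        # stool_read1
print('_'.join(row[1].split('_')[:2]))   # NC_000913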
In [10]:
def normalize_taxatable(df):
    # df is Features x Samples
    # Rescale every sample to the median sequencing depth
    df = df.div(df.sum(axis=0).div(df.sum(axis=0).median()), axis=1).round().astype(int)
    # Drop features whose mean relative abundance is below 0.2%
    df_temp = df.apply(lambda x: x/x.sum(), axis=0)
    df = df[df_temp.apply(lambda x: x.mean(), axis=1) >= .002]
    # Multiplicative pseudocount so the log inside CLR is defined
    df[df == 0] = .65
    df = df.apply(lambda x: x/x.sum(), axis=0)
    data = clr(df.T.values)
    return pd.DataFrame(data, index=df.columns, columns=df.index)

def subset_taxatable(df):
    # Rescale every sample to the median sequencing depth, then filter rare features
    df = df.div(df.sum(axis=0).div(df.sum(axis=0).median()), axis=1).round().astype(int)
    df_temp = df.apply(lambda x: x/x.sum(), axis=0)
    df = df[df_temp.apply(lambda x: x.mean(), axis=1) >= .002]
    return df

def normalize_two_taxatable(df_found, df_gold):
    # Both tables are Features x Samples; CLR-transform them jointly so they share a geometry
    df_found = subset_taxatable(df_found)
    df_gold = subset_taxatable(df_gold)
    # Keep only the samples that also appear in the gold standard
    df_found = df_found.T
    df_found = df_found[[_ in df_gold.columns for _ in df_found.index]].T
    df_found.columns = [_ + "_found" for _ in df_found.columns]
    # df_found['group'] = ["found" for _ in range(df_found.shape[0])]
    # df_gold['group'] = ["gold" for _ in range(df_gold.shape[0])]
    df_joined = df_gold.join(df_found)
    df_joined = df_joined.fillna(0)
    df_joined[df_joined == 0] = .65
    df_joined = df_joined.apply(lambda x: x/x.sum(), axis=0)
    data = clr(df_joined.T.values)
    df_clr = pd.DataFrame(data, index=df_joined.columns, columns=df_joined.index)
    return df_clr[['found' in _ for _ in df_clr.index]], df_clr[['found' not in _ for _ in df_clr.index]]
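A toy check of the zero-replacement, closure, and CLR core of these functions (the median-depth rescaling and the 0.2% abundance filter are skipped here; the numbers are made up):
In [ ]:
# Two taxa x two samples; the zero in s1 gets the 0.65 pseudocount before CLR.
toy = pd.DataFrame({'s1': [10., 0.], 's2': [6., 4.]}, index=['taxA', 'taxB'])
toy[toy == 0] = .65
toy = toy.apply(lambda x: x/x.sum(), axis=0)  # close each sample to 1
print(pd.DataFrame(clr(toy.T.values), index=toy.columns, columns=toy.index))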
In [11]:
GOLD_SPECIES = normalize_taxatable(ground_truth_species)
In [ ]:
# Score BURST alignments against the ground truth accessions
def fetch_burst_results():
    # Columns = [aligner, depth, tp, fp, fn, precision, recall, f1, level, site, spearman, pearson, jaccard]
    for depth in DEPTHS:
        for alignment in ['95', '98']:
            file_str = "../results/simulations_all_{alignment}/combined_seqs.fna_{depth}_*/alignment.burst.b6".format(depth=depth, alignment=alignment)
            infiles = glob.glob(file_str)
            for infile in infiles:
                # Tally per-site tp/fp by comparing the found accession to the simulated one
                f1_results = defaultdict(lambda: defaultdict(int))
                with open(infile) as fh:
                    for line in fh:
                        row = line.rstrip().split("\t")
                        accession_found = row[1]
                        accession_actual = title2acc[row[0]]
                        group = row[0].split('_')[0]
                        if accession_found == accession_actual:
                            f1_results[group]['tp'] += 1
                        else:
                            f1_results[group]['fp'] += 1
                folder = '/'.join(infile.split('/')[:-1])
                df = pd.read_csv(folder + "/taxatable.burst.species.normalized.txt", sep="\t", index_col=0)
                df_norm, df_gold = normalize_two_taxatable(df, ground_truth_species)
                for group in f1_results.keys():
                    taxahit_found = set(df[df[group] > 5].index)
                    taxahit_actual = set(ground_truth_species[ground_truth_species[group] > 5].index)
                    jaccard = len(taxahit_found.intersection(taxahit_actual))/float(len(taxahit_found.union(taxahit_actual)))
                    tp = f1_results[group]['tp']
                    fp = f1_results[group]['fp']
                    # Reads the aligner failed to place are false negatives; each site holds a third of the reads
                    fn = int(DEPTHS_DICT[depth]/3.) - (tp + fp)
                    precision = tp/float(tp+fp)
                    recall = tp/float(tp+fn)
                    f1_score = 2*precision*recall/(precision+recall)  # harmonic mean of precision and recall
                    level = 'species'
                    site = group
                    series_found = df_norm.T[group + "_found"]
                    series_actual = df_gold.T[group]
                    pearson = series_actual.corr(series_found, method='pearson')
                    spearman = series_actual.corr(series_found, method='spearman')
                    # df_temp = pd.DataFrame([series_found, series_actual]).T
                    # df_temp.columns = ['found', 'actual']
                    # df_temp = df_temp.fillna(0)
                    # d = dice(df_temp['found'], df_temp['actual'])
                    yield ['BURST ' + alignment, tp+fp+fn, tp, fp, fn, precision, recall, f1_score, level, site, spearman, pearson, jaccard]
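The tp/fp bookkeeping above repeats verbatim in every fetcher below; a hypothetical helper (score_group is not in the original notebook) that would factor it out, and also shows why F1 is the harmonic rather than the arithmetic mean: at precision 1.0 and recall 0.1 the harmonic mean is about 0.18, not 0.55.
In [ ]:
# Hypothetical refactor of the shared metric block from each fetch_*_results
# generator. `expected` is the per-site read count, int(DEPTHS_DICT[depth]/3.).
def score_group(tp, fp, expected):
    fn = expected - (tp + fp)          # unclassified reads count as false negatives
    precision = tp / float(tp + fp)
    recall = tp / float(tp + fn)
    f1_score = 2 * precision * recall / (precision + recall)
    return fn, precision, recall, f1_score
print(score_group(10, 0, 100))  # recall 0.1, precision 1.0 -> F1 ~= 0.18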
In [ ]:
df_burst = pd.DataFrame(fetch_burst_results(), columns=COLUMNS)
df_burst.head()
In [ ]:
df_burst.tail()
In [ ]:
# Score BURST taxonomy assignments against the ground truth taxonomy strings
def fetch_bursttaxa_results():
    # Columns = [aligner, depth, tp, fp, fn, precision, recall, f1, level, site, spearman, pearson, jaccard]
    for depth in DEPTHS:
        for alignment in ['95', '98']:
            file_str = "../results/simulations_all_{alignment}/combined_seqs.fna_{depth}_*/alignment.burst.b6".format(depth=depth, alignment=alignment)
            infiles = glob.glob(file_str)
            for infile in infiles:
                f1_results = defaultdict(lambda: defaultdict(int))
                with open(infile) as fh:
                    for line in fh:
                        row = line.rstrip().split("\t")
                        taxa_found = row[-1]
                        if 's__' in taxa_found:
                            # Truncate to species level (seven ranks)
                            taxa_found = ';'.join(taxa_found.split(';')[:7])
                        taxa_actual = title2tax[row[0]]
                        group = row[0].split('_')[0]
                        # A call at any ancestor of the true taxonomy counts as a true positive
                        if taxa_actual.startswith(taxa_found):
                            f1_results[group]['tp'] += 1
                        else:
                            f1_results[group]['fp'] += 1
                folder = '/'.join(infile.split('/')[:-1])
                df = pd.read_csv(folder + "/taxatable.burst.taxonomy.species.txt", sep="\t", index_col=0)
                df_norm, df_gold = normalize_two_taxatable(df, ground_truth_species)
                for group in f1_results.keys():
                    taxahit_found = set(df[df[group] > 5].index)
                    taxahit_actual = set(ground_truth_species[ground_truth_species[group] > 5].index)
                    jaccard = len(taxahit_found.intersection(taxahit_actual))/float(len(taxahit_found.union(taxahit_actual)))
                    tp = f1_results[group]['tp']
                    fp = f1_results[group]['fp']
                    fn = int(DEPTHS_DICT[depth]/3.) - (tp + fp)
                    precision = tp/float(tp+fp)
                    recall = tp/float(tp+fn)
                    f1_score = 2*precision*recall/(precision+recall)  # harmonic mean of precision and recall
                    level = 'species'
                    site = group
                    series_found = df_norm.T[group + "_found"]
                    series_actual = df_gold.T[group]
                    pearson = series_actual.corr(series_found, method='pearson')
                    spearman = series_actual.corr(series_found, method='spearman')
                    yield ['BURST ' + alignment + " Taxonomy", tp+fp+fn, tp, fp, fn, precision, recall, f1_score, level, site, spearman, pearson, jaccard]
In [ ]:
df_burst_taxa = pd.DataFrame(fetch_bursttaxa_results(), columns=COLUMNS)
df_burst_taxa.head()
In [ ]:
df_burst_taxa.tail()
In [ ]:
strain2accession = {}
with open("../data/references/rep82/rep82.tax") as inf:
    for line in inf:
        row = line.rstrip().split()
        strain2accession[row[1]] = row[0]
In [ ]:
# f1_results_utree = defaultdict(lambda: defaultdict(int))
# with open("../results/simulations_all_test/combined_seqs.fna_fulldepth_1/alignment.utree.tsv") as inf:
#     for line in inf:
#         row = line.rstrip().split("\t")
#         if row[1] in strain2accession:
#             accession_found = strain2accession[row[1]]
#             accession_actual = title2acc[row[0]]
#             group = row[0].split('_')[0]
#             if accession_found == accession_actual:
#                 f1_results_utree[group]['tp'] += 1
#             else:
#                 f1_results_utree[group]['fp'] += 1
In [ ]:
# Score UTree assignments against the ground truth accessions
def fetch_utree_results():
    # Columns = [aligner, depth, tp, fp, fn, precision, recall, f1, level, site, spearman, pearson, jaccard]
    for depth in DEPTHS:
        for alignment in ['95']:
            file_str = "../results/simulations_all_{alignment}/combined_seqs.fna_{depth}_*/alignment.utree.tsv".format(depth=depth, alignment=alignment)
            infiles = glob.glob(file_str)
            for infile in infiles:
                f1_results = defaultdict(lambda: defaultdict(int))
                with open(infile) as fh:
                    for line in fh:
                        row = line.rstrip().split("\t")
                        if row[1] in strain2accession:
                            accession_found = strain2accession[row[1]]
                            accession_actual = title2acc[row[0]]
                            group = row[0].split('_')[0]
                            if accession_found == accession_actual:
                                f1_results[group]['tp'] += 1
                            else:
                                f1_results[group]['fp'] += 1
                folder = '/'.join(infile.split('/')[:-1])
                df = pd.read_csv(folder + "/taxatable.utree.species.normalized.txt", sep="\t", index_col=0)
                df_norm, df_gold = normalize_two_taxatable(df, ground_truth_species)
                for group in f1_results.keys():
                    taxahit_found = set(df[df[group] > 5].index)
                    taxahit_actual = set(ground_truth_species[ground_truth_species[group] > 5].index)
                    jaccard = len(taxahit_found.intersection(taxahit_actual))/float(len(taxahit_found.union(taxahit_actual)))
                    tp = f1_results[group]['tp']
                    fp = f1_results[group]['fp']
                    fn = int(DEPTHS_DICT[depth]/3.) - (tp + fp)
                    precision = tp/float(tp+fp)
                    recall = tp/float(tp+fn)
                    f1_score = 2*precision*recall/(precision+recall)  # harmonic mean of precision and recall
                    level = 'species'
                    site = group
                    series_found = df_norm.T[group + "_found"]
                    series_actual = df_gold.T[group]
                    pearson = series_actual.corr(series_found, method='pearson')
                    spearman = series_actual.corr(series_found, method='spearman')
                    yield ['UTree', tp+fp+fn, tp, fp, fn, precision, recall, f1_score, level, site, spearman, pearson, jaccard]
In [ ]:
df_utree = pd.DataFrame(fetch_utree_results(), columns=COLUMNS)
df_utree.head()
In [ ]:
df_utree.tail()
In [ ]:
from shogun.redistribute import Taxonomy
from shogun.utils import build_lca_map
from shogun.parsers import yield_alignments_from_sam_inf
tree = Taxonomy("../data/references/rep82/rep82.tax")
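For intuition, the lowest-common-ancestor collapse that build_lca_map performs can be illustrated on two rep82-style taxonomy strings; this standalone sketch is not shogun's implementation (which works over the Taxonomy tree), it just keeps the shared prefix of ranks:
In [ ]:
# Standalone illustration of the LCA idea behind build_lca_map.
def lca_string(tax_a, tax_b):
    shared = []
    for a, b in zip(tax_a.split(';'), tax_b.split(';')):
        if a != b:
            break
        shared.append(a)
    return ';'.join(shared)
print(lca_string("k__Bacteria;p__Firmicutes;g__Blautia;s__sp1",
                 "k__Bacteria;p__Firmicutes;g__Roseburia;s__sp2"))
# -> k__Bacteria;p__Firmicutes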
In [ ]:
# Score Bowtie2 LCA assignments against the ground truth taxonomy strings
def fetch_bowtie2_results():
    # Columns = [aligner, depth, tp, fp, fn, precision, recall, f1, level, site, spearman, pearson, jaccard]
    for depth in DEPTHS:
        for alignment in ['95', '98']:
            file_str = "../results/simulations_all_{alignment}/combined_seqs.fna_{depth}_*/alignment.bowtie2.sam".format(depth=depth, alignment=alignment)
            infiles = glob.glob(file_str)
            for infile in infiles:
                f1_results = defaultdict(lambda: defaultdict(int))
                align_gen = yield_alignments_from_sam_inf(infile)
                lca_map = build_lca_map(align_gen, tree)
                for key, value in lca_map.items():
                    if value:
                        # Only species-level LCA calls are scored
                        if 's__' in value:
                            taxa_found = ';'.join(value.split(';')[:7])
                            taxa_actual = title2tax[key]
                            group = key.split('_')[0]
                            if taxa_actual.startswith(taxa_found):
                                f1_results[group]['tp'] += 1
                            else:
                                f1_results[group]['fp'] += 1
                folder = '/'.join(infile.split('/')[:-1])
                df = pd.read_csv(folder + "/taxatable.bowtie2.species.normalized.txt", sep="\t", index_col=0)
                df_norm, df_gold = normalize_two_taxatable(df, ground_truth_species)
                for group in f1_results.keys():
                    taxahit_found = set(df[df[group] > 5].index)
                    taxahit_actual = set(ground_truth_species[ground_truth_species[group] > 5].index)
                    jaccard = len(taxahit_found.intersection(taxahit_actual))/float(len(taxahit_found.union(taxahit_actual)))
                    tp = f1_results[group]['tp']
                    fp = f1_results[group]['fp']
                    fn = int(DEPTHS_DICT[depth]/3.) - (tp + fp)
                    precision = tp/float(tp+fp)
                    recall = tp/float(tp+fn)
                    f1_score = 2*precision*recall/(precision+recall)  # harmonic mean of precision and recall
                    level = 'species'
                    site = group
                    series_found = df_norm.T[group + "_found"]
                    series_actual = df_gold.T[group]
                    pearson = series_actual.corr(series_found, method='pearson')
                    spearman = series_actual.corr(series_found, method='spearman')
                    yield ['Bowtie2 ' + alignment, tp+fp+fn, tp, fp, fn, precision, recall, f1_score, level, site, spearman, pearson, jaccard]
In [ ]:
df_bowtie2 = pd.DataFrame(fetch_bowtie2_results(), columns=COLUMNS)
df_bowtie2.head()
In [ ]:
df_bowtie2.tail()
In [ ]:
df_all = pd.concat([df_burst, df_burst_taxa, df_utree, df_bowtie2])
df_all['depth_log10'] = np.round(np.log10(df_all['depth']))
df_all.to_csv("../figures/simulations_all.txt")
In [ ]:
# Recall of each taxonomic profiler across sequencing depths
# y-axis: Recall
# x-axis: depth (log10)
g = sns.factorplot(x="depth_log10", y="recall", hue="aligner", col="site", data=df_all, palette=COLORS)
(g.set_ylabels("Recall").set_xlabels("Depth (log10)"))
pltname = "recall"
save_plot(g, pltname)
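These factorplot cells assume the seaborn API of the time; in seaborn 0.9+ factorplot was renamed catplot and the default kind changed, so an equivalent call under that assumption would be:
In [ ]:
# Equivalent call for seaborn >= 0.9 (kind='point' matches factorplot's old default).
g = sns.catplot(x="depth_log10", y="recall", hue="aligner", col="site",
                data=df_all, palette=COLORS, kind="point")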
In [ ]:
# Spearman correlation with the ground truth profile across sequencing depths
# y-axis: Spearman
# x-axis: depth (log10)
g = sns.factorplot(x="depth_log10", y="spearman", hue="aligner", col="site", data=df_all, palette=COLORS)
(g.set_ylabels("Spearman").set_xlabels("Depth (log10)"))
pltname = "spearman"
save_plot(g, pltname)
In [ ]:
df_all['aligner'].unique()
In [ ]:
# Jaccard overlap with the ground truth taxa across sequencing depths
# y-axis: Jaccard
# x-axis: depth (log10)
g = sns.factorplot(x="depth_log10", y="jaccard", hue="aligner", col="site", data=df_all, palette=COLORS)
(g.set_ylabels("Jaccard").set_xlabels("Depth (log10)"))
pltname = "jaccard"
save_plot(g, pltname)
In [ ]:
# F1 score of each taxonomic profiler across sequencing depths
# y-axis: F1
# x-axis: depth (log10)
g = sns.factorplot(x="depth_log10", y="f1", hue="aligner", col="site", data=df_all, palette=COLORS)
(g.set_ylabels("F1 Micro").set_xlabels("Depth (log10)"))
pltname = "f1"
save_plot(g, pltname)
In [ ]:
# Precision of each taxonomic profiler across sequencing depths
# y-axis: Precision
# x-axis: depth (log10)
g = sns.factorplot(x="depth_log10", y="precision", hue="aligner", col="site", data=df_all, palette=COLORS)
(g.set_ylabels("Precision").set_xlabels("Depth (log10)"))
pltname = "precision"
save_plot(g, pltname)
In [ ]:
# Score Centrifuge classifications against the ground truth accessions
def fetch_centrifuge_results():
    # Columns = [aligner, depth, tp, fp, fn, precision, recall, f1, level, site, spearman, pearson, jaccard]
    for depth in DEPTHS:
        for alignment in ['95']:
            file_str = "../results/simulations_all_{alignment}/combined_seqs.fna_{depth}_*/centrifuge.classification.txt".format(depth=depth, alignment=alignment)
            infiles = glob.glob(file_str)
            for infile in infiles:
                f1_results = defaultdict(lambda: defaultdict(int))
                with open(infile) as inf:
                    next(inf)  # skip the header line
                    for line in inf:
                        row = line.rstrip().split("\t")
                        if row[-1] == "1":  # numMatches == 1, i.e. uniquely classified reads only
                            accession_found = row[1]
                            accession_actual = title2acc[row[0]]
                            group = row[0].split('_')[0]
                            if accession_found == accession_actual:
                                f1_results[group]['tp'] += 1
                            else:
                                f1_results[group]['fp'] += 1
                for group in f1_results.keys():
                    tp = f1_results[group]['tp']
                    fp = f1_results[group]['fp']
                    fn = int(DEPTHS_DICT[depth]/3.) - (tp + fp)
                    precision = tp/float(tp+fp)
                    recall = tp/float(tp+fn)
                    f1_score = 2*precision*recall/(precision+recall)  # harmonic mean of precision and recall
                    level = 'species'
                    site = group
                    # No abundance table is parsed here, so spearman/pearson/jaccard are zero placeholders
                    yield ['Centrifuge', tp+fp+fn, tp, fp, fn, precision, recall, f1_score, level, site, 0, 0, 0]
In [ ]:
df_centrifuge = pd.DataFrame(fetch_centrifuge_results(), columns=COLUMNS)
df_centrifuge.head()
In [ ]:
taxid2accession = dict()
with open("../results/indices/centrifuge_rep82.map") as fh:
    for line in fh:
        row = line.rstrip().split('\t')
        taxid2accession[row[1]] = row[0]
In [ ]:
# Score Kraken classifications against the ground truth accessions
def fetch_kraken_results():
    # Columns = [aligner, depth, tp, fp, fn, precision, recall, f1, level, site, spearman, pearson, jaccard]
    for depth in DEPTHS:
        for alignment in ['95']:
            file_str = "../results/simulations_all_{alignment}/combined_seqs.fna_{depth}_*/kraken.report.txt".format(depth=depth, alignment=alignment)
            infiles = glob.glob(file_str)
            for infile in infiles:
                f1_results = defaultdict(lambda: defaultdict(int))
                with open(infile) as inf:
                    for line in inf:
                        row = line.rstrip().split("\t")
                        # "C" marks a classified read; map Kraken's taxid back to an accession
                        if row[2] in taxid2accession and row[0] == "C":
                            accession_found = taxid2accession[row[2]]
                            accession_actual = title2acc[row[1]]
                            group = row[1].split('_')[0]
                            if accession_found == accession_actual:
                                f1_results[group]['tp'] += 1
                            else:
                                f1_results[group]['fp'] += 1
                for group in f1_results.keys():
                    tp = f1_results[group]['tp']
                    fp = f1_results[group]['fp']
                    fn = int(DEPTHS_DICT[depth]/3.) - (tp + fp)
                    precision = tp/float(tp+fp)
                    recall = tp/float(tp+fn)
                    f1_score = 2*precision*recall/(precision+recall)  # harmonic mean of precision and recall
                    level = 'species'
                    site = group
                    # No abundance table is parsed here, so spearman/pearson/jaccard are zero placeholders
                    yield ['Kraken', tp+fp+fn, tp, fp, fn, precision, recall, f1_score, level, site, 0, 0, 0]
In [ ]:
df_kraken = pd.DataFrame(fetch_kraken_results(), columns=COLUMNS)
df_kraken.head()
In [ ]:
df_all = pd.concat([df_burst, df_burst_taxa, df_utree, df_bowtie2, df_kraken, df_centrifuge])
df_all['depth_log10'] = np.round(np.log10(df_all['depth']))
df_all.to_csv("../figures/simulations_kraken_centrifuge.txt")
In [ ]:
# F1 score across sequencing depths, now including Kraken and Centrifuge
# y-axis: F1
# x-axis: depth (log10)
g = sns.factorplot(x="depth_log10", y="f1", hue="aligner", col="site", data=df_all)
(g.set_ylabels("F1 Micro").set_xlabels("Depth (log10)"))
pltname = "f1_kraken_centrifuge"
save_plot(g, pltname)
In [ ]:
# Precision across sequencing depths, now including Kraken and Centrifuge
# y-axis: Precision
# x-axis: depth (log10)
g = sns.factorplot(x="depth_log10", y="precision", hue="aligner", col="site", data=df_all)
(g.set_ylabels("Precision").set_xlabels("Depth (log10)"))
pltname = "precision_kraken_centrifuge"
save_plot(g, pltname)
In [ ]:
# Recall across sequencing depths, now including Kraken and Centrifuge
# y-axis: Recall
# x-axis: depth (log10)
g = sns.factorplot(x="depth_log10", y="recall", hue="aligner", col="site", data=df_all)
(g.set_ylabels("Recall").set_xlabels("Depth (log10)"))
pltname = "recall_kraken_centrifuge"
save_plot(g, pltname)