In [61]:
%matplotlib inline
import pandas as pd
from dojo.taxonomy import NCBITree
import numpy as np
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
import matplotlib.pyplot as plt
import matplotlib
In [62]:
output_notebook()
In [63]:
tree = NCBITree()
In [64]:
# Read in results table and normalize
df_emb_normalized = pd.read_csv("../results/mp2_gold/rep82/embalmer_taxatable.species.normalized.txt", sep="\t", index_col=0)
df_emb_normalized = df_emb_normalized.apply(pd.to_numeric)
In [65]:
# Group into same samples to check variance
groups = [".".join(_.split(".")[:3]) for _ in df_emb_normalized.columns]
In [66]:
# Check the standard in the observed species per group
df_emb_normalized = df_emb_normalized.T
grouping = df_emb_normalized.apply(lambda x: x > 0).groupby(groups)
In [ ]:
# Print out the results
grouping.sum(axis=1).std(axis=1)
In [ ]:
# Accumulate gold standard information
gold_standard = "../data/mp2_gold/results.txt"
!head {gold_standard}
!wc -l {gold_standard}
In [67]:
# Parse the gold standard taxatable
counts_dict = {}
with open(gold_standard) as inf:
for title in inf:
sample_name = title.split()[1]
ncbi_tids = next(inf).split()
counts = next(inf).split()
counts_dict[sample_name] = dict(zip(ncbi_tids, counts))
df_gold = pd.DataFrame(counts_dict)
df_gold = df_gold.fillna(0)
# Save to file
df_gold.to_csv("../results/mp2_gold/gold.ncbitid.txt", sep='\t', float_format="%d")
In [ ]:
In [ ]:
In [68]:
df_gold.head()
Out[68]:
In [11]:
# Used DOJO instead
#!cp ../data/references/a2t.rs81.txt /dev/shm/
# a2t_file_path = "/dev/shm/a2t.rs81.txt"
# !head {a2t_file_path}
In [12]:
# Used DOJO instead
# a2t = {}
# import csv
# with open(a2t_file_path) as inf:
# csv_inf = csv.reader(inf, delimiter="\t")
# next(csv_inf)
# for row in csv_inf:
# a2t[row[1]] = row[2]
# index_mapping = [a2t[_] for _ in df_gold.index if _ in a2t]d
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [69]:
index_mapping = [tree.green_genes_lineage(int(taxid), depth=8, depth_force=True) for taxid in df_gold.index]
df_gold["green_genes_tax"] = index_mapping
df_gold.to_csv("../results/mp2_gold/gold.strain.txt", sep='\t', float_format="%d")
species = [";".join(_.split(";")[:-1]) for _ in df_gold["green_genes_tax"]]
df_gold_tmp = df_gold.drop("green_genes_tax", axis=1)
df_species = df_gold_tmp.groupby(species).sum(axis=0)
df_species = df_species.fillna(0)
df_species = df_species.apply(pd.to_numeric).astype(int)
df_species.to_csv("../results/mp2_gold/gold.species.txt", sep='\t', float_format="%d")
In [ ]:
In [70]:
# Index must be taxonomy
def summarize_df_at_level(df, level):
level = [";".join(_.split(";")[:level]) for _ in df.index]
return df.groupby(level).sum(axis=0)
def remove_plasmids(df):
level = [_ if not "Plasmid" in _ else _.replace("Plasmid", "") for _ in df.index]
return df.groupby(level).sum(axis=0)
In [73]:
# Analyze results at all levels
spearman_results = []
for i in range(7):
level = i + 1
if level == 7:
_df_obs = df_emb_normalized
_df_gold = df_species
else:
_df_obs = summarize_df_at_level(df_emb_normalized.T, level)
_df_obs = remove_plasmids(_df_obs).T
_df_gold = summarize_df_at_level(df_species, level)
for i, row in _df_obs.iterrows():
_temp = row[row > 0]
_temp2 = _df_gold[row.name]
_temp2 = _temp2[_temp2 > 0]
_test_set = set(_temp.index)
_gold_set = set(_temp2.index)
_intersection = _test_set.intersection(_gold_set)
_union = _test_set.union(_gold_set)
_hits_obs = _temp.sum()
_hits_gold = _temp2.sum()
spearman_results.append([row.name, level, _temp.corr(_temp2, method="spearman"), len(_test_set), len(_gold_set), len(_intersection), len(_union), len(_intersection)/len(_union), _hits_obs, _hits_gold])
print("Done with level: %d" % level)
df_results = pd.DataFrame(spearman_results, columns=["sample", "level", "spearman", "num_observed", "num_actual", "num_intersection", "num_union", "percent_intersection", "hits_obs", "hits_gold"])
In [75]:
df_results.tail()
Out[75]:
In [52]:
# Missing a Kingdom check
df_obs_kingdom = summarize_df_at_level(df_emb_normalized.T, 1).T
print(df_obs_kingdom.columns)
print(remove_plasmids(summarize_df_at_level(df_emb_normalized.T, 1)).T.columns)
In [22]:
# Missing a Kingdom check
df_gold_kingdom = summarize_df_at_level(df_species, 1).T
df_gold_kingdom.columns
Out[22]:
In [59]:
tax = ["Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
# Make error bars plots
depths = [int(_.split('.')[-2]) if _[-5:] != 'ninja' else 40000000 for _ in df_results["sample"]]
df_results["depths"] = np.log10(depths)
for i in range(7):
level = i + 1
df_results_kingdom = df_results[df_results["level"] == level]
groups = [".".join(_.split(".")[:-1]) for _ in df_results_kingdom["sample"]]
df_results_kingdoms_groups = df_results_kingdom.groupby("depths").median()
df_results_kingdoms_groups_max = df_results_kingdom.groupby("depths").quantile(.75)
df_results_kingdoms_groups_min = df_results_kingdom.groupby("depths").quantile(.25)
plt.figure()
plt.title("BURST results on MP2 Gold at Level %s" % tax[i])
plt.ylim(-.05, 1.05)
plt.xlim(2, 8)
plt.xlabel("Sequencing Depth (Log10)")
plt.ylabel("% Intersection of Taxonomies Observed")
plt.errorbar(df_results_kingdoms_groups.index, df_results_kingdoms_groups["percent_intersection"], yerr=[df_results_kingdoms_groups["percent_intersection"]-df_results_kingdoms_groups_min["percent_intersection"], df_results_kingdoms_groups_max["percent_intersection"]-df_results_kingdoms_groups["percent_intersection"]], fmt='--o', ecolor='g', capthick=2)
# plt.show()
plt.savefig("../results/mp2_gold/plots/rep82_mp2_gold_{}_{}.pdf".format(tax[i], 'taxjac'), bbox_inches="tight")
In [60]:
tax = ["Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
# Make error bars plots
depths = [int(_.split('.')[-2]) if _[-5:] != 'ninja' else 40000000 for _ in df_results["sample"]]
df_results["depths"] = np.log10(depths)
for i in range(7):
level = i + 1
df_results_kingdom = df_results[df_results["level"] == level]
groups = [".".join(_.split(".")[:-1]) for _ in df_results_kingdom["sample"]]
df_results_kingdoms_groups = df_results_kingdom.groupby("depths").median()
df_results_kingdoms_groups_max = df_results_kingdom.groupby("depths").quantile(.75)
df_results_kingdoms_groups_min = df_results_kingdom.groupby("depths").quantile(.25)
plt.figure()
plt.title("BURST results on MP2 Gold at Level %s" % tax[i])
plt.ylim(.5, 1.05)
plt.xlim(2, 8)
plt.xlabel("Sequencing Depth (Log10)")
plt.ylabel("Spearman Correlation")
plt.errorbar(df_results_kingdoms_groups.index, df_results_kingdoms_groups["spearman"], yerr=[df_results_kingdoms_groups["spearman"]-df_results_kingdoms_groups_min["spearman"], df_results_kingdoms_groups_max["spearman"]-df_results_kingdoms_groups["spearman"]], fmt='--o', ecolor='g', capthick=2)
# plt.show()
plt.savefig("../results/mp2_gold/plots/rep82_mp2_gold_{}_{}.pdf".format(tax[i], 'spearman'), bbox_inches="tight")
In [25]:
def drop_low_abundance(df):
depths_thresh_per_sample = df.sum(axis=0)*.02
for name, thresh in zip(df.columns, depths_thresh_per_sample):
df.loc[df[name] <= thresh, name] = 0
return df
In [26]:
spearman_results = []
# Analyze results at all levels dropping low abundance
for i in range(7):
level = i + 1
if level == 7:
_df_obs = df_emb_normalized
_df_gold = df_species
else:
_df_obs = summarize_df_at_level(df_emb_normalized.T, level).T
_df_gold = summarize_df_at_level(df_species, level)
_df_obs = drop_low_abundance(_df_obs.T).T
_df_gold = drop_low_abundance(_df_gold)
for i, row in _df_obs.iterrows():
_temp = row[row > 0]
_temp2 = _df_gold[row.name]
_temp2 = _temp2[_temp2 > 0]
_test_set = set(_temp.index)
_gold_set = set(_temp2.index)
_intersection = _test_set.intersection(_gold_set)
_union = _test_set.union(_gold_set)
spearman_results.append([row.name, level, _temp.corr(_temp2, method="spearman"), len(_test_set), len(_gold_set), len(_intersection), len(_union), len(_intersection)/len(_union)])
print("Done with level: %d" % level)
df_results_thresh = pd.DataFrame(spearman_results, columns=["sample", "level", "spearman", "num_observed", "num_actual", "num_intersection", "num_union", "percent_intersection"])
In [45]:
tax = ["Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
# Make error bars plots
depths = [int(_.split('.')[-2]) if _[-5:] != 'ninja' else 40000000 for _ in df_results["sample"]]
df_results_thresh["depths"] = np.log10(depths)
for i in range(7):
level = i + 1
df_results_kingdom = df_results_thresh[df_results_thresh["level"] == level]
groups = [".".join(_.split(".")[:-1]) for _ in df_results_kingdom["sample"]]
df_results_kingdoms_groups = df_results_kingdom.groupby("depths").median()
df_results_kingdoms_groups_max = df_results_kingdom.groupby("depths").quantile(.75)
df_results_kingdoms_groups_min = df_results_kingdom.groupby("depths").quantile(.25)
plt.figure()
plt.title("BURST results on MP2 Gold at Level %s" % tax[i])
plt.ylim(-.05, 1.05)
plt.xlim(2, 8)
plt.xlabel("Sequencing Depth (Log10)")
plt.ylabel("% Intersection of Taxonomies Observed")
plt.errorbar(df_results_kingdoms_groups.index, df_results_kingdoms_groups["percent_intersection"], yerr=[df_results_kingdoms_groups["percent_intersection"]-df_results_kingdoms_groups_min["percent_intersection"], df_results_kingdoms_groups_max["percent_intersection"]-df_results_kingdoms_groups["percent_intersection"]], fmt='--o', ecolor='g', capthick=2)
# plt.show()
plt.savefig("../results/mp2_gold/plots/rep82_mp2_gold_{}_{}_filter.pdf".format(tax[i], 'taxjac'), bbox_inches="tight")
In [46]:
tax = ["Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
# Make error bars plots
depths = [int(_.split('.')[-2]) if _[-5:] != 'ninja' else 40000000 for _ in df_results["sample"]]
df_results["depths"] = np.log10(depths)
for i in range(7):
level = i + 1
df_results_kingdom = df_results_thresh[df_results_thresh["level"] == level]
groups = [".".join(_.split(".")[:-1]) for _ in df_results_kingdom["sample"]]
df_results_kingdoms_groups = df_results_kingdom.groupby("depths").median()
df_results_kingdoms_groups_max = df_results_kingdom.groupby("depths").quantile(.75)
df_results_kingdoms_groups_min = df_results_kingdom.groupby("depths").quantile(.25)
plt.figure()
plt.title("BURST results on MP2 Gold at Level %s" % tax[i])
plt.ylim(.5, 1.05)
plt.xlim(2, 8)
plt.xlabel("Sequencing Depth (Log10)")
plt.ylabel("Spearman Correlation")
plt.errorbar(df_results_kingdoms_groups.index, df_results_kingdoms_groups["spearman"], yerr=[df_results_kingdoms_groups["spearman"]-df_results_kingdoms_groups_min["spearman"], df_results_kingdoms_groups_max["spearman"]-df_results_kingdoms_groups["spearman"]], fmt='--o', ecolor='g', capthick=2)
# plt.show()
plt.savefig("../results/mp2_gold/plots/rep82_mp2_gold_{}_{}_filter.pdf".format(tax[i], 'spearman'), bbox_inches="tight")