In [20]:
from glob import glob
import os
from collections import defaultdict, Counter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("ticks")
COLORS=dict(zip(["burst", "utree", "centrifuge", "kraken", "bowtie2", 'burst-capitalist', 'burst-lca', 3, 4, 5], sns.color_palette("colorblind", 10)))
sns.palplot(sns.color_palette("colorblind", 10))
%matplotlib inline
In [2]:
# Load up RS2KO
protein2kos = defaultdict(list)
with open("../results/uds/rs2ko.txt") as inf:
for line in inf:
row = line.rstrip().split("\t")
row[0] = row[0].split('.')[0]
protein2kos[row[0]].extend(row[1].split())
In [77]:
def save_plot(fig, pltname, artists=()):
fig.savefig(os.path.join("..", "figures", "uds_" + pltname + ".png"), dpi=300, bbox_extra_artists=artists, bbox_inches='tight')
In [3]:
# Load up GMG.to.KO
rs2kos = defaultdict(list)
with open("../results/uds/./GMG.microbe.to.KO.map.txt") as inf:
for line in inf:
row = line.rstrip().split("\t")
row[0] = row[0].split('.')[0]
rs2kos[row[0]].extend(row[1].split())
In [4]:
rep82_R1 = glob("../results/uds/*_R1_001.rep82.b6")
rep82_set = set(rep82_R1)
miniGMG_R1 = glob("../results/uds/*_R1_001.miniGMG_darth.b6")
processing_pairs = []
for file in miniGMG_R1:
rep82_file = file.replace("miniGMG_darth", "rep82")
if rep82_file in rep82_set:
processing_pairs.append([file, rep82_file])
In [5]:
# Given a string, and start and end string, return the sandwiched string within the the original string
def find_between(s, first, last):
try:
start = s.index(first) + len(first)
end = s.index(last, start)
return s[start:end]
except ValueError:
return ""
In [6]:
for miniGMG, rep82 in processing_pairs:
output_folder = rep82.replace(".rep82.b6", "")
hit = 0
missed = 0
basename = os.path.basename(output_folder)
counter = Counter()
if not os.path.exists(output_folder):
os.makedirs(output_folder)
!shogun assign_taxonomy -o {output_folder}/taxatable.{basename}.txt -i {rep82} -d ../data/references/rep82/ -a burst
!shogun functional -i {output_folder}/taxatable.{basename}.txt -l strain -d ../data/references/rep82/ -o {output_folder}
!shogun functional -i {output_folder}/taxatable.{basename}.txt -l species -d ../data/references/rep82/ -o {output_folder}
!shogun functional -i {output_folder}/taxatable.{basename}.txt -l genus -d ../data/references/rep82/ -o {output_folder}
with open(miniGMG) as inf:
with open(output_folder + "/keggs.perline.txt", "w") as outf:
with open(output_folder + "/rid.missed.txt", "w") as outf_m:
for line in inf:
row = line.rstrip().split("\t")
kegg_id_found = False
if "protein_id" in row[1]:
proteins = find_between(row[1], "[protein_id=", "]").split(",")
proteins = [protein.split(".")[0] for protein in proteins]
for protein in proteins:
if protein in protein2kos:
counter.update(protein2kos[protein])
outf.write("%s\n" % " ".join(protein2kos[protein]))
kegg_id_found = True
hit += 1
if kegg_id_found == False:
refseq_id = find_between(row[1], 'lcl|', ' ')
refseq_id = refseq_id.split('.')[0]
if refseq_id in rs2kos:
counter.update(rs2kos[refseq_id])
outf.write('%s\n' % " ".join(rs2kos[refseq_id]))
kegg_id_found = True
hit += 1
else:
missed += 1
outf_m.write("%s\n" % protein)
print("%s\t%d\t%d\t%.2f" % (miniGMG, hit, missed, float(hit)/(hit+missed)))
with open(output_folder + "/keggs.output.txt", "w") as outf:
for key, value in counter.items():
outf.write("%s\t%s\n" % (key, value))
In [7]:
# Merge taxatables
def fetch_merged_keggtable(level):
kegg_ids = defaultdict(int)
files = glob("../results/uds/**/*.{}.kegg.txt".format(level))
for file in files:
with open(file) as inf:
next(inf)
for line in inf:
row = line.rstrip().split("\t")
kegg_ids[row[0]] += int(row[1])
return pd.Series(kegg_ids)
In [8]:
df_kegg = fetch_merged_keggtable("strain")
print(df_kegg.sum())
df_kegg.shape
Out[8]:
In [9]:
# Merge taxatables
def fetch_merged_minigmg():
kegg_ids = defaultdict(int)
files = glob("../results/uds/**/keggs.output.txt")
for file in files:
with open(file) as inf:
for line in inf:
row = line.rstrip().split("\t")
kegg_ids[row[0]] += int(row[1])
return pd.Series(kegg_ids)
In [10]:
df_minigmg = fetch_merged_minigmg()
df_minigmg
Out[10]:
In [11]:
mini_set = set(df_minigmg.index)
kegg_set = set(df_kegg.index)
In [12]:
print(len(mini_set.intersection(kegg_set)))
In [14]:
print(len(mini_set))
In [15]:
print(len(kegg_set))
In [91]:
# Merge taxatables
def fetched_hitrates(fulldepth_series):
fulldepth_set = set(fulldepth_series.index)
kegg_dict = defaultdict(int)
files = glob("../results/uds/**/keggs.perline.txt")
i = 0
for file in files:
with open(file) as inf:
for line in inf:
line = line.rstrip()
if not line == "":
i += 1
kegg_dict[line] += 1
if i % 1000 == 1:
series = pd.Series(kegg_dict)
kegg_set = set(kegg_dict.keys())
yield i, series.corr(fulldepth_series, method='spearman'), len(fulldepth_set.intersection(kegg_set))/float(len(fulldepth_set.union(kegg_set))), series.corr(fulldepth_series, method='pearson')
In [92]:
df_kegg_small = df_kegg[[_ in mini_set for _ in df_kegg.index]]
In [93]:
df_running = pd.DataFrame(fetched_hitrates(df_kegg_small), columns=["depth", "spearman", "jaccard", "pearson"])
df_running.head()
Out[93]:
In [94]:
df_running = df_running.fillna(0)
In [95]:
# We are interested in the speed up and efficiency of each of the aligners
# y-axis: jaccard
# x-axis: sequencing depth
x = np.array([i*1001 for i in range(len(jaccards))])
fig, ax = plt.subplots()
plt.plot(df_running['depth'], df_running['jaccard'])
plt.title("Directly Observed Genes to Predicted")
plt.ylabel("Jaccard Similarity")
plt.xlabel("Sequencing Depth")
save_plot(fig, 'jaccard')
In [96]:
# We are interested in the speed up and efficiency of each of the aligners
# y-axis: jaccard
# x-axis: sequencing depth
fig, ax = plt.subplots()
plt.plot(df_running['depth'], df_running['spearman'])
plt.title("Directly Observed Genes to Predicted")
plt.ylabel("Spearman Correlation")
plt.xlabel("Sequencing Depth")
save_plot(fig, 'spearman')
In [97]:
# We are interested in the speed up and efficiency of each of the aligners
# y-axis: jaccard
# x-axis: sequencing depth
fig, ax = plt.subplots()
plt.plot(df_running['depth'], df_running['pearson'])
plt.title("Directly Observed Genes to Predicted")
plt.ylabel("Pearson Correlation")
plt.xlabel("Sequencing Depth")
save_plot(fig, 'pearson')
In [61]:
df_kegg.corr(df_minigmg, method="spearman")
Out[61]:
In [63]:
df_kegg.index
Out[63]: