In [1]:
import glob
import re
from collections import defaultdict
from bs4 import BeautifulSoup
import pandas as pd
In [2]:
filename_regex = re.compile(r"cluster(.+?)_(?P<tool>.+?)_vs_(?P<db>.+)\.tsv")
e_value_threshold = 0.001
In [3]:
def parse_blast_results(folder):
blast_results = {}
for tsv_file in glob.glob("{}/*.tsv".format(folder)):
cluster,tool,db = filename_regex.search(tsv_file).groups()
min_eval = 3
#Parse blast tabular results and get smaller e-value of all seqs in a cluster
with open(tsv_file, "r" ) as tsv_fh:
for line in tsv_fh:
min_eval = min(min_eval, float(line.split("\t")[10]) )
#Store
blast_results[cluster] = ( (min_eval if min_eval <= e_value_threshold else "-",
tsv_file.replace(".tsv",".html")))
return blast_results
In [4]:
def parse_hmmer_results(folder):
hmmer_results = {}
for tsv_file in glob.glob("{}/*.tsv".format(folder)):
cluster,tool,db = filename_regex.search(tsv_file).groups()
min_eval = 3
#Parse hmmer fixed col results and get smaller e-value of all seqs in a cluster
with open(tsv_file, "r" ) as tsv_fh:
for line in tsv_fh:
if not line.startswith("#") and len(line) > 1:
min_eval = min(min_eval, float(re.split(" +",line,maxsplit=9)[7]) )
#Store
hmmer_results[cluster] = ( (min_eval if min_eval <= e_value_threshold else "-",
tsv_file.replace(".tsv",".txt")))
return hmmer_results
In [5]:
soup = BeautifulSoup(open("orfan_metrics/summary_all_20111209.html"), 'html.parser')
#Extract header
header = [str(x.string) for x in soup.findAll("thead")[0].findAll("tr")[1].findAll("th")][:-4]
#Extract values
rows = [ tuple(str(element.contents[0]) for element in row.findAll("td")[:-4]) for row in soup.findAll("tr") ]
#Summarize data in pandas dataframe
df = pd.DataFrame.from_records(rows[2:], columns=header)
df.index = df["Cluster"]
df = df.sort_values(by ="composite score",ascending=False)
#Subset columns
orfan_metrics = df[["ORF length (aa)","composite score"]]
orfan_metrics.head()
Out[5]:
In [6]:
blast_results = {
"blastn_vs_nt":parse_blast_results("../1_run_db_searches/blast_160427/blastn_vs_nt"),
"blastn_vs_env_nt":parse_blast_results("../1_run_db_searches/blast_160427/blastn_vs_env_nt"),
"blastn_vs_hmp_nuc":parse_blast_results("../1_run_db_searches/blast_160427/blastn_vs_hmp_nuc"),
# "blastn_vs_metahit_cds":parse_blast_results("blast_160427/blastn_vs_metahit_cds"),
"blastn_vs_metahit_2014_cds":parse_blast_results("../1_run_db_searches/metahit_2014_160620/blastn_vs_metahit_2014_cds/"),
"blastp_vs_nr":parse_blast_results("../1_run_db_searches/blast_160427/blastp_vs_nr"),
"blastp_vs_env_nr":parse_blast_results("../1_run_db_searches/blast_160427/blastp_vs_env_nr"),
"blastp_vs_hmp_pep":parse_blast_results("../1_run_db_searches/blast_160427/blastp_vs_hmp_pep"),
# "blastp_vs_metahit_pep":parse_blast_results("blast_160427/blastp_vs_metahit_pep"),
"blastp_vs_metahit_2014_pep":parse_blast_results("../1_run_db_searches/metahit_2014_160620/blastp_vs_metahit_2014_pep/"),
"hmmer_vs_nr":parse_hmmer_results("../1_run_db_searches/hmmer_160324/hmmer_vs_nr"),
"hmmer_vs_env_nr":parse_hmmer_results("../1_run_db_searches/hmmer_160324/hmmer_vs_env_nr"),
"hmmer_vs_hmp_pep":parse_hmmer_results("../1_run_db_searches/hmmer_160324/hmmer_vs_hmp_pep"),
# "hmmer_vs_metahit_pep":parse_hmmer_results("hmmer_160324/hmmer_vs_metahit_pep"),
"hmmer_vs_metahit_2014_pep":parse_hmmer_results("../1_run_db_searches/metahit_2014_160620/hmmer_vs_metahit_2014_pep/")
}
#Verify all searches have the same number of clusters
print("\n".join([str((x,len(blast_results[x]))) for x in blast_results]))
#cluster_names = [x for x in blast_results["blastn_vs_nt"]]
cluster_names = [x for x in blast_results["blastn_vs_hmp_nuc"]]
In [7]:
import yattag
In [8]:
doc, tag, text = yattag.Doc().tagtext()
with tag("html"):
with tag("head"):
with tag("title"):
text("Blast result summary")
doc.stag("link",rel="stylesheet",href="summaryreport.css")
with tag("body"):
with tag("table"):
with tag("thead"):
with tag("tr"):
for header in ["Cluster","orf length(aa)","composite score"]+ sorted(blast_results):
with tag("th"):
text(" vs ".join(header.split("_vs_")))
with tag("tbody"):
for cluster,cluster_stats in orfan_metrics.iterrows():
with tag("tr"):
with tag("td"):
text(cluster)
for stat in cluster_stats:
with tag("td"):
text(stat)
for search in sorted(blast_results):
with tag("td"):
with tag("a",href=blast_results[search][cluster][1]):
text(blast_results[search][cluster][0])
In [9]:
!mkdir -p 2_out
!cp summaryreport.css 2_out
with open("2_out/db_search_summary.html","w") as out_fh:
out_fh.write(yattag.indent(doc.getvalue()))
In [ ]: