In [1]:
import glob
import re
from collections import defaultdict
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
filename_regex = re.compile(r"cluster(.+?)_(?P<tool>.+?)_vs_(?P<db>.+)\.tsv")
e_value_threshold = 0.001

In [3]:
def parse_blast_results(folder):
    blast_results = {}
    for tsv_file in glob.glob("{}/*.tsv".format(folder)):
        cluster,tool,db = filename_regex.search(tsv_file).groups()
        min_eval = 3
        #Parse blast tabular results and get smaller e-value of all seqs in a cluster
        with open(tsv_file, "r" ) as tsv_fh:
            for line in tsv_fh:
                min_eval = min(min_eval, float(line.split("\t")[10])  )
        #Store 
        blast_results[cluster] = ( (min_eval if min_eval <= e_value_threshold else "-",
                                    tsv_file.replace(".tsv",".html")))
    return blast_results

In [4]:
def parse_hmmer_results(folder):
    hmmer_results = {}
    for tsv_file in glob.glob("{}/*.tsv".format(folder)):
        cluster,tool,db = filename_regex.search(tsv_file).groups()
        min_eval = 3
        #Parse hmmer fixed col results and get smaller e-value of all seqs in a cluster
        with open(tsv_file, "r" ) as tsv_fh:
            for line in tsv_fh:
                if not line.startswith("#") and len(line) > 1:
                    min_eval = min(min_eval, float(re.split(" +",line,maxsplit=9)[7])  )
        #Store 
        hmmer_results[cluster] = ( (min_eval if min_eval <= e_value_threshold else "-",
                                    tsv_file.replace(".tsv",".txt")))
    return hmmer_results

Parse pipeline stats from html


In [5]:
soup = BeautifulSoup(open("orfan_metrics/summary_all_20111209.html"), 'html.parser')
#Extract header
header = [str(x.string) for x in soup.findAll("thead")[0].findAll("tr")[1].findAll("th")][:-4]
#Extract values
rows = [ tuple(str(element.contents[0]) for element in row.findAll("td")[:-4]) for row in soup.findAll("tr") ]
#Summarize data in pandas dataframe
df = pd.DataFrame.from_records(rows[2:], columns=header)
df.index = df["Cluster"]
df = df.sort_values(by ="composite score",ascending=False)
#Subset columns
orfan_metrics = df[["ORF length (aa)","composite score"]]
orfan_metrics.head()


Out[5]:
ORF length (aa) composite score
Cluster
1217 91 7.57
339b 110 7.52
211b 80 7.49
562 61 7.39
297a 106 7.38

Parse blast and hmmer results


In [6]:
blast_results = {
    "blastn_vs_nt":parse_blast_results("../1_run_db_searches/blast_160427/blastn_vs_nt"),
    "blastn_vs_env_nt":parse_blast_results("../1_run_db_searches/blast_160427/blastn_vs_env_nt"),
    "blastn_vs_hmp_nuc":parse_blast_results("../1_run_db_searches/blast_160427/blastn_vs_hmp_nuc"),
#    "blastn_vs_metahit_cds":parse_blast_results("blast_160427/blastn_vs_metahit_cds"),
    "blastn_vs_metahit_2014_cds":parse_blast_results("../1_run_db_searches/metahit_2014_160620/blastn_vs_metahit_2014_cds/"),
    "blastp_vs_nr":parse_blast_results("../1_run_db_searches/blast_160427/blastp_vs_nr"),
    "blastp_vs_env_nr":parse_blast_results("../1_run_db_searches/blast_160427/blastp_vs_env_nr"),
    "blastp_vs_hmp_pep":parse_blast_results("../1_run_db_searches/blast_160427/blastp_vs_hmp_pep"),
#    "blastp_vs_metahit_pep":parse_blast_results("blast_160427/blastp_vs_metahit_pep"),
    "blastp_vs_metahit_2014_pep":parse_blast_results("../1_run_db_searches/metahit_2014_160620/blastp_vs_metahit_2014_pep/"),
    "hmmer_vs_nr":parse_hmmer_results("../1_run_db_searches/hmmer_160324/hmmer_vs_nr"),
    "hmmer_vs_env_nr":parse_hmmer_results("../1_run_db_searches/hmmer_160324/hmmer_vs_env_nr"),
    "hmmer_vs_hmp_pep":parse_hmmer_results("../1_run_db_searches/hmmer_160324/hmmer_vs_hmp_pep"),
#    "hmmer_vs_metahit_pep":parse_hmmer_results("hmmer_160324/hmmer_vs_metahit_pep"),
    "hmmer_vs_metahit_2014_pep":parse_hmmer_results("../1_run_db_searches/metahit_2014_160620/hmmer_vs_metahit_2014_pep/")
}

#Verify all searches have the same number of clusters
print("\n".join([str((x,len(blast_results[x]))) for x in blast_results]))

#cluster_names = [x for x in blast_results["blastn_vs_nt"]]
cluster_names = [x for x in blast_results["blastn_vs_hmp_nuc"]]


('blastn_vs_hmp_nuc', 32)
('hmmer_vs_env_nr', 32)
('hmmer_vs_metahit_2014_pep', 32)
('blastn_vs_metahit_2014_cds', 32)
('hmmer_vs_hmp_pep', 32)
('blastn_vs_env_nt', 32)
('hmmer_vs_nr', 32)
('blastp_vs_metahit_2014_pep', 32)
('blastn_vs_nt', 32)
('blastp_vs_nr', 32)
('blastp_vs_env_nr', 32)
('blastp_vs_hmp_pep', 32)

In [7]:
import yattag

In [8]:
doc, tag, text = yattag.Doc().tagtext()
with tag("html"):
    with tag("head"):
        with tag("title"):
            text("Blast result summary")
        doc.stag("link",rel="stylesheet",href="summaryreport.css")

    with tag("body"):
        with tag("table"):
            with tag("thead"):
                with tag("tr"):
                    for header in ["Cluster","orf length(aa)","composite score"]+ sorted(blast_results):
                        with tag("th"):
                            text(" vs ".join(header.split("_vs_")))
            with tag("tbody"):
                for cluster,cluster_stats in orfan_metrics.iterrows():
                    with tag("tr"):
                        with tag("td"):
                            text(cluster)
                        for stat in cluster_stats:
                            with tag("td"):
                                text(stat)
                        for search in sorted(blast_results):
                            with tag("td"):
                                with tag("a",href=blast_results[search][cluster][1]):
                                    text(blast_results[search][cluster][0])

In [9]:
!mkdir -p 2_out
!cp summaryreport.css 2_out
with open("2_out/db_search_summary.html","w") as out_fh:
    out_fh.write(yattag.indent(doc.getvalue()))

In [ ]: