In [1]:
import pandas as pd
import re
from glob import glob
import itertools
In [2]:
def hmmer_parser(filename,header=None,domtbl=False):
col_number = 19 if not domtbl else 23
data_lines = []
with open(filename) as fh:
for line in fh:
line = line.rstrip("\n")
if line.startswith("#") or line == "":
continue
data_lines.append( re.split(r" +",line,maxsplit=col_number-1) )
return pd.DataFrame.from_records( data_lines, columns=header)
In [3]:
hmmer_folder_regex = re.compile(r"hmmer_vs_([a-zA-Z0-9_]+)")
cluster_id_regex = re.compile(r"cluster([0-9]+[ab]?)_")
hmmer_tsv_header = ["subject_id","s_accession","query_id","q_accession" ,
"e_value","bitscore","bias", "best_dmn_e_value","best_dmn_bitscore","best_dmn_bias",
"dne_exp","dne_reg","dne_clu","dne_ov", "dne_env",
"dne_dom", "dne_rep", "dne_inc", "s_description"]
hmmer_hits = []
for folder in glob("../1_run_db_searches/hmmer_160324/hmmer_vs_*"):
db_id = hmmer_folder_regex.search(folder).group(1)
for hmmer_filename in glob(folder+"/*.tsv"):
hmmer_hits.append( hmmer_parser(hmmer_filename,hmmer_tsv_header) )
hmmer_hits[-1]["cluster"] = cluster_id_regex.search(hmmer_filename).group(1)
hmmer_hits[-1]["tool"] = "hmmsearch"
hmmer_hits[-1]["db"] = db_id
#Add hits to metahit 2014 igc
for folder in glob("../1_run_db_searches/metahit_2014_160620/hmmer_vs_*"):
db_id = hmmer_folder_regex.search(folder).group(1)
for hmmer_filename in glob(folder+"/*.tsv"):
hmmer_hits.append( hmmer_parser(hmmer_filename,hmmer_tsv_header) )
hmmer_hits[-1]["cluster"] = cluster_id_regex.search(hmmer_filename).group(1)
hmmer_hits[-1]["tool"] = "hmmsearch"
hmmer_hits[-1]["db"] = db_id
all_hmmer_hits = pd.concat(hmmer_hits,axis=0)
print(all_hmmer_hits.shape)
all_hmmer_hits.head()
Out[3]:
In [4]:
#Convert all numeric columns to float for manipulation
for col in ["e_value","bitscore","bias"] + [x for x in all_hmmer_hits.columns if ("best_dmn" in x or "dne_" in x )] :
all_hmmer_hits[col] = pd.to_numeric(all_hmmer_hits[col])
In [5]:
filt_hits = all_hmmer_hits[ (all_hmmer_hits.e_value < 1e-3) & (all_hmmer_hits.best_dmn_e_value < 1e-3) ]
filt_hits.to_csv("1_out/filtered_hmmer_all_hits.csv",index=False)
print(filt_hits.shape)
filt_hits.head()
Out[5]:
In [6]:
gb = filt_hits.groupby(["cluster","db"])
reliable_fam_hits = pd.DataFrame( hits.ix[hits.bitscore.idxmax()] for _,hits in gb )[["cluster","db","tool","query_id","subject_id",
"bitscore","e_value","s_description","best_dmn_e_value"]]
sorted_fam_hits = pd.concat( hits.sort_values(by="bitscore",ascending=False) for _,hits in reliable_fam_hits.groupby("cluster") )
sorted_fam_hits.to_csv("1_out/filtered_hmmer_best_hits.csv",index=False)
print(sorted_fam_hits.shape)
sorted_fam_hits.head()
Out[6]:
In [ ]: