In [1]:
import pandas as pd
import re
from glob import glob
import itertools

In [2]:
def hmmer_parser(filename,header=None):
    data_lines = []
    with open(filename) as fh:
        for line in fh:
            line = line.rstrip("\n")
            if line.startswith("#") or line == "":
                continue
            data_lines.append( re.split(r" +",line,maxsplit=18) ) 
    return pd.DataFrame.from_records( data_lines, columns=header)

In [3]:
hmmer_folder_regex = re.compile(r"hmmer_vs_([a-zA-Z_]+)")
cluster_id_regex = re.compile(r"cluster([0-9]+[ab]?)_")
hmmer_tsv_header = ["subject_id","s_accession","query_id","q_accession" ,
                    "e_value","bitscore","bias", "best_dmn_e_value","best_dmn_bitscore","best_dmn_bias",
                    "dne_exp","dne_reg","dne_clu","dne_ov", "dne_env", 
                    "dne_dom", "dne_rep", "dne_inc", "s_description"]

hmmer_hits = []
for folder in glob("hmmer_160324/hmmer_vs_*"):
    db_id = hmmer_folder_regex.search(folder).group(1)
    for hmmer_filename in glob(folder+"/*.tsv"):
        hmmer_hits.append( hmmer_parser(hmmer_filename,hmmer_tsv_header) )
        hmmer_hits[-1]["cluster"] = cluster_id_regex.search(hmmer_filename).group(1)
        hmmer_hits[-1]["tool"] = "hmmsearch"
        hmmer_hits[-1]["db"] = db_id
        
all_hmmer_hits = pd.concat(hmmer_hits,axis=0)
print(all_hmmer_hits.shape)
all_hmmer_hits.head()


(1255, 22)
Out[3]:
subject_id s_accession query_id q_accession e_value bitscore bias best_dmn_e_value best_dmn_bitscore best_dmn_bias ... dne_clu dne_ov dne_env dne_dom dne_rep dne_inc s_description cluster tool db
0 gi|135090735|gb|EBF43246.1| - cluster1057 - 0.64 18.7 0.7 0.64 18.7 0.7 ... 1 2 4 4 4 0 hypothetical protein GOS_9598556 [marine metag... 1057 hmmsearch env_nr
0 gi|138678869|gb|ECB25512.1| - cluster1073 - 0.38 19.4 0.2 0.46 19.1 0.2 ... 0 0 1 1 1 0 hypothetical protein GOS_5095829, partial [mar... 1073 hmmsearch env_nr
0 gi|140329223|gb|ECL02226.1| - cluster1085 - 0.00088 27.6 2.0 0.0011 27.3 0.7 ... 0 0 2 2 1 1 hypothetical protein GOS_5022396, partial [mar... 1085 hmmsearch env_nr
1 gi|137512285|gb|EBU49357.1| - cluster1085 - 0.003 25.8 1.2 0.0059 24.9 0.3 ... 0 0 2 2 1 1 hypothetical protein GOS_7106176, partial [mar... 1085 hmmsearch env_nr
2 gi|140630778|gb|ECM44309.1| - cluster1085 - 0.0074 24.6 1.5 0.013 23.8 1.5 ... 0 0 1 1 1 1 hypothetical protein GOS_6399859, partial [mar... 1085 hmmsearch env_nr

5 rows × 22 columns


In [4]:
#Convert all numeric columns to float for manipulation
for col in ["e_value","bitscore","bias"] + [x for x in all_hmmer_hits.columns if ("best_dmn" in x or "dne_" in x )] : 
    all_hmmer_hits[col] = pd.to_numeric(all_hmmer_hits[col])

Keep best hit per database for each cluster

Filtered by e-value < 1e-3 and best domain e-value < 1


In [5]:
gb = all_hmmer_hits[ (all_hmmer_hits.e_value < 1e-3) & (all_hmmer_hits.best_dmn_e_value < 1) ].groupby(["cluster","db"])
reliable_fam_hits = pd.DataFrame( hits.ix[hits.bitscore.idxmax()] for _,hits in gb )[["cluster","db","tool","query_id","subject_id",
                                                                                   "bitscore","e_value","s_description","best_dmn_e_value"]]

sorted_fam_hits = pd.concat( hits.sort_values(by="bitscore",ascending=False) for _,hits in reliable_fam_hits.groupby("cluster") )
sorted_fam_hits.head()


Out[5]:
cluster db tool query_id subject_id bitscore e_value s_description best_dmn_e_value
0 1073 metahit_pep hmmsearch cluster1073 GL0080651_MH0011_[Complete]_[mRNA]_locus=scaff... 156.7 2.600000e-44 [translate_table: standard] 2.600000e-44
0 1085 env_nr hmmsearch cluster1085 gi|140329223|gb|ECL02226.1| 27.6 8.800000e-04 hypothetical protein GOS_5022396, partial [mar... 1.100000e-03
0 1085 hmp_pep hmmsearch cluster1085 HMPREF9973_11022 27.3 9.900000e-04 phage head morphogenesis protein, SPP1 gp7 fam... 1.800000e-03
0 113b metahit_pep hmmsearch cluster113b GL0006538_MH0023_[Complete]_[mRNA]_locus=C1324... 127.3 2.400000e-35 [translate_table: standard] 2.900000e-17
0 1217 metahit_pep hmmsearch cluster1217 GL0006060_O2_[Complete]_[mRNA]_locus=scaffold4... 174.9 5.600000e-50 [translate_table: standard] 8.000000e-50

In [6]:
sorted_fam_hits.to_csv("filtered_hmmer_hits.csv",index=False)