In [1]:
import pandas as pd
import re
from glob import glob
import itertools

In [2]:
def hmmer_parser(filename,header=None,domtbl=False):
    col_number = 19 if not domtbl else 23
    data_lines = []
    with open(filename) as fh:
        for line in fh:
            line = line.rstrip("\n")
            if line.startswith("#") or line == "":
                continue
            data_lines.append( re.split(r" +",line,maxsplit=col_number-1) ) 
    return pd.DataFrame.from_records( data_lines, columns=header)

In [3]:
hmmer_folder_regex = re.compile(r"hmmer_vs_([a-zA-Z0-9_]+)")
cluster_id_regex = re.compile(r"cluster([0-9]+[ab]?)_")
hmmer_tsv_header = ["subject_id","s_accession","query_id","q_accession" ,
                    "e_value","bitscore","bias", "best_dmn_e_value","best_dmn_bitscore","best_dmn_bias",
                    "dne_exp","dne_reg","dne_clu","dne_ov", "dne_env", 
                    "dne_dom", "dne_rep", "dne_inc", "s_description"]

hmmer_hits = []
for folder in glob("../1_run_db_searches/hmmer_160324/hmmer_vs_*"):
    db_id = hmmer_folder_regex.search(folder).group(1)
    for hmmer_filename in glob(folder+"/*.tsv"):
        hmmer_hits.append( hmmer_parser(hmmer_filename,hmmer_tsv_header) )
        hmmer_hits[-1]["cluster"] = cluster_id_regex.search(hmmer_filename).group(1)
        hmmer_hits[-1]["tool"] = "hmmsearch"
        hmmer_hits[-1]["db"] = db_id

#Add hits to metahit 2014 igc
for folder in glob("../1_run_db_searches/metahit_2014_160620/hmmer_vs_*"):
    db_id = hmmer_folder_regex.search(folder).group(1)
    for hmmer_filename in glob(folder+"/*.tsv"):
        hmmer_hits.append( hmmer_parser(hmmer_filename,hmmer_tsv_header) )
        hmmer_hits[-1]["cluster"] = cluster_id_regex.search(hmmer_filename).group(1)
        hmmer_hits[-1]["tool"] = "hmmsearch"
        hmmer_hits[-1]["db"] = db_id
        
        
all_hmmer_hits = pd.concat(hmmer_hits,axis=0)
print(all_hmmer_hits.shape)
all_hmmer_hits.head()


(3346, 22)
Out[3]:
subject_id s_accession query_id q_accession e_value bitscore bias best_dmn_e_value best_dmn_bitscore best_dmn_bias ... dne_clu dne_ov dne_env dne_dom dne_rep dne_inc s_description cluster tool db
0 gi|135090735|gb|EBF43246.1| - cluster1057 - 0.64 18.7 0.7 0.64 18.7 0.7 ... 1 2 4 4 4 0 hypothetical protein GOS_9598556 [marine metag... 1057 hmmsearch env_nr
0 gi|138678869|gb|ECB25512.1| - cluster1073 - 0.38 19.4 0.2 0.46 19.1 0.2 ... 0 0 1 1 1 0 hypothetical protein GOS_5095829, partial [mar... 1073 hmmsearch env_nr
0 gi|140329223|gb|ECL02226.1| - cluster1085 - 0.00088 27.6 2.0 0.0011 27.3 0.7 ... 0 0 2 2 1 1 hypothetical protein GOS_5022396, partial [mar... 1085 hmmsearch env_nr
1 gi|137512285|gb|EBU49357.1| - cluster1085 - 0.003 25.8 1.2 0.0059 24.9 0.3 ... 0 0 2 2 1 1 hypothetical protein GOS_7106176, partial [mar... 1085 hmmsearch env_nr
2 gi|140630778|gb|ECM44309.1| - cluster1085 - 0.0074 24.6 1.5 0.013 23.8 1.5 ... 0 0 1 1 1 1 hypothetical protein GOS_6399859, partial [mar... 1085 hmmsearch env_nr

5 rows × 22 columns


In [4]:
#Convert all numeric columns to float for manipulation
for col in ["e_value","bitscore","bias"] + [x for x in all_hmmer_hits.columns if ("best_dmn" in x or "dne_" in x )] : 
    all_hmmer_hits[col] = pd.to_numeric(all_hmmer_hits[col])

Filter hits


In [5]:
filt_hits = all_hmmer_hits[ (all_hmmer_hits.e_value < 1e-3) & (all_hmmer_hits.best_dmn_e_value < 1e-3) ]
filt_hits.to_csv("1_out/filtered_hmmer_all_hits.csv",index=False)
print(filt_hits.shape)
filt_hits.head()


(1882, 22)
Out[5]:
subject_id s_accession query_id q_accession e_value bitscore bias best_dmn_e_value best_dmn_bitscore best_dmn_bias ... dne_clu dne_ov dne_env dne_dom dne_rep dne_inc s_description cluster tool db
0 gi|816591059|gb|KKL91504.1| - cluster1217 - 8.600000e-15 63.2 0.0 1.900000e-14 62.1 0.0 ... 0 0 1 1 1 1 hypothetical protein LCGC14_1894030 [marine se... 1217 hmmsearch env_nr
1 gi|138620928|gb|ECA86303.1| - cluster1217 - 9.100000e-13 56.7 0.1 2.300000e-12 55.5 0.1 ... 0 0 1 1 1 1 hypothetical protein GOS_3182775, partial [mar... 1217 hmmsearch env_nr
2 gi|141380444|gb|ECR42494.1| - cluster1217 - 4.400000e-12 54.5 0.0 5.500000e-12 54.2 0.0 ... 0 0 1 1 1 1 hypothetical protein GOS_5229309, partial [mar... 1217 hmmsearch env_nr
3 gi|140145789|gb|ECJ82720.1| - cluster1217 - 6.200000e-11 50.8 0.0 8.400000e-11 50.4 0.0 ... 0 0 1 1 1 1 hypothetical protein GOS_6302003, partial [mar... 1217 hmmsearch env_nr
4 gi|139603673|gb|ECG20775.1| - cluster1217 - 2.400000e-09 45.8 0.0 4.400000e-09 44.9 0.0 ... 0 0 1 1 1 1 hypothetical protein GOS_3205514 [marine metag... 1217 hmmsearch env_nr

5 rows × 22 columns

Keep best hit per database for each cluster

Filtered by e-value < 1e-3 and best domain e-value < 1


In [6]:
gb = filt_hits.groupby(["cluster","db"])
reliable_fam_hits = pd.DataFrame( hits.ix[hits.bitscore.idxmax()] for _,hits in gb )[["cluster","db","tool","query_id","subject_id",
                                                                                   "bitscore","e_value","s_description","best_dmn_e_value"]]

sorted_fam_hits = pd.concat( hits.sort_values(by="bitscore",ascending=False) for _,hits in reliable_fam_hits.groupby("cluster") )
sorted_fam_hits.to_csv("1_out/filtered_hmmer_best_hits.csv",index=False)
print(sorted_fam_hits.shape)
sorted_fam_hits.head()


(51, 9)
Out[6]:
cluster db tool query_id subject_id bitscore e_value s_description best_dmn_e_value
0 1073 metahit_pep hmmsearch cluster1073 GL0080651_MH0011_[Complete]_[mRNA]_locus=scaff... 156.7 2.600000e-44 [translate_table: standard] 2.600000e-44
0 1073 metahit_2014_pep hmmsearch cluster1073 V1.CD6-0-PT_GL0032215 156.6 8.100000e-44 [gene] locus=scaffold127614_1:21715:22278:+ [... 8.100000e-44
0 1085 metahit_2014_pep hmmsearch cluster1085 160218816-stool1_revised_C882654_1_gene76494 31.7 6.600000e-05 strand:+ start:8853 stop:9821 length:969 star... 1.500000e-04
0 113b metahit_2014_pep hmmsearch cluster113b MH0023_GL0006969 135.0 2.800000e-37 [gene] locus=C761746_1:251:1819:- [Complete] ... 3.100000e-18
0 113b metahit_pep hmmsearch cluster113b GL0006538_MH0023_[Complete]_[mRNA]_locus=C1324... 127.3 2.400000e-35 [translate_table: standard] 2.900000e-17

In [ ]: