In [1]:
import pandas as pd
import re
from glob import glob

1. Load filtered ORF list


In [4]:
# Read one ORF identifier per line. splitlines() is used instead of
# read().split("\n")[:-1]: the latter silently DROPS the final ORF whenever
# the file does not end with a newline. Blank lines are skipped either way.
with open("3_filtered_orfs/filt_orf_list.txt") as fh:
    filtered_orfs = [line for line in fh.read().splitlines() if line]
filtered_orfs


Out[4]:
['d9539_asm_v1.2_1',
 'd9539_asm_v1.2_3',
 'd9539_asm_v1.2_5',
 'd9539_asm_v1.2_11',
 'd9539_asm_v1.2_14',
 'd9539_asm_v1.2_16',
 'd9539_asm_v1.2_22',
 'd9539_asm_v1.2_29',
 'd9539_asm_v1.2_30']

2. Parse HMMER output


In [5]:
# Column names for HMMER3 --tblout (19 cols) and --domtblout (23 cols) files.
tbl_header = (
    "s_id s_accession q_id q_accession "
    "e_value bitscore bias best_dmn_e_value best_dmn_score best_dmn_bias "
    "dne_exp dne_reg dne_clu dne_ov dne_env "
    "dne_dom dne_rep dne_inc description"
).split()

domtbl_header = (
    "s_id s_accession s_len q_id q_accession "
    "q_len e_value bitscore bias dmn_number "
    "dmn_total dmn_c_evalue dmn_i_evalue dmn_score dmn_bias "
    "s_start s_end ali_start ali_end env_start env_end "
    "acc description"
).split()

In [6]:
def hmmer_parser(filename, header=None, domtbl=False):
    """Parse a HMMER3 ``--tblout`` / ``--domtblout`` file into a DataFrame.

    HMMER tabular output is space-delimited, but the last column is a
    free-text description, so each data line is split at most
    ``n_columns - 1`` times to keep the description in one piece.

    Parameters
    ----------
    filename : str
        Path to the HMMER tabular output file.
    header : list of str, optional
        Column names for the resulting DataFrame (passed to pandas).
    domtbl : bool, default False
        True for ``--domtblout`` files (23 columns), False for
        ``--tblout`` files (19 columns).

    Returns
    -------
    pandas.DataFrame
        One row per non-comment, non-blank line; all values are strings.
    """
    col_number = 23 if domtbl else 19
    data_lines = []
    with open(filename) as fh:
        for line in fh:
            line = line.rstrip("\n")
            # Skip the '#' comment/header lines and blank lines that
            # hmmscan/hmmsearch emit around the data.
            if line.startswith("#") or line == "":
                continue
            # maxsplit passed as keyword: the positional form of
            # re.split's third argument is deprecated (Python 3.13+).
            data_lines.append(re.split(r" +", line, maxsplit=col_number - 1))
    return pd.DataFrame.from_records(data_lines, columns=header)

In [7]:
# Parse the per-domain (--domtblout) hits for Pfam-A and vFam-A,
# tagging each table with its source database before stacking them.
pfam_dom = hmmer_parser(
    "2_hmmscan/d9539_asm_v1.2_orf_hmmscan_PfamA.domtbl",
    header=domtbl_header, domtbl=True,
).assign(db="pfam")
vfam_dom = hmmer_parser(
    "2_hmmscan/d9539_asm_v1.2_orf_hmmscan_vFamA.domtbl",
    header=domtbl_header, domtbl=True,
).assign(db="vfam")
all_dom = pd.concat([pfam_dom, vfam_dom])

In [8]:
# Parse the per-sequence (--tblout) hits for Pfam-A and vFam-A,
# tagging each table with its source database before stacking them.
pfam_tbl = hmmer_parser(
    "2_hmmscan/d9539_asm_v1.2_orf_hmmscan_PfamA.tbl",
    header=tbl_header,
).assign(db="pfam")
vfam_tbl = hmmer_parser(
    "2_hmmscan/d9539_asm_v1.2_orf_hmmscan_vFamA.tbl",
    header=tbl_header,
).assign(db="vfam")
all_tbl = pd.concat([pfam_tbl, vfam_tbl])

In [9]:
#Make columns numeric
for col in ["e_value","bitscore","bias","acc"] + [x for x in all_dom.columns if ("dmn_" in x or "start" in x or "end" in x )] : 
    all_dom[col] = pd.to_numeric(all_dom[col])
    
for col in ["e_value","bitscore","bias"] + [x for x in all_tbl.columns if ("best_dmn" in x or "dne_" in x )] : 
    all_tbl[col] = pd.to_numeric(all_tbl[col])

3. Keep hits only for filtered ORFs


In [10]:
# Keep only hits whose query ORF survived the earlier filtering step.
# Series.isin is the idiomatic (hash-based) replacement for
# .apply(lambda x: x in list), which scans the list once per row.
filt_tbl = all_tbl[all_tbl["q_id"].isin(filtered_orfs)]
filt_dom = all_dom[all_dom["q_id"].isin(filtered_orfs)]

In [11]:
filt_dom


Out[11]:
s_id s_accession s_len q_id q_accession q_len e_value bitscore bias dmn_number ... dmn_bias s_start s_end ali_start ali_end env_start env_end acc description db
8 Phage_F PF02305.14 510 d9539_asm_v1.2_29 - 647 3.400000e-08 32.6 0.1 1 ... 0.1 25 114 29 117 13 141 0.83 Capsid protein (F protein) pfam
9 Phage_F PF02305.14 510 d9539_asm_v1.2_29 - 647 3.400000e-08 32.6 0.1 2 ... 0.0 255 441 375 564 324 632 0.82 Capsid protein (F protein) pfam
10 Big_4 PF07532.8 59 d9539_asm_v1.2_29 - 647 1.300000e+00 8.7 4.4 1 ... 0.5 36 52 188 205 187 206 0.92 Bacterial Ig-like domain (group 4) pfam
11 Big_4 PF07532.8 59 d9539_asm_v1.2_29 - 647 1.300000e+00 8.7 4.4 2 ... 0.2 31 56 319 344 309 346 0.85 Bacterial Ig-like domain (group 4) pfam
6 vFam_4031 - 188 d9539_asm_v1.2_14 - 174 1.500000e-02 13.2 0.2 1 ... 0.4 124 177 19 68 9 76 0.72 - vfam
7 vFam_4031 - 188 d9539_asm_v1.2_14 - 174 1.500000e-02 13.2 0.2 2 ... 0.0 95 130 75 110 60 117 0.81 - vfam

6 rows × 24 columns


In [12]:
# Persist the filtered hit tables for downstream steps.
# NOTE(review): these hits came from hmmscan, but the output names say
# "hmmsearch" — confirm with downstream consumers before renaming the files.
# (Removed a block of dead commented-out filtering code that relied on the
# long-removed DataFrame.ix indexer.)
filt_tbl.to_csv("3_filtered_orfs/d9539_asm_v1.2_orf_filt_hmmsearch_tbl.csv", index=False)
filt_dom.to_csv("3_filtered_orfs/d9539_asm_v1.2_orf_filt_hmmsearch_dom.csv", index=False)

In [ ]: