In [1]:
import pandas as pd
import re
from glob import glob
In [4]:
# Read the ORF list, one identifier per line.
# Empty strings are filtered out instead of unconditionally dropping the last
# element, so the final entry is kept even when the file does not end with a
# newline (the old `[:-1]` slice discarded it in that case).
with open("3_filtered_orfs/filt_orf_list.txt") as fh:
    filtered_orfs = [orf_id for orf_id in fh.read().split("\n") if orf_id]
filtered_orfs
Out[4]:
In [5]:
# Column names for the per-sequence table (19 columns).
# NOTE(review): presumably mirrors the HMMER --tblout layout; confirm against
# the HMMER user guide.
tbl_header = [
    # subject / query identifiers
    "s_id", "s_accession", "q_id", "q_accession",
    # full-sequence scores
    "e_value", "bitscore", "bias",
    # best single-domain scores
    "best_dmn_e_value", "best_dmn_score", "best_dmn_bias",
    # domain number estimation columns
    "dne_exp", "dne_reg", "dne_clu", "dne_ov", "dne_env",
    "dne_dom", "dne_rep", "dne_inc",
    # free text; always the last column
    "description",
]

# Column names for the per-domain table (23 columns).
# NOTE(review): presumably mirrors the HMMER --domtblout layout; confirm.
domtbl_header = [
    # subject / query identifiers and lengths
    "s_id", "s_accession", "s_len", "q_id", "q_accession", "q_len",
    # full-sequence scores
    "e_value", "bitscore", "bias",
    # per-domain index and scores
    "dmn_number", "dmn_total",
    "dmn_c_evalue", "dmn_i_evalue", "dmn_score", "dmn_bias",
    # coordinate pairs
    "s_start", "s_end",
    "ali_start", "ali_end",
    "env_start", "env_end",
    "acc",
    # free text; always the last column
    "description",
]
In [6]:
def hmmer_parser(filename, header=None, domtbl=False):
    """Load a space-delimited HMMER-style table into a DataFrame of strings.

    Lines that start with ``#`` and empty lines are ignored. Every remaining
    line is split on runs of spaces into at most 19 fields (23 when ``domtbl``
    is true), so any spaces inside the last field are preserved.

    Parameters
    ----------
    filename : str
        Path of the table file to read.
    header : list of str, optional
        Column names for the resulting frame; passed to pandas as ``columns``.
    domtbl : bool, default False
        Use the 23-column layout instead of the 19-column one.

    Returns
    -------
    pandas.DataFrame
        One row per data line; all values are left as strings.
    """
    n_columns = 23 if domtbl else 19
    field_sep = re.compile(r" +")

    with open(filename) as handle:
        stripped_rows = (raw.rstrip("\n") for raw in handle)
        records = [
            # n_columns - 1 splits -> at most n_columns fields per row
            field_sep.split(row, maxsplit=n_columns - 1)
            for row in stripped_rows
            if row != "" and not row.startswith("#")
        ]

    return pd.DataFrame.from_records(records, columns=header)
In [7]:
# Parse the per-domain table of each database, tag every row with the database
# it came from, then stack both frames.
pfam_dom = hmmer_parser(
    "2_hmmscan/d9539_asm_v1.2_orf_hmmscan_PfamA.domtbl",
    header=domtbl_header,
    domtbl=True,
).assign(db="pfam")
vfam_dom = hmmer_parser(
    "2_hmmscan/d9539_asm_v1.2_orf_hmmscan_vFamA.domtbl",
    header=domtbl_header,
    domtbl=True,
).assign(db="vfam")
all_dom = pd.concat([pfam_dom, vfam_dom])
In [8]:
# Parse the per-sequence table of each database, tag every row with the
# database it came from, then stack both frames.
pfam_tbl = hmmer_parser(
    "2_hmmscan/d9539_asm_v1.2_orf_hmmscan_PfamA.tbl",
    header=tbl_header,
).assign(db="pfam")
vfam_tbl = hmmer_parser(
    "2_hmmscan/d9539_asm_v1.2_orf_hmmscan_vFamA.tbl",
    header=tbl_header,
).assign(db="vfam")
all_tbl = pd.concat([pfam_tbl, vfam_tbl])
In [9]:
# Make columns numeric: the parser leaves every field as a string.

# Per-domain table: fixed score columns plus every column whose name mentions
# a domain value ("dmn_") or a coordinate ("start" / "end").
dom_numeric_cols = ["e_value", "bitscore", "bias", "acc"]
dom_numeric_cols += [
    name
    for name in all_dom.columns
    if any(tag in name for tag in ("dmn_", "start", "end"))
]
for col in dom_numeric_cols:
    all_dom[col] = pd.to_numeric(all_dom[col])

# Per-sequence table: fixed score columns plus the "best_dmn*" and "dne_*" ones.
tbl_numeric_cols = ["e_value", "bitscore", "bias"]
tbl_numeric_cols += [
    name
    for name in all_tbl.columns
    if any(tag in name for tag in ("best_dmn", "dne_"))
]
for col in tbl_numeric_cols:
    all_tbl[col] = pd.to_numeric(all_tbl[col])
In [10]:
# Keep only the rows whose q_id appears in filtered_orfs.
# Series.isin builds the boolean mask in one vectorised call instead of
# scanning the Python list once per row through apply/lambda.
filt_tbl = all_tbl[all_tbl["q_id"].isin(filtered_orfs)]
filt_dom = all_dom[all_dom["q_id"].isin(filtered_orfs)]
In [11]:
# Display the filtered per-domain hits as this cell's output (sanity check).
filt_dom
Out[11]:
In [12]:
# Write both filtered tables to CSV; the DataFrame index is not written.
for frame, csv_path in (
    (filt_tbl, "3_filtered_orfs/d9539_asm_v1.2_orf_filt_hmmsearch_tbl.csv"),
    (filt_dom, "3_filtered_orfs/d9539_asm_v1.2_orf_filt_hmmsearch_dom.csv"),
):
    frame.to_csv(csv_path, index=False)
In [ ]: