In [1]:
import pandas as pd
import re
from glob import glob
import requests
from bs4 import BeautifulSoup
import itertools
In [2]:
blast_folder_regex = re.compile(r"(blast[np])_vs_([a-zA-Z0-9_]+)")
cluster_id_regex = re.compile(r"cluster([0-9]+[ab]?)_")
blast_cols = ["query_id","subject_id","pct_id","ali_len","mism",
"gap_open","q_start","q_end","s_start","s_end",
"e_value","bitscore","q_len","s_len","s_gi",
"s_taxids","s_scinames","s_names","q_cov","s_description"
]
blast_hits = []
for folder in glob("../1_run_db_searches/blast_160427/blast?_vs_*"):
tool_id,db_id = blast_folder_regex.search(folder).groups()
for blast_filename in glob(folder+"/*.tsv"):
blast_hits.append( pd.read_csv(blast_filename,sep="\t", header=None, names=blast_cols) )
blast_hits[-1]["cluster"] = cluster_id_regex.search(blast_filename).group(1)
blast_hits[-1]["tool"] = tool_id
blast_hits[-1]["db"] = db_id
# Load new metahit 2014 IGC catalogue hits
for folder in glob("../1_run_db_searches/metahit_2014_160620/blast?_vs_*"):
tool_id,db_id = blast_folder_regex.search(folder).groups()
for blast_filename in glob(folder+"/*.tsv"):
blast_hits.append( pd.read_csv(blast_filename,sep="\t", header=None, names=blast_cols) )
blast_hits[-1]["cluster"] = cluster_id_regex.search(blast_filename).group(1)
blast_hits[-1]["tool"] = tool_id
blast_hits[-1]["db"] = db_id
In [3]:
all_blast_hits = blast_hits[0]
for search_hits in blast_hits[1:]:
all_blast_hits = all_blast_hits.append(search_hits)
all_blast_hits.head()
Out[3]:
In [4]:
all_blast_hits.db.unique()
Out[4]:
In [5]:
#all_blast_hits[all_blast_hits.e_value < 0.001].groupby(["cluster","db"])
gb = all_blast_hits[ (all_blast_hits.q_cov > 80) & (all_blast_hits.e_value < 0.001) ].groupby(["cluster","db"])
reliable_fam_hits = pd.DataFrame( hits.ix[hits.bitscore.idxmax()] for _,hits in gb )[["cluster","db","tool","query_id","subject_id","pct_id","q_cov","q_len",
"bitscore","e_value","s_description"]]
sorted_fam_hits = pd.concat( hits.sort_values(by="bitscore",ascending=False) for _,hits in reliable_fam_hits.groupby("cluster") )
sorted_fam_hits.to_csv("1_out/filtered_blast_best_hits.csv",index=False)
sorted_fam_hits.head()
Out[5]:
In [6]:
#Export all "valid" hits for each cluster
all_blast_hits[ (all_blast_hits.q_cov > 80) & (all_blast_hits.e_value < 0.001) ].to_csv("1_out/filtered_blast_all_hits.csv",index=False)
In [ ]: