In [1]:
import pandas as pd
import re
from glob import glob
import requests
from bs4 import BeautifulSoup
import itertools

In [2]:
blast_folder_regex = re.compile(r"(blast[np])_vs_([a-zA-Z0-9_]+)")
cluster_id_regex = re.compile(r"cluster([0-9]+[ab]?)_")
blast_cols = ["query_id","subject_id","pct_id","ali_len","mism",
              "gap_open","q_start","q_end","s_start","s_end",
              "e_value","bitscore","q_len","s_len","s_gi",
             "s_taxids","s_scinames","s_names","q_cov","s_description"
             ]

blast_hits = []
for folder in glob("../1_run_db_searches/blast_160427/blast?_vs_*"):
    tool_id,db_id = blast_folder_regex.search(folder).groups()
    for blast_filename in glob(folder+"/*.tsv"):
        blast_hits.append( pd.read_csv(blast_filename,sep="\t", header=None, names=blast_cols) )
        blast_hits[-1]["cluster"] = cluster_id_regex.search(blast_filename).group(1)
        blast_hits[-1]["tool"] = tool_id
        blast_hits[-1]["db"] = db_id
        
# Load new metahit 2014 IGC catalogue hits        
for folder in glob("../1_run_db_searches/metahit_2014_160620/blast?_vs_*"):
    tool_id,db_id = blast_folder_regex.search(folder).groups()
    for blast_filename in glob(folder+"/*.tsv"):
        blast_hits.append( pd.read_csv(blast_filename,sep="\t", header=None, names=blast_cols) )
        blast_hits[-1]["cluster"] = cluster_id_regex.search(blast_filename).group(1)
        blast_hits[-1]["tool"] = tool_id
        blast_hits[-1]["db"] = db_id

Combine all blast hits into a single dataframe


In [3]:
all_blast_hits = blast_hits[0]
for search_hits in blast_hits[1:]:
    all_blast_hits = all_blast_hits.append(search_hits)
all_blast_hits.head()


Out[3]:
query_id subject_id pct_id ali_len mism gap_open q_start q_end s_start s_end ... s_len s_gi s_taxids s_scinames s_names q_cov s_description cluster tool db
0 GB3LKKR01DOS1W gi|935719420|emb|CEAZ01012945.1| 86.4 228.0 27.0 4.0 46.0 271.0 20508.0 20283.0 ... 144341.0 935719420.0 749906 NaN NaN 83.0 gut metagenome genome assembly P6C7-k21-2014-0... 1073 blastn env_nt
1 GB3LKKR01DOS1W gi|935719420|emb|CEAZ01012945.1| 86.4 228.0 27.0 4.0 46.0 271.0 107376.0 107151.0 ... 144341.0 935719420.0 749906 NaN NaN 83.0 gut metagenome genome assembly P6C7-k21-2014-0... 1073 blastn env_nt
2 GB3LKKR01DOS1W gi|935454047|emb|CEAX01018485.1| 86.4 228.0 27.0 4.0 46.0 271.0 12216.0 12441.0 ... 28297.0 935454047.0 749906 NaN NaN 83.0 gut metagenome genome assembly P6C90-k21-2014-... 1073 blastn env_nt
3 GB3LKKR01DOS1W gi|935344953|emb|CEBY01021626.1| 86.4 228.0 27.0 4.0 46.0 271.0 74375.0 74150.0 ... 86997.0 935344953.0 749906 NaN NaN 83.0 gut metagenome genome assembly P6C0-k21-2014-0... 1073 blastn env_nt
4 GB3LKKR01DOS1W gi|935324036|emb|CEBY01034087.1| 86.4 228.0 27.0 4.0 46.0 271.0 12333.0 12558.0 ... 42116.0 935324036.0 749906 NaN NaN 83.0 gut metagenome genome assembly P6C0-k21-2014-0... 1073 blastn env_nt

5 rows × 23 columns


In [4]:
all_blast_hits.db.unique()


Out[4]:
array(['env_nt', 'metahit_cds', 'env_nr', 'hmp_pep', 'metahit_pep', 'nr',
       'metahit_2014_cds', 'metahit_2014_pep'], dtype=object)

Extract the best hits for each cluster from each DB (q_cov > 80 and e_value < 1e-3 )


In [5]:
#all_blast_hits[all_blast_hits.e_value < 0.001].groupby(["cluster","db"])
gb = all_blast_hits[ (all_blast_hits.q_cov > 80) & (all_blast_hits.e_value < 0.001) ].groupby(["cluster","db"])
reliable_fam_hits = pd.DataFrame( hits.ix[hits.bitscore.idxmax()] for _,hits in gb )[["cluster","db","tool","query_id","subject_id","pct_id","q_cov","q_len",
                                                                                   "bitscore","e_value","s_description"]]

sorted_fam_hits = pd.concat( hits.sort_values(by="bitscore",ascending=False) for _,hits in reliable_fam_hits.groupby("cluster") )
sorted_fam_hits.to_csv("1_out/filtered_blast_best_hits.csv",index=False)
sorted_fam_hits.head()


Out[5]:
cluster db tool query_id subject_id pct_id q_cov q_len bitscore e_value s_description
34 1073 metahit_2014_cds blastn GB3LKKR01A150I N074A_GL0017322 98.560 100.0 276.0 488.0 1.000000e-135 N074A_GL0017322 [gene] locus=scaffold30187_1...
33 1073 env_nt blastn GB3LKKR01A150I gi|936108378|emb|CEAB01076172.1| 98.180 99.0 276.0 479.0 1.000000e-131 gut metagenome genome assembly P2E0-k21-2014-0...
3 1073 metahit_cds blastn contig07331 GL0080651_MH0011_[Complete]_[mRNA]_locus=scaff... 95.971 96.0 285.0 444.0 1.010000e-122 GL0080651_MH0011_[Complete]_[mRNA]_locus=scaff...
30 1073 metahit_pep blastp contig07331 GL0080651_MH0011_[Complete]_[mRNA]_locus=scaff... 97.727 96.0 92.0 159.0 3.240000e-48 GL0080651_MH0011_[Complete]_[mRNA]_locus=scaff...
10 1073 metahit_2014_pep blastp GB3LKKR01CLC55 V1.CD14-0_GL0044222 100.000 84.0 82.0 133.0 8.000000e-38 V1.CD14-0_GL0044222 [gene] locus=scaffold182...

In [6]:
#Export all "valid" hits for each cluster
all_blast_hits[ (all_blast_hits.q_cov > 80) & (all_blast_hits.e_value < 0.001) ].to_csv("1_out/filtered_blast_all_hits.csv",index=False)

In [ ]: