In [1]:
# Show the pipeline's working directory: numbered makefiles/scripts and
# their output directories (1_orf, 2_blast, 2_hmmscan, ...).
!ls


1_find_orfs.mak                         2_run_blast.sh
1_orf                                   3_aggregate_blast_hits.ipynb
2_blast                                 3_filtered_orfs
2_blast.mak                             4_select_reliable_orfs.ipynb
2_hmmscan                               5_extract_reliable_orf_hmmer_hits.ipynb
2_hmmscan.mak                           log

In [2]:
import pandas as pd
import re
from glob import glob
import requests
from bs4 import BeautifulSoup

1. Aggregate all BLAST results


In [5]:
# Parse "<tool>_vs_<db>.tsv" filenames into (tool, database).
# NOTE: the '.' before "tsv" is escaped — unescaped it matches ANY character,
# so e.g. "blastp_vs_envXtsv" would have been accepted.
blast_file_regex = re.compile(r"(blast[np])_vs_([a-zA-Z0-9_]+)\.tsv")

# Column names for the custom BLAST tabular output (outfmt fields listed below).
blast_cols = ["query_id","subject_id","pct_id","ali_len","mism",
              "gap_open","q_start","q_end","s_start","s_end",
              "e_value","bitscore","q_len","s_len","s_gi",
             "s_taxids","s_scinames","s_names","q_cov","s_description"
             ]

#blast_cols = "qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen sgi staxids sscinames scomnames qcovs stitle"

In [6]:
# Load every per-database BLAST result table, tagging each with the tool
# (blastp/blastn) and database parsed from its filename.
blast_hits = []
for blast_filename in glob("2_blast/*.tsv"):
    match = blast_file_regex.search(blast_filename)
    if match is None:
        # Unexpected filename (search() would return None and .groups()
        # would raise AttributeError) — skip rather than crash.
        continue
    tool_id, db_id = match.groups()
    hits = pd.read_csv(blast_filename, sep="\t", header=None, names=blast_cols)
    hits["tool"] = tool_id
    hits["db"] = db_id
    blast_hits.append(hits)

In [7]:
# Stack all per-file hit tables into one frame. DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0, and appending in a loop
# copies quadratically — pd.concat does it in one pass.
all_blast_hits = pd.concat(blast_hits)
print(all_blast_hits.shape)


(431, 22)

In [8]:
# Spot-check the first few aggregated hits (rich display via last expression)
all_blast_hits.head()


Out[8]:
query_id subject_id pct_id ali_len mism gap_open q_start q_end s_start s_end ... q_len s_len s_gi s_taxids s_scinames s_names q_cov s_description tool db
0 d9539_asm_v1.2_4 gi|402685296|gb|EJX08051.1| 36.111 36 23 0 16 51 147 182 ... 52 416 402685296 749906 NaN NaN 69 glycosyl transferase group 1 [gut metagenome] blastp env_nr
1 d9539_asm_v1.2_5 gi|140773177|gb|ECN40753.1| 30.508 59 32 2 1 51 124 181 ... 58 206 140773177 408172 NaN NaN 88 hypothetical protein GOS_6058719, partial [mar... blastp env_nr
2 d9539_asm_v1.2_5 gi|139039289|gb|ECC98867.1| 34.884 43 20 1 1 35 129 171 ... 58 174 139039289 408172 NaN NaN 60 hypothetical protein GOS_5221334, partial [mar... blastp env_nr
3 d9539_asm_v1.2_5 gi|135943891|gb|EBK87411.1| 28.814 59 33 2 1 51 124 181 ... 58 193 135943891 408172 NaN NaN 88 hypothetical protein GOS_8661086, partial [mar... blastp env_nr
4 d9539_asm_v1.2_5 gi|143562029|gb|EDF64666.1| 28.814 59 33 2 1 51 90 147 ... 58 216 143562029 408172 NaN NaN 88 hypothetical protein GOS_918582, partial [mari... blastp env_nr

5 rows × 22 columns

2. Write annotated results out


In [10]:
# Write all hits grouped by ORF with the strongest hits first.
# ascending is given per-column: queries alphabetical, bitscore descending —
# a bare ascending=False would also reverse the query_id order.
all_blast_hits.sort_values(
    by=["query_id", "bitscore"], ascending=[True, False]
).to_csv("2_blastp_hits.csv", index=False)

Old steps, not used anymore

Annotate BLAST results with taxonomy and hit-name information retrieved from NCBI via the E-utilities API

# Legacy step: look up each hit's GI number with NCBI esummary to recover
# description / taxonomy / length. Superseded by the s_* columns that the
# BLAST output format now provides directly.
subj_id_regex = re.compile(r"^gi\|([0-9]+)\|")


def check_gi_in_ncbi(row):
    """Fetch esummary metadata for one BLAST hit's GI number.

    Parameters: row — a hit row with "subject_id" (gi|NNN|... form) and
    "tool" (blastn hits are looked up in nuccore, all others in protein).
    Returns: pd.Series of [gi, description, tax_id, subj_len]; entries stay
    None when the subject id carries no GI prefix or the lookup fails.
    """
    description = None
    tax_id = None
    subj_len = None
    gi = None
    gi_match = subj_id_regex.match(row["subject_id"])
    if gi_match:
        # blastn hits come from a nucleotide DB; everything else is protein
        ncbi_db = "nuccore" if row["tool"] == "blastn" else "protein"
        gi = gi_match.group(1)
        # NCBI E-utilities require HTTPS (plain http was retired in 2016)
        r = requests.get(
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db={}&id={}".format(ncbi_db, gi)
        )
        if r.status_code == 200:
            print(gi)  # progress marker for this slow, per-row lookup
            xml_doc = BeautifulSoup(r.content, "xml")
            if not xml_doc.find("Item", Name="Title"):
                # Unexpected payload (e.g. error document) — dump it for debugging
                print(r.content)
            else:
                description = xml_doc.find("Item", Name="Title").text
                tax_id = xml_doc.find("Item", Name="TaxId").text
                subj_len = xml_doc.find("Item", Name="Length").text
    return pd.Series([gi, description, tax_id, subj_len])


# Run annotation (one HTTP request per hit — slow; kept for provenance only)
all_blast_hits[["gi", "description", "tax_id", "s_len"]] = all_blast_hits.apply(check_gi_in_ncbi, axis=1)