In [1]:
# Inspect the working directory contents — presumably to confirm the
# blastn/ folder of BLAST .tsv outputs is present (see the glob below).
!ls
In [2]:
import pandas as pd
import re
from glob import glob
import requests
from bs4 import BeautifulSoup
In [3]:
# Parse BLAST tabular (outfmt 6) files named "<tool>_vs_<db>.tsv"
# (e.g. "blastn_vs_nt.tsv"), tagging each table with its tool and database.
# The "." before "tsv" is escaped so it matches a literal dot only.
blast_file_regex = re.compile(r"(blast[np])_vs_([a-zA-Z_]+)\.tsv")
# Standard BLAST outfmt-6 column order.
blast_cols = ["query_id","subject_id","pct_id","ali_len","mism",
"gap_open","q_start","q_end","s_start","s_end","e_value","bitscore"]
blast_hits = []
for blast_filename in glob("blastn/*.tsv"):
    match = blast_file_regex.search(blast_filename)
    if match is None:
        # Skip files that don't follow the naming scheme instead of
        # crashing with AttributeError on None.groups().
        continue
    tool_id, db_id = match.groups()
    hits = pd.read_csv(blast_filename, sep="\t", header=None, names=blast_cols)
    hits["tool"] = tool_id
    hits["db"] = db_id
    blast_hits.append(hits)
In [4]:
# Combine the per-file hit tables into one DataFrame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# appending in a loop copies quadratically; pd.concat does it in one pass.
# ignore_index=True gives a clean 0..n-1 index instead of duplicated
# per-file labels, which would otherwise break the label-aligned column
# assignment performed later by the annotation step.
all_blast_hits = pd.concat(blast_hits, ignore_index=True)
all_blast_hits.head()
Out[4]:
In [5]:
# Subject ids from NCBI databases look like "gi|12345|..."; capture the GI.
subj_id_regex = re.compile(r"^gi\|([0-9]+)\|")

def check_gi_in_ncbi(row):
    """Annotate one BLAST hit via the NCBI EUtils esummary service.

    Parameters
    ----------
    row : pd.Series
        A hit row with at least "subject_id" (e.g. "gi|12345|...") and
        "tool" ("blastn" -> nuccore, anything else -> protein).

    Returns
    -------
    pd.Series
        [gi, description, tax_id, subj_len] as strings; all None when the
        subject id carries no GI or the lookup yields no usable record.
    """
    description = None
    tax_id = None
    subj_len = None
    gi = None
    gi_match = subj_id_regex.match(row["subject_id"])
    if gi_match:
        # Nucleotide hits live in "nuccore", protein hits in "protein".
        ncbi_db = "nuccore" if row["tool"] == "blastn" else "protein"
        gi = gi_match.group(1)
        # NCBI EUtils has been HTTPS-only since late 2016; plain http only
        # works via redirect. The timeout keeps one stalled request from
        # hanging the whole annotation pass.
        r = requests.get(
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db={}&id={}".format(ncbi_db, gi),
            timeout=30,
        )
        if r.status_code == 200:
            print(gi)  # progress indicator, one line per successful lookup
            xml_doc = BeautifulSoup(r.content, "xml")
            title_item = xml_doc.find("Item", Name="Title")
            if not title_item:
                # Unexpected payload (e.g. an EUtils error document):
                # dump it for debugging rather than failing silently.
                print(r.content)
            else:
                description = title_item.text
                tax_id = xml_doc.find("Item", Name="TaxId").text
                subj_len = xml_doc.find("Item", Name="Length").text
    return pd.Series([gi, description, tax_id, subj_len])
# Run annotation: one HTTP request per row, so this is slow for large hit
# tables. NOTE(review): NCBI documents a request-rate limit for EUtils —
# consider throttling and/or caching lookups for repeated GIs.
all_blast_hits[["gi","description","tax_id","s_len"]] = all_blast_hits.apply(check_gi_in_ncbi,axis=1)
In [6]:
# Write the annotated hits, highest bitscore first within each query block.
# NOTE(review): ascending=False also sorts query_id descending — confirm
# that is intended; the file is named "blastn_hits.csv" even though the
# table may contain blastp rows as well.
sorted_hits = all_blast_hits.sort_values(by=["query_id", "bitscore"], ascending=False)
sorted_hits.to_csv("blastn_hits.csv", index=False)
In [ ]: