In [1]:
!ls


2_blast.mak                    d9539_hmp_homologs.txt
3_parse_d9539_blast_hits.ipynb d9539_metahit_homologs.fasta
4_parse_blastn_results.ipynb   d9539_metahit_homologs.txt
blastn                         get_metahit_seqs.sh
d9539_env_nt_homologs.fasta    mauve_msa

In [2]:
import pandas as pd
import re
from glob import glob
import requests
from bs4 import BeautifulSoup

1. Aggregate all blast results


In [3]:
# Match filenames like "blastn_vs_env_nt.tsv" -> tool="blastn", db="env_nt".
# The dot before "tsv" is escaped so it matches only a literal "." (the
# original unescaped "." matched any character).
blast_file_regex = re.compile(r"(blast[np])_vs_([a-zA-Z_]+)\.tsv")
# Standard BLAST tabular (outfmt 6) column names.
blast_cols = ["query_id","subject_id","pct_id","ali_len","mism",
              "gap_open","q_start","q_end","s_start","s_end","e_value","bitscore"]

# Read each per-database result table and tag its rows with the tool and
# database parsed from the filename.
blast_hits = []
for blast_filename in glob("blastn/*.tsv"):
    name_match = blast_file_regex.search(blast_filename)
    if name_match is None:
        # Skip files that don't follow the <tool>_vs_<db>.tsv convention
        # instead of crashing on `.groups()` of None.
        continue
    tool_id, db_id = name_match.groups()
    hits = pd.read_csv(blast_filename, sep="\t", header=None, names=blast_cols)
    hits["tool"] = tool_id
    hits["db"] = db_id
    blast_hits.append(hits)

In [4]:
# Combine all per-database hit tables into one frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# a single pd.concat also avoids the quadratic cost of repeated appends.
# ignore_index gives one clean 0..n-1 index instead of the duplicated
# per-file indices the old append loop produced (the index is never
# written out downstream, so this is safe).
all_blast_hits = pd.concat(blast_hits, ignore_index=True)
all_blast_hits.head()


Out[4]:
query_id subject_id pct_id ali_len mism gap_open q_start q_end s_start s_end e_value bitscore tool db
0 d9539_asm_v1.2 gi|935523605|emb|CDTY01044545.1| 89.62 5270.0 532.0 14.0 1.0 5261.0 2725.0 7988.0 0.0 6687.0 blastn env_nt
1 d9539_asm_v1.2 gi|935523605|emb|CDTY01044545.1| 89.29 2726.0 283.0 9.0 3032.0 5751.0 1.0 2723.0 0.0 3408.0 blastn env_nt
2 d9539_asm_v1.2 gi|934409231|emb|CDZF01013522.1| 89.14 5332.0 558.0 19.0 1.0 5318.0 5325.0 1.0 0.0 6619.0 blastn env_nt
3 d9539_asm_v1.2 gi|934409231|emb|CDZF01013522.1| 88.44 770.0 78.0 11.0 4989.0 5751.0 6092.0 5327.0 0.0 918.0 blastn env_nt
4 d9539_asm_v1.2 gi|931877178|emb|CEAD01032396.1| 89.67 4705.0 474.0 11.0 1.0 4699.0 1213.0 5911.0 0.0 5986.0 blastn env_nt

2. Annotate BLAST results with taxonomy and hit-description information retrieved from NCBI via the E-utilities (esummary)


In [5]:
# Extract the numeric GI from subject ids like "gi|935523605|emb|...".
subj_id_regex = re.compile(r"^gi\|([0-9]+)\|")

def check_gi_in_ncbi(row):
    """Look up one BLAST hit's GI number with NCBI esummary.

    Queries the `nuccore` database for blastn hits (`protein` otherwise)
    and returns a Series of [gi, description, tax_id, subj_len].  All four
    values are None when the subject id carries no GI; the last three stay
    None when the esummary response is missing the expected fields.
    NOTE(review): one HTTP request per row — NCBI E-utilities allow at most
    3 requests/second without an API key; consider throttling for big runs.
    """
    description = None
    tax_id = None
    subj_len = None
    gi = None
    gi_match = subj_id_regex.match(row["subject_id"])
    if gi_match:
        # blastn hits are nucleotide records; blastp hits are proteins.
        ncbi_db = "nuccore" if row["tool"] == "blastn" else "protein"
        gi = gi_match.group(1)
        # NCBI retired plain-HTTP access to E-utilities in 2016 — use HTTPS.
        # `params` handles URL encoding; the timeout keeps one stuck request
        # from hanging the entire DataFrame.apply below.
        r = requests.get(
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi",
            params={"db": ncbi_db, "id": gi},
            timeout=30,
        )
        if r.status_code == 200:
            print(gi)  # progress indicator, one line per successful lookup
            xml_doc = BeautifulSoup(r.content, "xml")
            title_item = xml_doc.find("Item", Name="Title")
            if title_item is None:
                # Unexpected payload (e.g. an esummary error document):
                # dump it for inspection and leave the annotation empty.
                print(r.content)
            else:
                description = title_item.text
                tax_id = xml_doc.find("Item", Name="TaxId").text
                subj_len = xml_doc.find("Item", Name="Length").text
    return pd.Series([gi, description, tax_id, subj_len])

# Run annotation — one esummary request per hit row.
all_blast_hits[["gi","description","tax_id","s_len"]] = all_blast_hits.apply(check_gi_in_ncbi,axis=1)


935523605
935523605
935523605
934409231
934409231
931877178
931877178
931667140
934040108
782230860
842490291
793554749
132430165

3. Write annotated results out


In [6]:
# Sort descending on (query_id, bitscore) — within each query the
# strongest hits come first — then persist without the index column.
ranked_hits = all_blast_hits.sort_values(by=["query_id", "bitscore"], ascending=False)
ranked_hits.to_csv("blastn_hits.csv", index=False)

In [ ]: