In [1]:
# Show the pipeline's working directory: numbered makefiles/scripts and
# their output directories (1_orf, 2_blast, 2_hmmscan, ...).
!ls


1_find_orfs.mak                         2_run_blast.sh
1_orf                                   3_aggregate_blast_hits.ipynb
2_blast                                 3_filtered_orfs
2_blast.mak                             4_select_reliable_orfs.ipynb
2_hmmscan                               5_extract_reliable_orf_hmmer_hits.ipynb
2_hmmscan.mak                           log

In [2]:
import pandas as pd
import re
from glob import glob
import requests
from bs4 import BeautifulSoup

1. Aggregate all BLAST results


In [5]:
# Parse "<tool>_vs_<db>.tsv" filenames into (tool, database).
# NOTE: the '.' before "tsv" is escaped — unescaped it matches ANY character,
# so e.g. "blastp_vs_envXtsv" would have been accepted.
blast_file_regex = re.compile(r"(blast[np])_vs_([a-zA-Z0-9_]+)\.tsv")

# Column names for the custom BLAST tabular output (outfmt fields listed below).
blast_cols = ["query_id","subject_id","pct_id","ali_len","mism",
              "gap_open","q_start","q_end","s_start","s_end",
              "e_value","bitscore","q_len","s_len","s_gi",
             "s_taxids","s_scinames","s_names","q_cov","s_description"
             ]

#blast_cols = "qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen sgi staxids sscinames scomnames qcovs stitle"

In [6]:
# Load every per-database BLAST result table, tagging each with the tool
# (blastp/blastn) and database parsed from its filename.
blast_hits = []
for blast_filename in glob("2_blast/*.tsv"):
    match = blast_file_regex.search(blast_filename)
    if match is None:
        # Unexpected filename (search() would return None and .groups()
        # would raise AttributeError) — skip rather than crash.
        continue
    tool_id, db_id = match.groups()
    hits = pd.read_csv(blast_filename, sep="\t", header=None, names=blast_cols)
    hits["tool"] = tool_id
    hits["db"] = db_id
    blast_hits.append(hits)

In [7]:
# Stack all per-file hit tables into one frame. DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0, and appending in a loop
# copies quadratically — pd.concat does it in one pass.
all_blast_hits = pd.concat(blast_hits)
print(all_blast_hits.shape)


(431, 22)

In [8]:
# Spot-check the first few aggregated hits (rich display via last expression)
all_blast_hits.head()


Out[8]:
query_id subject_id pct_id ali_len mism gap_open q_start q_end s_start s_end ... q_len s_len s_gi s_taxids s_scinames s_names q_cov s_description tool db
0 d9539_asm_v1.2_4 gi|402685296|gb|EJX08051.1| 36.111 36 23 0 16 51 147 182 ... 52 416 402685296 749906 NaN NaN 69 glycosyl transferase group 1 [gut metagenome] blastp env_nr
1 d9539_asm_v1.2_5 gi|140773177|gb|ECN40753.1| 30.508 59 32 2 1 51 124 181 ... 58 206 140773177 408172 NaN NaN 88 hypothetical protein GOS_6058719, partial [mar... blastp env_nr
2 d9539_asm_v1.2_5 gi|139039289|gb|ECC98867.1| 34.884 43 20 1 1 35 129 171 ... 58 174 139039289 408172 NaN NaN 60 hypothetical protein GOS_5221334, partial [mar... blastp env_nr
3 d9539_asm_v1.2_5 gi|135943891|gb|EBK87411.1| 28.814 59 33 2 1 51 124 181 ... 58 193 135943891 408172 NaN NaN 88 hypothetical protein GOS_8661086, partial [mar... blastp env_nr
4 d9539_asm_v1.2_5 gi|143562029|gb|EDF64666.1| 28.814 59 33 2 1 51 90 147 ... 58 216 143562029 408172 NaN NaN 88 hypothetical protein GOS_918582, partial [mari... blastp env_nr

5 rows × 22 columns

2. Write annotated results out


In [10]:
# Write all hits grouped by ORF with the strongest hits first.
# ascending is given per-column: queries alphabetical, bitscore descending —
# a bare ascending=False would also reverse the query_id order.
all_blast_hits.sort_values(
    by=["query_id", "bitscore"], ascending=[True, False]
).to_csv("2_blastp_hits.csv", index=False)

Old steps, not used anymore

Annotate BLAST results with taxonomy and hit-name information retrieved from NCBI via the E-utilities API

# Legacy step: look up each hit's GI number with NCBI esummary to recover
# description / taxonomy / length. Superseded by the s_* columns that the
# BLAST output format now provides directly.
subj_id_regex = re.compile(r"^gi\|([0-9]+)\|")


def check_gi_in_ncbi(row):
    """Fetch esummary metadata for one BLAST hit's GI number.

    Parameters: row — a hit row with "subject_id" (gi|NNN|... form) and
    "tool" (blastn hits are looked up in nuccore, all others in protein).
    Returns: pd.Series of [gi, description, tax_id, subj_len]; entries stay
    None when the subject id carries no GI prefix or the lookup fails.
    """
    description = None
    tax_id = None
    subj_len = None
    gi = None
    gi_match = subj_id_regex.match(row["subject_id"])
    if gi_match:
        # blastn hits come from a nucleotide DB; everything else is protein
        ncbi_db = "nuccore" if row["tool"] == "blastn" else "protein"
        gi = gi_match.group(1)
        # NCBI E-utilities require HTTPS (plain http was retired in 2016)
        r = requests.get(
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db={}&id={}".format(ncbi_db, gi)
        )
        if r.status_code == 200:
            print(gi)  # progress marker for this slow, per-row lookup
            xml_doc = BeautifulSoup(r.content, "xml")
            if not xml_doc.find("Item", Name="Title"):
                # Unexpected payload (e.g. error document) — dump it for debugging
                print(r.content)
            else:
                description = xml_doc.find("Item", Name="Title").text
                tax_id = xml_doc.find("Item", Name="TaxId").text
                subj_len = xml_doc.find("Item", Name="Length").text
    return pd.Series([gi, description, tax_id, subj_len])


# Run annotation (one HTTP request per hit — slow; kept for provenance only)
all_blast_hits[["gi", "description", "tax_id", "s_len"]] = all_blast_hits.apply(check_gi_in_ncbi, axis=1)