In [1]:
import os
import re
from glob import glob

import pandas as pd
import requests
from bs4 import BeautifulSoup

Load blast hits


In [2]:
#Load blast hits
all_blast_hits = pd.read_csv("all_blast_hits.csv")
#Separate blastn from blastp hits
blastn_hits = all_blast_hits[all_blast_hits.tool == "blastn"]
blastp_hits = all_blast_hits[all_blast_hits.tool == "blastp"]

1. Analyze blastn hits

1.1 Extract best env_nt hits to perform a genome MSA

The main goal is to perform an MSA between our D9539 assembly and the most similar sequences found in the databases (a sketch of the alignment step follows the fetch/cleanup cells below).


In [3]:
#List of sequences to extract
seqs_for_msa = blastn_hits[blastn_hits.db == "env_nt"].sort_values(by="ali_len",ascending=False).head(n=10)
#Export megahit ids to extract directly from the fasta: empty, so the export line is left commented out
#seqs_for_msa[seqs_for_msa.db == "hmp_nuc"]["subject_id"].to_csv("d9539_hmp_homologs.txt",sep="\t",index=False,header=False)

Obtain FASTAs for the env_nt homologs from NCBI E-utilities


In [4]:
#Use efetch to retrieve the FASTA with the selected sequences and save it to a file
gis_to_get = ",".join(set(str(x) for x in seqs_for_msa[seqs_for_msa.db == "env_nt"]["gi"]))
r = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id={}&rettype=fasta&retmode=text".format(gis_to_get))
with open("3_assembly_msa/d9539_env_nt_homologs.fa","w") as env_nt_fh:
    env_nt_fh.write(str(r.content.decode()))
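
efetch can return fewer records than requested, so a quick sanity check is worthwhile before moving on. A minimal sketch, assuming one ">" header per returned record:

In [ ]:
#Sanity check (sketch): compare the HTTP status, the number of FASTA headers returned and the number of GIs requested
print(r.status_code, str(r.content.decode()).count(">"), len(gis_to_get.split(",")))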

In [5]:
#Remove blank lines between seqs
!sed "/^$/d" 3_assembly_msa/d9539_env_nt_homologs.fa > 3_assembly_msa/d9539_env_nt_homologs.fasta
!rm 3_assembly_msa/d9539_env_nt_homologs.fa
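
With the homolog FASTA cleaned up, the genome MSA itself can be run. Below is a minimal sketch using MAFFT (an assumption; any multiple aligner would do), and the assembly file name d9539_asm_v1.2.fa is hypothetical and should point to the real D9539 FASTA:

In [ ]:
#Sketch: concatenate our assembly with the env_nt homologs and align with MAFFT
#The assembly path is an assumption; adjust it to the real D9539 fasta
!cat d9539_asm_v1.2.fa 3_assembly_msa/d9539_env_nt_homologs.fasta > 3_assembly_msa/d9539_with_env_nt_homologs.fasta
!mafft --auto --reorder 3_assembly_msa/d9539_with_env_nt_homologs.fasta > 3_assembly_msa/d9539_with_env_nt_homologs.aln.fa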

2. Process blastp results

2.1 Extract ORF stats from fasta file


In [6]:
#Assumes the Fasta file comes with the header format of EMBOSS getorf
fh = open("1_orfs/d9539_asm_v1.2_emboss_orf1.fa")
header_regex = re.compile(r">(.+_[0-9]+?) \[([0-9]+) - ([0-9]+)\] ")
orf_stats = []
for line in fh:
    header_match = header_regex.match(line)
    if header_match:
        is_reverse = line.rstrip(" \n").endswith("(REVERSE SENSE)")
        q_id = header_match.group(1)
        #Position in contig
        q_cds_start = int(header_match.group(2) if not is_reverse else header_match.group(3))
        q_cds_end = int(header_match.group(3) if not is_reverse else header_match.group(2))
        #Length of ORF in amino acids (integer division of the nucleotide span by 3)
        q_len = (q_cds_end - q_cds_start + 1) // 3
        orf_stats.append( pd.Series(data=[q_id,q_len,q_cds_start,q_cds_end,("-" if is_reverse else "+")],
                                     index=["q_id","q_len","q_cds_start","q_cds_end","strand"]))
        
fh.close()
orf_stats_df = pd.DataFrame(orf_stats)
orf_stats_df.head()


Out[6]:
q_id q_len q_cds_start q_cds_end strand
0 ctg_cut_pos_2072_1 27 25 105 +
1 ctg_cut_pos_2072_2 10 98 127 +
2 ctg_cut_pos_2072_3 36 130 237 +
3 ctg_cut_pos_2072_4 52 105 260 +
4 ctg_cut_pos_2072_5 58 215 388 +

In [7]:
#Write ORF stats to a CSV file
orf_stats_df.to_csv("orf_stats.csv",index=False)

2.2 Annotate blast hits with orf stats


In [8]:
blastp_hits_annot = blastp_hits.merge(orf_stats_df,left_on="query_id",right_on="q_id")
#Add query coverage calculation
blastp_hits_annot["q_cov"] = (blastp_hits_annot["q_end"] - blastp_hits_annot["q_start"] + 1 ) / blastp_hits_annot["q_len"]
blastp_hits_annot.sort_values(by="bitscore",ascending=False).head()


Out[8]:
query_id subject_id pct_id ali_len mism gap_open q_start q_end s_start s_end ... gi description tax_id s_len q_id q_len q_cds_start q_cds_end strand q_cov
98 ctg_cut_pos_2072_29 GL0047061_MH0039_[Complete]_[mRNA]_locus=scaff... 93.04 647 45 0 1 647 1 647 ... NaN NaN NaN NaN ctg_cut_pos_2072_29 647 3429 5369 + 1.000000
99 ctg_cut_pos_2072_29 GL0107162_V1_[Complete]_[mRNA]_locus=scaffold7... 87.81 648 78 1 1 647 1 648 ... NaN NaN NaN NaN ctg_cut_pos_2072_29 647 3429 5369 + 1.000000
100 ctg_cut_pos_2072_29 GL0033034_MH0077_[Lack_both_end]_[mRNA]_locus=... 86.33 439 59 1 170 607 1 439 ... NaN NaN NaN NaN ctg_cut_pos_2072_29 647 3429 5369 + 0.676971
220 ctg_cut_pos_2072_11 GL0047057_MH0039_[Complete]_[mRNA]_locus=scaff... 93.11 421 29 0 1 421 1 421 ... NaN NaN NaN NaN ctg_cut_pos_2072_11 421 393 1655 + 1.000000
221 ctg_cut_pos_2072_11 GL0107158_V1_[Complete]_[mRNA]_locus=scaffold7... 91.45 421 36 0 1 421 1 421 ... NaN NaN NaN NaN ctg_cut_pos_2072_11 421 393 1655 + 1.000000

5 rows × 24 columns

2.3 Extract best hit for each ORF ( q_cov > 0.8 and pct_id > 30% and e-value < 1)


In [9]:
! mkdir -p 4_msa_prots

In [12]:
#Get best hit (highest bitscore) for each ORF
gb = blastp_hits_annot[ (blastp_hits_annot.q_cov > 0.8) & (blastp_hits_annot.pct_id > 30) & (blastp_hits_annot.e_value < 1) ].groupby("query_id")
reliable_orfs = pd.DataFrame( hits.loc[hits.bitscore.idxmax()] for q_id,hits in gb )[["query_id","db","subject_id","pct_id","q_cov","q_len",
                                                                                      "bitscore","e_value","strand","q_cds_start","q_cds_end"]]
reliable_orfs.sort_values(by="q_cds_start",ascending=True).to_csv("4_msa_prots/filtered_orfs_by_hits.csv",index=False)
reliable_orfs.sort_values(by="q_cds_start",ascending=True)


Out[12]:
query_id db subject_id pct_id q_cov q_len bitscore e_value strand q_cds_start q_cds_end
4 ctg_cut_pos_2072_5 metahit_pep GL0012681_MH0069_[Lack_3'-end]_[mRNA]_locus=C6... 50.00 0.827586 58 55.5 8.000000e-10 + 215 388
220 ctg_cut_pos_2072_11 metahit_pep GL0047057_MH0039_[Complete]_[mRNA]_locus=scaff... 93.11 1.000000 421 790.0 0.000000e+00 + 393 1655
200 ctg_cut_pos_2072_14 metahit_pep GL0047932_O2_[Complete]_[mRNA]_locus=scaffold3... 66.47 0.994253 174 226.0 3.000000e-73 + 1603 2124
190 ctg_cut_pos_2072_16 metahit_pep GL0027811_MH0033_[Complete]_[mRNA]_locus=C3777... 77.14 1.000000 70 111.0 3.000000e-31 + 2179 2388
143 ctg_cut_pos_2072_22 metahit_pep GL0027812_MH0033_[Complete]_[mRNA]_locus=C3777... 91.92 0.925208 361 637.0 0.000000e+00 + 2351 3433
98 ctg_cut_pos_2072_29 metahit_pep GL0047061_MH0039_[Complete]_[mRNA]_locus=scaff... 93.04 1.000000 647 1251.0 0.000000e+00 + 3429 5369
76 ctg_cut_pos_2072_30 metahit_pep GL0127673_MH0014_[Complete]_[mRNA]_locus=scaff... 96.91 1.000000 97 187.0 3.000000e-60 + 5374 5664
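
With the reliable ORF list in hand, the corresponding protein sequences can be pulled out of the getorf FASTA for the per-ORF alignments. A minimal sketch, assuming Biopython is available and that the getorf record ids match the query_id column:

In [ ]:
#Sketch: write one FASTA per reliable ORF into 4_msa_prots/<orf_id>/
from Bio import SeqIO

wanted = set(reliable_orfs["query_id"])
for rec in SeqIO.parse("1_orfs/d9539_asm_v1.2_emboss_orf1.fa", "fasta"):
    if rec.id in wanted:
        try:
            os.mkdir("4_msa_prots/{}".format(rec.id))
        except OSError:
            pass
        SeqIO.write(rec, "4_msa_prots/{0}/{0}.fa".format(rec.id), "fasta")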

2.4 Extract all hits for filtered ORF list ( q_cov > 0.8 and pct_id > 30% )

In [ ]:
#Write the subject ids of every filtered hit, one file per ORF and database
gb = blastp_hits_annot[ (blastp_hits_annot.q_cov > 0.8) & (blastp_hits_annot.pct_id > 30) ].groupby("query_id")
for q_id,hits in gb:
    for db in ["metahit_pep","hmp_pep","env_nr","nr"]:
        try:
            os.mkdir("4_msa_prots/{}".format(q_id))
        except OSError:
            pass
        hits_to_write = hits[hits.db == db]["subject_id"].unique()
        if hits_to_write.shape != (0,):
            hits_to_write.tofile("4_msa_prots/{0}/{0}_{1}.txt".format(q_id,db),sep="\n")
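
Each id list can then be turned into a FASTA of homologous proteins before aligning. A minimal sketch using blastdbcmd, under the assumptions that the databases were built with -parse_seqids and that their names/paths match the values in the db column:

In [ ]:
#Sketch: fetch the subject sequences for each ORF/database id list with blastdbcmd
#Database names are assumptions; adjust -db to the real database paths
import subprocess
for db in ["metahit_pep","hmp_pep","env_nr","nr"]:
    for id_file in glob("4_msa_prots/*/*_{}.txt".format(db)):
        subprocess.call(["blastdbcmd", "-db", db, "-entry_batch", id_file,
                         "-out", id_file.replace(".txt", ".fa")])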

In [ ]:
#hmmscan --tblout files to be parsed (see the sketch below)
glob("2_hmmscan/*.tbl")
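
The --tblout files are whitespace-delimited, with '#' comment lines and a free-text description as the last field, so a small line-based parser is the simplest way to load them. A minimal sketch, assuming the standard 19-column hmmscan tblout layout:

In [ ]:
#Sketch: parse hmmscan --tblout files into a single DataFrame
tbl_cols = ["target_name","target_acc","query_name","query_acc",
            "full_evalue","full_score","full_bias",
            "dom_evalue","dom_score","dom_bias",
            "exp","reg","clu","ov","env","dom","rep","inc","description"]
rows = []
for tbl in glob("2_hmmscan/*.tbl"):
    with open(tbl) as tbl_fh:
        for line in tbl_fh:
            if line.startswith("#"):
                continue
            fields = line.rstrip("\n").split(None, 18)
            #Pad in case the description field is missing
            rows.append(fields + [""] * (len(tbl_cols) - len(fields)))
hmmscan_hits = pd.DataFrame(rows, columns=tbl_cols)
hmmscan_hits.head()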