In [1]:
import pandas as pd
import re
from glob import glob
import requests
from bs4 import BeautifulSoup
In [2]:
# Load the combined BLAST results and split them by tool.
all_blast_hits = pd.read_csv("all_blast_hits.csv")
# blastn = nucleotide-level hits, blastp = protein-level hits
blastn_hits = all_blast_hits.query("tool == 'blastn'")
blastp_hits = all_blast_hits.query("tool == 'blastp'")
In [3]:
# Pick the 10 longest env_nt blastn alignments as candidates for the MSA.
env_nt_blastn = blastn_hits[blastn_hits.db == "env_nt"]
seqs_for_msa = env_nt_blastn.sort_values(by="ali_len", ascending=False).head(n=10)
#Export megahit ids to extract directly from fasta : Empty!
#seqs_for_msa[seqs_for_msa.db == "hmp_nuc"]["subject_id"].to_csv("d9539_hmp_homologs.txt",sep="\t",index=False,header=False)
Obtain FASTA sequences for the env_nt homologs via NCBI E-utilities (efetch)
In [4]:
# Use NCBI efetch to download the env_nt homolog sequences and save the
# FASTA to a file. NOTE: E-utilities require HTTPS — plain http is the
# original bug here. `set(...)` de-duplicates GIs before building the query.
gis_to_get = ",".join(set(str(x) for x in seqs_for_msa[seqs_for_msa.db == "env_nt"]["gi"]))
efetch_url = ("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
              "?db=nuccore&id={}&rettype=fasta&retmode=text".format(gis_to_get))
r = requests.get(efetch_url, timeout=60)
# Fail loudly on an HTTP error instead of silently writing an error page
# into the FASTA file.
r.raise_for_status()
with open("3_assembly_msa/d9539_env_nt_homologs.fa", "w") as env_nt_fh:
    # r.text is the decoded response body (equivalent to r.content.decode())
    env_nt_fh.write(r.text)
In [5]:
# Drop the blank lines efetch leaves between records; sed "/^$/d" deletes
# empty lines and writes the cleaned copy under the final .fasta name,
# then the intermediate .fa download is removed.
!sed "/^$/d" 3_assembly_msa/d9539_env_nt_homologs.fa > 3_assembly_msa/d9539_env_nt_homologs.fasta
!rm 3_assembly_msa/d9539_env_nt_homologs.fa
In [6]:
# Parse ORF coordinates from EMBOSS getorf FASTA headers.
# Expected header shape: ">{contig}_{n} [{start} - {end}] " with an optional
# "(REVERSE SENSE)" suffix on minus-strand ORFs, where the two coordinates
# are reported high-to-low.
header_regex = re.compile(r">(.+_[0-9]+?) \[([0-9]+) - ([0-9]+)\] ")
orf_stats = []
# 'with' guarantees the handle is closed (the original leaked it).
with open("1_orfs/d9539_asm_v1.2_emboss_orf1.fa") as fh:
    for line in fh:
        header_match = header_regex.match(line)
        if not header_match:
            continue
        is_reverse = line.rstrip(" \n").endswith("(REVERSE SENSE)")
        # Normalize so q_cds_start <= q_cds_end on the contig regardless
        # of strand (reverse headers list coordinates swapped).
        q_cds_start = int(header_match.group(2) if not is_reverse else header_match.group(3))
        q_cds_end = int(header_match.group(3) if not is_reverse else header_match.group(2))
        # Length of the ORF in amino acids; a getorf span is a whole number
        # of codons, so integer division is exact (and keeps the column int).
        q_len = (q_cds_end - q_cds_start + 1) // 3
        orf_stats.append({"q_id": q_id, "q_len": q_len,
                          "q_cds_start": q_cds_start, "q_cds_end": q_cds_end,
                          "strand": "-" if is_reverse else "+"}
                         if False else
                         {"q_id": header_match.group(1), "q_len": q_len,
                          "q_cds_start": q_cds_start, "q_cds_end": q_cds_end,
                          "strand": "-" if is_reverse else "+"})
# Building the frame once from records preserves proper per-column dtypes
# (the original's mixed-type pd.Series rows forced everything to object).
orf_stats_df = pd.DataFrame(orf_stats,
                            columns=["q_id", "q_len", "q_cds_start", "q_cds_end", "strand"])
orf_stats_df.head()
Out[6]:
In [7]:
# Persist the per-ORF coordinate/length table as CSV (the original comment
# said "fasta", but this writes tabular stats, not sequences).
orf_stats_df.to_csv("orf_stats.csv",index=False)
In [8]:
# Attach ORF coordinates/length to each blastp hit, then compute how much
# of the ORF the alignment covers.
blastp_hits_annot = blastp_hits.merge(orf_stats_df, left_on="query_id", right_on="q_id")
# Alignment coordinates are inclusive, hence the +1 when measuring length.
aligned_aa = blastp_hits_annot["q_end"] - blastp_hits_annot["q_start"] + 1
blastp_hits_annot["q_cov"] = aligned_aa / blastp_hits_annot["q_len"]
blastp_hits_annot.sort_values(by="bitscore", ascending=False).head()
Out[8]:
In [9]:
# Create the output directory for the filtered-ORF table (-p: no error if it exists).
! mkdir -p 4_msa_prots
In [12]:
# Keep ORFs with a convincing protein hit (>80% query coverage, >30%
# identity, e-value < 1) and reduce to the single best hit — highest
# bitscore — per ORF.
good_hits = blastp_hits_annot[(blastp_hits_annot.q_cov > 0.8)
                              & (blastp_hits_annot.pct_id > 30)
                              & (blastp_hits_annot.e_value < 1)]
# DataFrame.ix was removed in pandas 1.0; groupby().idxmax() + .loc selects
# the same per-group argmax rows (groupby sorts keys, matching the original
# iteration order).
best_idx = good_hits.groupby("query_id")["bitscore"].idxmax()
reliable_orfs = good_hits.loc[best_idx, ["query_id","db","subject_id","pct_id","q_cov","q_len",
                                         "bitscore","e_value","strand","q_cds_start","q_cds_end"]]
reliable_orfs.sort_values(by="q_cds_start",ascending=True).to_csv("4_msa_prots/filtered_orfs_by_hits.csv",index=False)
reliable_orfs.sort_values(by="q_cds_start",ascending=True)
Out[12]:
In [ ]:
# Exploratory: list hmmscan tabular outputs (not used downstream in this notebook).
glob("2_hmmscan/*.tbl")