In [1]:
import pandas as pd
import re
from glob import glob
In [2]:
#Load blast hits
blastp_hits = pd.read_csv("2_blastp_hits.csv")
blastp_hits.head()
#Filter out Metahit 2010 hits, keep only Metahit 2014
blastp_hits = blastp_hits[blastp_hits.db != "metahit_pep"]
In [3]:
#Assumes the Fasta file comes with the header format of EMBOSS getorf
fh = open("1_orf/d9539_asm_v1.2_orf.fa")
header_regex = re.compile(r">([^ ]+?) \[([0-9]+) - ([0-9]+)\]")
orf_stats = []
for line in fh:
header_match = header_regex.match(line)
if header_match:
is_reverse = line.rstrip(" \n").endswith("(REVERSE SENSE)")
q_id = header_match.group(1)
#Position in contig
q_cds_start = int(header_match.group(2) if not is_reverse else header_match.group(3))
q_cds_end = int(header_match.group(3) if not is_reverse else header_match.group(2))
#Length of orf in aminoacids
q_len = (q_cds_end - q_cds_start + 1) / 3
orf_stats.append( pd.Series(data=[q_id,q_len,q_cds_start,q_cds_end,("-" if is_reverse else "+")],
index=["q_id","orf_len","q_cds_start","q_cds_end","strand"]))
orf_stats_df = pd.DataFrame(orf_stats)
print(orf_stats_df.shape)
orf_stats_df.head()
Out[3]:
In [4]:
#Write orf stats to fasta
orf_stats_df.to_csv("1_orf/orf_stats.csv",index=False)
In [5]:
blastp_hits_annot = blastp_hits.merge(orf_stats_df,left_on="query_id",right_on="q_id")
#Add query coverage calculation
blastp_hits_annot["q_cov_calc"] = (blastp_hits_annot["q_end"] - blastp_hits_annot["q_start"] + 1 ) * 100 / blastp_hits_annot["q_len"]
blastp_hits_annot.sort_values(by="bitscore",ascending=False).head()
Out[5]:
In [6]:
assert blastp_hits_annot.shape[0] == blastp_hits.shape[0]
Define these resulting 7 ORFs as the core ORFs for the d9539 assembly.
The homology between the Metahit gene catalogue is very good, and considering the catalogue was curated on a big set of gut metagenomes, it is reasonable to assume that these putative proteins would come from our detected circular putative virus/phage genome
Two extra notes:
Additionally, considering only these 7 ORFs , almost the entire genomic region is covered, with very few non-coding regions, still consistent with the hypothesis of a small viral genome which should be mainly coding
Also, even though the naive ORF finder detected putative ORFs in both positive and negative strands, the supported ORFs only occur in the positive strand. This could be an indication of a ssDNA or ssRNA virus.
In [7]:
! mkdir -p 4_msa_prots
In [8]:
#Get best hit (highest bitscore) for each ORF
gb = blastp_hits_annot[ (blastp_hits_annot.q_cov > 80) & (blastp_hits_annot.pct_id > 40) & (blastp_hits_annot.e_value < 1) ].groupby("query_id")
reliable_orfs = pd.DataFrame( hits.ix[hits.bitscore.idxmax()] for q_id,hits in gb )[["query_id","db","subject_id","pct_id","q_cov","q_len",
"bitscore","e_value","strand","q_cds_start","q_cds_end"]]
reliable_orfs = reliable_orfs.sort_values(by="q_cds_start",ascending=True)
reliable_orfs
Out[8]:
In [9]:
reliable_orfs["orf_id"] = ["orf{}".format(x) for x in range(1,reliable_orfs.shape[0]+1) ]
reliable_orfs["cds_len"] = reliable_orfs["q_cds_end"] - reliable_orfs["q_cds_start"] +1
reliable_orfs.sort_values(by="q_cds_start",ascending=True).to_csv("3_filtered_orfs/filt_orf_stats.csv",index=False,header=True)
reliable_orfs.sort_values(by="q_cds_start",ascending=True).to_csv("3_filtered_orfs/filt_orf_list.txt",index=False,header=False,columns=["query_id"])
In [10]:
! ~/utils/bin/seqtk subseq 1_orf/d9539_asm_v1.2_orf.fa 3_filtered_orfs/filt_orf_list.txt > 3_filtered_orfs/d9539_asm_v1.2_orf_filt.fa
In [11]:
filt_blastp_hits = blastp_hits_annot[ blastp_hits_annot.query_id.apply(lambda x: x in reliable_orfs.query_id.tolist())]
filt_blastp_hits.to_csv("3_filtered_orfs/d9539_asm_v1.2_orf_filt_blastp.csv")
filt_blastp_hits.head()
Out[11]: