In [1]:
import pandas as pd
import re
from glob import glob

1. Load blast hits


In [2]:
#Load blast hits
blastp_hits = pd.read_csv("2_blastp_hits.csv")
blastp_hits.head()
#Filter out Metahit 2010 hits, keep only Metahit 2014
blastp_hits = blastp_hits[blastp_hits.db != "metahit_pep"]

2. Process blastp results

2.1 Extract ORF stats from fasta file


In [3]:
#Assumes the Fasta file comes with the header format of EMBOSS getorf
fh = open("1_orf/d9539_asm_v1.2_orf.fa")
header_regex = re.compile(r">([^ ]+?) \[([0-9]+) - ([0-9]+)\]")
orf_stats = []
for line in fh:
    header_match = header_regex.match(line)
    if header_match:
        is_reverse = line.rstrip(" \n").endswith("(REVERSE SENSE)")
        q_id = header_match.group(1)
        #Position in contig
        q_cds_start = int(header_match.group(2) if not is_reverse else header_match.group(3))
        q_cds_end = int(header_match.group(3) if not is_reverse else header_match.group(2))
        #Length of orf in aminoacids
        q_len = (q_cds_end - q_cds_start + 1) / 3
        orf_stats.append( pd.Series(data=[q_id,q_len,q_cds_start,q_cds_end,("-" if is_reverse else "+")],
                                     index=["q_id","orf_len","q_cds_start","q_cds_end","strand"]))
        
orf_stats_df = pd.DataFrame(orf_stats)
print(orf_stats_df.shape)
orf_stats_df.head()


(52, 5)
Out[3]:
q_id orf_len q_cds_start q_cds_end strand
0 d9539_asm_v1.2_1 27.0 25 105 +
1 d9539_asm_v1.2_2 10.0 98 127 +
2 d9539_asm_v1.2_3 36.0 130 237 +
3 d9539_asm_v1.2_4 52.0 105 260 +
4 d9539_asm_v1.2_5 58.0 215 388 +

In [4]:
#Write orf stats to fasta
orf_stats_df.to_csv("1_orf/orf_stats.csv",index=False)

2.2 Annotate blast hits with orf stats


In [5]:
blastp_hits_annot = blastp_hits.merge(orf_stats_df,left_on="query_id",right_on="q_id")
#Add query coverage calculation
blastp_hits_annot["q_cov_calc"] = (blastp_hits_annot["q_end"] - blastp_hits_annot["q_start"] + 1 ) * 100 / blastp_hits_annot["q_len"]
blastp_hits_annot.sort_values(by="bitscore",ascending=False).head()


Out[5]:
query_id subject_id pct_id ali_len mism gap_open q_start q_end s_start s_end ... q_cov s_description tool db q_id orf_len q_cds_start q_cds_end strand q_cov_calc
157 d9539_asm_v1.2_29 MH0413_GL0135536 93.354 647 43 0 1 647 1 647 ... 100 MH0413_GL0135536 [gene] locus=scaffold143833... blastp metahit_2014_pep d9539_asm_v1.2_29 647.0 3429 5369 + 100.0
158 d9539_asm_v1.2_29 MH0039_GL0018307 93.045 647 45 0 1 647 1 647 ... 100 MH0039_GL0018307 [gene] locus=scaffold66015_... blastp metahit_2014_pep d9539_asm_v1.2_29 647.0 3429 5369 + 100.0
159 d9539_asm_v1.2_29 MH0409_GL0129297 91.963 647 52 0 1 647 1 647 ... 100 MH0409_GL0129297 [gene] locus=scaffold117357... blastp metahit_2014_pep d9539_asm_v1.2_29 647.0 3429 5369 + 100.0
160 d9539_asm_v1.2_29 MH0033_GL0005946 89.352 648 68 1 1 647 1 648 ... 100 MH0033_GL0005946 [gene] locus=scaffold47970_... blastp metahit_2014_pep d9539_asm_v1.2_29 647.0 3429 5369 + 100.0
161 d9539_asm_v1.2_29 MH0136_GL0000112 89.352 648 68 1 1 647 1 648 ... 100 MH0136_GL0000112 [gene] locus=C1259564_1:206... blastp metahit_2014_pep d9539_asm_v1.2_29 647.0 3429 5369 + 100.0

5 rows × 28 columns


In [6]:
assert blastp_hits_annot.shape[0] == blastp_hits.shape[0]

2.3 Extract best hit for each ORF ( q_cov > 0.8 and pct_id > 40% and e-value < 1)

Define these resulting 7 ORFs as the core ORFs for the d9539 assembly.

The homology between the Metahit gene catalogue is very good, and considering the catalogue was curated on a big set of gut metagenomes, it is reasonable to assume that these putative proteins would come from our detected circular putative virus/phage genome

Two extra notes:

  • Additionally, considering only these 7 ORFs , almost the entire genomic region is covered, with very few non-coding regions, still consistent with the hypothesis of a small viral genome which should be mainly coding

  • Also, even though the naive ORF finder detected putative ORFs in both positive and negative strands, the supported ORFs only occur in the positive strand. This could be an indication of a ssDNA or ssRNA virus.


In [7]:
! mkdir -p 4_msa_prots

In [8]:
#Get best hit (highest bitscore) for each ORF
gb = blastp_hits_annot[ (blastp_hits_annot.q_cov > 80) & (blastp_hits_annot.pct_id > 40) & (blastp_hits_annot.e_value < 1) ].groupby("query_id")
reliable_orfs = pd.DataFrame( hits.ix[hits.bitscore.idxmax()] for q_id,hits in gb )[["query_id","db","subject_id","pct_id","q_cov","q_len",
                                                                                      "bitscore","e_value","strand","q_cds_start","q_cds_end"]]
reliable_orfs = reliable_orfs.sort_values(by="q_cds_start",ascending=True)
reliable_orfs


Out[8]:
query_id db subject_id pct_id q_cov q_len bitscore e_value strand q_cds_start q_cds_end
317 d9539_asm_v1.2_1 metahit_2014_pep MH0148_GL0013503 92.593 100 27 57.4 2.310000e-10 + 25 105
147 d9539_asm_v1.2_3 metahit_2014_pep MH0203_GL0004312 90.323 86 36 57.8 1.450000e-10 + 130 237
10 d9539_asm_v1.2_5 metahit_2014_pep MH0409_GL0061975 91.379 100 58 113.0 1.220000e-31 + 215 388
282 d9539_asm_v1.2_11 metahit_2014_pep V1.FI30_GL0086214 91.449 100 421 801.0 0.000000e+00 + 393 1655
261 d9539_asm_v1.2_14 metahit_2014_pep O2.UC35-0_GL0103272 73.333 86 174 221.0 2.390000e-71 + 1603 2124
247 d9539_asm_v1.2_16 metahit_2014_pep DOM013_GL0003401 82.857 100 70 115.0 2.460000e-32 + 2179 2388
205 d9539_asm_v1.2_22 metahit_2014_pep MH0398_GL0192275 90.831 97 361 658.0 0.000000e+00 + 2351 3433
157 d9539_asm_v1.2_29 metahit_2014_pep MH0413_GL0135536 93.354 100 647 1255.0 0.000000e+00 + 3429 5369
124 d9539_asm_v1.2_30 metahit_2014_pep MH0039_GL0018308 98.969 100 97 190.0 5.350000e-61 + 5374 5664

2.4 Extract selected orfs for further analysis


In [9]:
reliable_orfs["orf_id"] = ["orf{}".format(x) for x in range(1,reliable_orfs.shape[0]+1) ]
reliable_orfs["cds_len"] = reliable_orfs["q_cds_end"] - reliable_orfs["q_cds_start"] +1
reliable_orfs.sort_values(by="q_cds_start",ascending=True).to_csv("3_filtered_orfs/filt_orf_stats.csv",index=False,header=True)
reliable_orfs.sort_values(by="q_cds_start",ascending=True).to_csv("3_filtered_orfs/filt_orf_list.txt",index=False,header=False,columns=["query_id"])

2.4.2 Extract fasta


In [10]:
! ~/utils/bin/seqtk subseq 1_orf/d9539_asm_v1.2_orf.fa 3_filtered_orfs/filt_orf_list.txt > 3_filtered_orfs/d9539_asm_v1.2_orf_filt.fa

2.4.3 Write out filtered blast hits


In [11]:
filt_blastp_hits = blastp_hits_annot[ blastp_hits_annot.query_id.apply(lambda x: x in reliable_orfs.query_id.tolist())]
filt_blastp_hits.to_csv("3_filtered_orfs/d9539_asm_v1.2_orf_filt_blastp.csv")
filt_blastp_hits.head()


Out[11]:
query_id subject_id pct_id ali_len mism gap_open q_start q_end s_start s_end ... q_cov s_description tool db q_id orf_len q_cds_start q_cds_end strand q_cov_calc
10 d9539_asm_v1.2_5 MH0409_GL0061975 91.379 58 5 0 1 58 29 86 ... 100 MH0409_GL0061975 [gene] locus=scaffold117357... blastp metahit_2014_pep d9539_asm_v1.2_5 58.0 215 388 + 100.0
11 d9539_asm_v1.2_5 675950834-stool1_revised_C814893_1_gene71376 89.655 58 6 0 1 58 1 58 ... 100 675950834-stool1_revised_C814893_1_gene71376 s... blastp metahit_2014_pep d9539_asm_v1.2_5 58.0 215 388 + 100.0
12 d9539_asm_v1.2_5 MH0105_GL0047475 89.655 58 6 0 1 58 29 86 ... 100 MH0105_GL0047475 [gene] locus=scaffold1014_1... blastp metahit_2014_pep d9539_asm_v1.2_5 58.0 215 388 + 100.0
13 d9539_asm_v1.2_5 MH0203_GL0004312 89.655 58 6 0 1 58 29 86 ... 100 MH0203_GL0004312 [gene] locus=C2075218_1:153... blastp metahit_2014_pep d9539_asm_v1.2_5 58.0 215 388 + 100.0
14 d9539_asm_v1.2_5 MH0033_GL0005948 89.655 58 6 0 1 58 29 86 ... 100 MH0033_GL0005948 [gene] locus=scaffold47970_... blastp metahit_2014_pep d9539_asm_v1.2_5 58.0 215 388 + 100.0

5 rows × 28 columns

gb = filt_blastp_hits.groupby("query_id") for q_id,hits in gb: for db in ["metahit_pep","hmp_pep","env_nr","nr"]: try: os.mkdir("3_filtered_orfs/msa/{}".format(q_id)) except: pass hits_to_write = hits[hits.db == db]["subject_id"].unique() if hits_to_write.shape != (0,): hits_to_write.tofile("3_filtered_orfs/msa/{0}/{0}_{1}.txt".format(q_id,db),sep="\n")