In [1]:

    
import pandas as pd
import re
from glob import glob

1. Load blast hits



In [2]:

    
#Load blast hits
blastp_hits = pd.read_csv("2_blastp_hits.csv")
blastp_hits.head()
#Filter out Metahit 2010 hits, keep only Metahit 2014
blastp_hits = blastp_hits[blastp_hits.db != "metahit_pep"]

2. Process blastp results

2.1 Extract ORF stats from fasta file



In [3]:

    
#Assumes the Fasta file comes with the header format of EMBOSS getorf
fh = open("1_orf/d9539_asm_v1.2_orf.fa")
header_regex = re.compile(r">([^ ]+?) \[([0-9]+) - ([0-9]+)\]")
orf_stats = []
for line in fh:
    header_match = header_regex.match(line)
    if header_match:
        is_reverse = line.rstrip(" \n").endswith("(REVERSE SENSE)")
        q_id = header_match.group(1)
        #Position in contig
        q_cds_start = int(header_match.group(2) if not is_reverse else header_match.group(3))
        q_cds_end = int(header_match.group(3) if not is_reverse else header_match.group(2))
        #Length of orf in aminoacids
        q_len = (q_cds_end - q_cds_start + 1) / 3
        orf_stats.append( pd.Series(data=[q_id,q_len,q_cds_start,q_cds_end,("-" if is_reverse else "+")],
                                     index=["q_id","orf_len","q_cds_start","q_cds_end","strand"]))
        
orf_stats_df = pd.DataFrame(orf_stats)
print(orf_stats_df.shape)
orf_stats_df.head()









    



(52, 5)






    Out[3]:






  
    
      
      q_id
      orf_len
      q_cds_start
      q_cds_end
      strand
    
  
  
    
      0
      d9539_asm_v1.2_1
      27.0
      25
      105
      +
    
    
      1
      d9539_asm_v1.2_2
      10.0
      98
      127
      +
    
    
      2
      d9539_asm_v1.2_3
      36.0
      130
      237
      +
    
    
      3
      d9539_asm_v1.2_4
      52.0
      105
      260
      +
    
    
      4
      d9539_asm_v1.2_5
      58.0
      215
      388
      +



In [4]:

    
#Write orf stats to fasta
orf_stats_df.to_csv("1_orf/orf_stats.csv",index=False)

2.2 Annotate blast hits with orf stats



In [5]:

    
blastp_hits_annot = blastp_hits.merge(orf_stats_df,left_on="query_id",right_on="q_id")
#Add query coverage calculation
blastp_hits_annot["q_cov_calc"] = (blastp_hits_annot["q_end"] - blastp_hits_annot["q_start"] + 1 ) * 100 / blastp_hits_annot["q_len"]
blastp_hits_annot.sort_values(by="bitscore",ascending=False).head()









    Out[5]:






  
    
      
      query_id
      subject_id
      pct_id
      ali_len
      mism
      gap_open
      q_start
      q_end
      s_start
      s_end
      ...
      q_cov
      s_description
      tool
      db
      q_id
      orf_len
      q_cds_start
      q_cds_end
      strand
      q_cov_calc
    
  
  
    
      157
      d9539_asm_v1.2_29
      MH0413_GL0135536
      93.354
      647
      43
      0
      1
      647
      1
      647
      ...
      100
      MH0413_GL0135536  [gene]  locus=scaffold143833...
      blastp
      metahit_2014_pep
      d9539_asm_v1.2_29
      647.0
      3429
      5369
      +
      100.0
    
    
      158
      d9539_asm_v1.2_29
      MH0039_GL0018307
      93.045
      647
      45
      0
      1
      647
      1
      647
      ...
      100
      MH0039_GL0018307  [gene]  locus=scaffold66015_...
      blastp
      metahit_2014_pep
      d9539_asm_v1.2_29
      647.0
      3429
      5369
      +
      100.0
    
    
      159
      d9539_asm_v1.2_29
      MH0409_GL0129297
      91.963
      647
      52
      0
      1
      647
      1
      647
      ...
      100
      MH0409_GL0129297  [gene]  locus=scaffold117357...
      blastp
      metahit_2014_pep
      d9539_asm_v1.2_29
      647.0
      3429
      5369
      +
      100.0
    
    
      160
      d9539_asm_v1.2_29
      MH0033_GL0005946
      89.352
      648
      68
      1
      1
      647
      1
      648
      ...
      100
      MH0033_GL0005946  [gene]  locus=scaffold47970_...
      blastp
      metahit_2014_pep
      d9539_asm_v1.2_29
      647.0
      3429
      5369
      +
      100.0
    
    
      161
      d9539_asm_v1.2_29
      MH0136_GL0000112
      89.352
      648
      68
      1
      1
      647
      1
      648
      ...
      100
      MH0136_GL0000112  [gene]  locus=C1259564_1:206...
      blastp
      metahit_2014_pep
      d9539_asm_v1.2_29
      647.0
      3429
      5369
      +
      100.0
    
  

5 rows × 28 columns



In [6]:

    
assert blastp_hits_annot.shape[0] == blastp_hits.shape[0]

2.3 Extract best hit for each ORF ( q_cov > 0.8 and pct_id > 40% and e-value < 1)

Define these resulting 7 ORFs as the core ORFs for the d9539 assembly.

The homology between the Metahit gene catalogue is very good, and considering the catalogue was curated on a big set of gut metagenomes, it is reasonable to assume that these putative proteins would come from our detected circular putative virus/phage genome

Two extra notes:

Additionally, considering only these 7 ORFs , almost the entire genomic region is covered, with very few non-coding regions, still consistent with the hypothesis of a small viral genome which should be mainly coding
Also, even though the naive ORF finder detected putative ORFs in both positive and negative strands, the supported ORFs only occur in the positive strand. This could be an indication of a ssDNA or ssRNA virus.



In [7]:

    
! mkdir -p 4_msa_prots



In [8]:

    
#Get best hit (highest bitscore) for each ORF
gb = blastp_hits_annot[ (blastp_hits_annot.q_cov > 80) & (blastp_hits_annot.pct_id > 40) & (blastp_hits_annot.e_value < 1) ].groupby("query_id")
reliable_orfs = pd.DataFrame( hits.ix[hits.bitscore.idxmax()] for q_id,hits in gb )[["query_id","db","subject_id","pct_id","q_cov","q_len",
                                                                                      "bitscore","e_value","strand","q_cds_start","q_cds_end"]]
reliable_orfs = reliable_orfs.sort_values(by="q_cds_start",ascending=True)
reliable_orfs









    Out[8]:






  
    
      
      query_id
      db
      subject_id
      pct_id
      q_cov
      q_len
      bitscore
      e_value
      strand
      q_cds_start
      q_cds_end
    
  
  
    
      317
      d9539_asm_v1.2_1
      metahit_2014_pep
      MH0148_GL0013503
      92.593
      100
      27
      57.4
      2.310000e-10
      +
      25
      105
    
    
      147
      d9539_asm_v1.2_3
      metahit_2014_pep
      MH0203_GL0004312
      90.323
      86
      36
      57.8
      1.450000e-10
      +
      130
      237
    
    
      10
      d9539_asm_v1.2_5
      metahit_2014_pep
      MH0409_GL0061975
      91.379
      100
      58
      113.0
      1.220000e-31
      +
      215
      388
    
    
      282
      d9539_asm_v1.2_11
      metahit_2014_pep
      V1.FI30_GL0086214
      91.449
      100
      421
      801.0
      0.000000e+00
      +
      393
      1655
    
    
      261
      d9539_asm_v1.2_14
      metahit_2014_pep
      O2.UC35-0_GL0103272
      73.333
      86
      174
      221.0
      2.390000e-71
      +
      1603
      2124
    
    
      247
      d9539_asm_v1.2_16
      metahit_2014_pep
      DOM013_GL0003401
      82.857
      100
      70
      115.0
      2.460000e-32
      +
      2179
      2388
    
    
      205
      d9539_asm_v1.2_22
      metahit_2014_pep
      MH0398_GL0192275
      90.831
      97
      361
      658.0
      0.000000e+00
      +
      2351
      3433
    
    
      157
      d9539_asm_v1.2_29
      metahit_2014_pep
      MH0413_GL0135536
      93.354
      100
      647
      1255.0
      0.000000e+00
      +
      3429
      5369
    
    
      124
      d9539_asm_v1.2_30
      metahit_2014_pep
      MH0039_GL0018308
      98.969
      100
      97
      190.0
      5.350000e-61
      +
      5374
      5664

2.4 Extract selected orfs for further analysis



In [9]:

    
reliable_orfs["orf_id"] = ["orf{}".format(x) for x in range(1,reliable_orfs.shape[0]+1) ]
reliable_orfs["cds_len"] = reliable_orfs["q_cds_end"] - reliable_orfs["q_cds_start"] +1
reliable_orfs.sort_values(by="q_cds_start",ascending=True).to_csv("3_filtered_orfs/filt_orf_stats.csv",index=False,header=True)
reliable_orfs.sort_values(by="q_cds_start",ascending=True).to_csv("3_filtered_orfs/filt_orf_list.txt",index=False,header=False,columns=["query_id"])

2.4.2 Extract fasta



In [10]:

    
! ~/utils/bin/seqtk subseq 1_orf/d9539_asm_v1.2_orf.fa 3_filtered_orfs/filt_orf_list.txt > 3_filtered_orfs/d9539_asm_v1.2_orf_filt.fa

2.4.3 Write out filtered blast hits



In [11]:

    
filt_blastp_hits = blastp_hits_annot[ blastp_hits_annot.query_id.apply(lambda x: x in reliable_orfs.query_id.tolist())]
filt_blastp_hits.to_csv("3_filtered_orfs/d9539_asm_v1.2_orf_filt_blastp.csv")
filt_blastp_hits.head()









    Out[11]:






  
    
      
      query_id
      subject_id
      pct_id
      ali_len
      mism
      gap_open
      q_start
      q_end
      s_start
      s_end
      ...
      q_cov
      s_description
      tool
      db
      q_id
      orf_len
      q_cds_start
      q_cds_end
      strand
      q_cov_calc
    
  
  
    
      10
      d9539_asm_v1.2_5
      MH0409_GL0061975
      91.379
      58
      5
      0
      1
      58
      29
      86
      ...
      100
      MH0409_GL0061975  [gene]  locus=scaffold117357...
      blastp
      metahit_2014_pep
      d9539_asm_v1.2_5
      58.0
      215
      388
      +
      100.0
    
    
      11
      d9539_asm_v1.2_5
      675950834-stool1_revised_C814893_1_gene71376
      89.655
      58
      6
      0
      1
      58
      1
      58
      ...
      100
      675950834-stool1_revised_C814893_1_gene71376 s...
      blastp
      metahit_2014_pep
      d9539_asm_v1.2_5
      58.0
      215
      388
      +
      100.0
    
    
      12
      d9539_asm_v1.2_5
      MH0105_GL0047475
      89.655
      58
      6
      0
      1
      58
      29
      86
      ...
      100
      MH0105_GL0047475  [gene]  locus=scaffold1014_1...
      blastp
      metahit_2014_pep
      d9539_asm_v1.2_5
      58.0
      215
      388
      +
      100.0
    
    
      13
      d9539_asm_v1.2_5
      MH0203_GL0004312
      89.655
      58
      6
      0
      1
      58
      29
      86
      ...
      100
      MH0203_GL0004312  [gene]  locus=C2075218_1:153...
      blastp
      metahit_2014_pep
      d9539_asm_v1.2_5
      58.0
      215
      388
      +
      100.0
    
    
      14
      d9539_asm_v1.2_5
      MH0033_GL0005948
      89.655
      58
      6
      0
      1
      58
      29
      86
      ...
      100
      MH0033_GL0005948  [gene]  locus=scaffold47970_...
      blastp
      metahit_2014_pep
      d9539_asm_v1.2_5
      58.0
      215
      388
      +
      100.0
    
  

5 rows × 28 columns

gb = filt_blastp_hits.groupby("query_id") for q_id,hits in gb: for db in ["metahit_pep","hmp_pep","env_nr","nr"]: try: os.mkdir("3_filtered_orfs/msa/{}".format(q_id)) except: pass hits_to_write = hits[hits.db == db]["subject_id"].unique() if hits_to_write.shape != (0,): hits_to_write.tofile("3_filtered_orfs/msa/{0}/{0}_{1}.txt".format(q_id,db),sep="\n")

	q_id	orf_len	q_cds_start	q_cds_end	strand
0	d9539_asm_v1.2_1	27.0	25	105	+
1	d9539_asm_v1.2_2	10.0	98	127	+
2	d9539_asm_v1.2_3	36.0	130	237	+
3	d9539_asm_v1.2_4	52.0	105	260	+
4	d9539_asm_v1.2_5	58.0	215	388	+

	query_id	subject_id	pct_id	ali_len	mism	gap_open	q_start	q_end	s_start	s_end	...	q_cov	s_description	tool	db	q_id	orf_len	q_cds_start	q_cds_end	strand	q_cov_calc
157	d9539_asm_v1.2_29	MH0413_GL0135536	93.354	647	43	0	1	647	1	647	...	100	MH0413_GL0135536 [gene] locus=scaffold143833...	blastp	metahit_2014_pep	d9539_asm_v1.2_29	647.0	3429	5369	+	100.0
158	d9539_asm_v1.2_29	MH0039_GL0018307	93.045	647	45	0	1	647	1	647	...	100	MH0039_GL0018307 [gene] locus=scaffold66015_...	blastp	metahit_2014_pep	d9539_asm_v1.2_29	647.0	3429	5369	+	100.0
159	d9539_asm_v1.2_29	MH0409_GL0129297	91.963	647	52	0	1	647	1	647	...	100	MH0409_GL0129297 [gene] locus=scaffold117357...	blastp	metahit_2014_pep	d9539_asm_v1.2_29	647.0	3429	5369	+	100.0
160	d9539_asm_v1.2_29	MH0033_GL0005946	89.352	648	68	1	1	647	1	648	...	100	MH0033_GL0005946 [gene] locus=scaffold47970_...	blastp	metahit_2014_pep	d9539_asm_v1.2_29	647.0	3429	5369	+	100.0
161	d9539_asm_v1.2_29	MH0136_GL0000112	89.352	648	68	1	1	647	1	648	...	100	MH0136_GL0000112 [gene] locus=C1259564_1:206...	blastp	metahit_2014_pep	d9539_asm_v1.2_29	647.0	3429	5369	+	100.0

	query_id	db	subject_id	pct_id	q_cov	q_len	bitscore	e_value	strand	q_cds_start	q_cds_end
317	d9539_asm_v1.2_1	metahit_2014_pep	MH0148_GL0013503	92.593	100	27	57.4	2.310000e-10	+	25	105
147	d9539_asm_v1.2_3	metahit_2014_pep	MH0203_GL0004312	90.323	86	36	57.8	1.450000e-10	+	130	237
10	d9539_asm_v1.2_5	metahit_2014_pep	MH0409_GL0061975	91.379	100	58	113.0	1.220000e-31	+	215	388
282	d9539_asm_v1.2_11	metahit_2014_pep	V1.FI30_GL0086214	91.449	100	421	801.0	0.000000e+00	+	393	1655
261	d9539_asm_v1.2_14	metahit_2014_pep	O2.UC35-0_GL0103272	73.333	86	174	221.0	2.390000e-71	+	1603	2124
247	d9539_asm_v1.2_16	metahit_2014_pep	DOM013_GL0003401	82.857	100	70	115.0	2.460000e-32	+	2179	2388
205	d9539_asm_v1.2_22	metahit_2014_pep	MH0398_GL0192275	90.831	97	361	658.0	0.000000e+00	+	2351	3433
157	d9539_asm_v1.2_29	metahit_2014_pep	MH0413_GL0135536	93.354	100	647	1255.0	0.000000e+00	+	3429	5369
124	d9539_asm_v1.2_30	metahit_2014_pep	MH0039_GL0018308	98.969	100	97	190.0	5.350000e-61	+	5374	5664