In [1]:
import pandas as pd
import re
from glob import glob
import requests
from bs4 import BeautifulSoup
In [2]:
# Load the combined BLAST results and split them by tool.
all_blast_hits = pd.read_csv("all_blast_hits.csv")
# blastn = nucleotide-level hits, blastp = protein-level hits
blastn_hits = all_blast_hits.query("tool == 'blastn'")
blastp_hits = all_blast_hits.query("tool == 'blastp'")
In [3]:
# Pick the 10 longest env_nt blastn alignments as candidates for the MSA.
env_nt_blastn = blastn_hits[blastn_hits.db == "env_nt"]
seqs_for_msa = env_nt_blastn.sort_values(by="ali_len", ascending=False).head(n=10)
#Export megahit ids to extract directly from fasta : Empty!
#seqs_for_msa[seqs_for_msa.db == "hmp_nuc"]["subject_id"].to_csv("d9539_hmp_homologs.txt",sep="\t",index=False,header=False)
Obtain FASTA sequences for the env_nt homologs via NCBI E-utilities (efetch)
In [4]:
# Use NCBI efetch to download the env_nt homolog sequences and save the
# FASTA to a file. NOTE: E-utilities require HTTPS — plain http is the
# original bug here. `set(...)` de-duplicates GIs before building the query.
gis_to_get = ",".join(set(str(x) for x in seqs_for_msa[seqs_for_msa.db == "env_nt"]["gi"]))
efetch_url = ("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
              "?db=nuccore&id={}&rettype=fasta&retmode=text".format(gis_to_get))
r = requests.get(efetch_url, timeout=60)
# Fail loudly on an HTTP error instead of silently writing an error page
# into the FASTA file.
r.raise_for_status()
with open("3_assembly_msa/d9539_env_nt_homologs.fa", "w") as env_nt_fh:
    # r.text is the decoded response body (equivalent to r.content.decode())
    env_nt_fh.write(r.text)
In [5]:
# Drop the blank lines efetch leaves between records; sed "/^$/d" deletes
# empty lines and writes the cleaned copy under the final .fasta name,
# then the intermediate .fa download is removed.
!sed "/^$/d" 3_assembly_msa/d9539_env_nt_homologs.fa > 3_assembly_msa/d9539_env_nt_homologs.fasta
!rm 3_assembly_msa/d9539_env_nt_homologs.fa
In [6]:
# Parse ORF coordinates from EMBOSS getorf FASTA headers.
# Expected header shape: ">{contig}_{n} [{start} - {end}] " with an optional
# "(REVERSE SENSE)" suffix on minus-strand ORFs, where the two coordinates
# are reported high-to-low.
header_regex = re.compile(r">(.+_[0-9]+?) \[([0-9]+) - ([0-9]+)\] ")
orf_stats = []
# 'with' guarantees the handle is closed (the original leaked it).
with open("1_orfs/d9539_asm_v1.2_emboss_orf1.fa") as fh:
    for line in fh:
        header_match = header_regex.match(line)
        if not header_match:
            continue
        is_reverse = line.rstrip(" \n").endswith("(REVERSE SENSE)")
        # Normalize so q_cds_start <= q_cds_end on the contig regardless
        # of strand (reverse headers list coordinates swapped).
        q_cds_start = int(header_match.group(2) if not is_reverse else header_match.group(3))
        q_cds_end = int(header_match.group(3) if not is_reverse else header_match.group(2))
        # Length of the ORF in amino acids; a getorf span is a whole number
        # of codons, so integer division is exact (and keeps the column int).
        q_len = (q_cds_end - q_cds_start + 1) // 3
        orf_stats.append({"q_id": q_id, "q_len": q_len,
                          "q_cds_start": q_cds_start, "q_cds_end": q_cds_end,
                          "strand": "-" if is_reverse else "+"}
                         if False else
                         {"q_id": header_match.group(1), "q_len": q_len,
                          "q_cds_start": q_cds_start, "q_cds_end": q_cds_end,
                          "strand": "-" if is_reverse else "+"})
# Building the frame once from records preserves proper per-column dtypes
# (the original's mixed-type pd.Series rows forced everything to object).
orf_stats_df = pd.DataFrame(orf_stats,
                            columns=["q_id", "q_len", "q_cds_start", "q_cds_end", "strand"])
orf_stats_df.head()
Out[6]:
In [7]:
# Persist the per-ORF coordinate/length table as CSV (the original comment
# said "fasta", but this writes tabular stats, not sequences).
orf_stats_df.to_csv("orf_stats.csv",index=False)
In [8]:
# Attach ORF coordinates/length to each blastp hit, then compute how much
# of the ORF the alignment covers.
blastp_hits_annot = blastp_hits.merge(orf_stats_df, left_on="query_id", right_on="q_id")
# Alignment coordinates are inclusive, hence the +1 when measuring length.
aligned_aa = blastp_hits_annot["q_end"] - blastp_hits_annot["q_start"] + 1
blastp_hits_annot["q_cov"] = aligned_aa / blastp_hits_annot["q_len"]
blastp_hits_annot.sort_values(by="bitscore", ascending=False).head()
Out[8]:
In [9]:
# Create the output directory for the filtered-ORF table (-p: no error if it exists).
! mkdir -p 4_msa_prots
In [12]:
# Keep ORFs with a convincing protein hit (>80% query coverage, >30%
# identity, e-value < 1) and reduce to the single best hit — highest
# bitscore — per ORF.
good_hits = blastp_hits_annot[(blastp_hits_annot.q_cov > 0.8)
                              & (blastp_hits_annot.pct_id > 30)
                              & (blastp_hits_annot.e_value < 1)]
# DataFrame.ix was removed in pandas 1.0; groupby().idxmax() + .loc selects
# the same per-group argmax rows (groupby sorts keys, matching the original
# iteration order).
best_idx = good_hits.groupby("query_id")["bitscore"].idxmax()
reliable_orfs = good_hits.loc[best_idx, ["query_id","db","subject_id","pct_id","q_cov","q_len",
                                         "bitscore","e_value","strand","q_cds_start","q_cds_end"]]
reliable_orfs.sort_values(by="q_cds_start",ascending=True).to_csv("4_msa_prots/filtered_orfs_by_hits.csv",index=False)
reliable_orfs.sort_values(by="q_cds_start",ascending=True)
Out[12]:
In [ ]:
# Exploratory: list hmmscan tabular outputs (not used downstream in this notebook).
glob("2_hmmscan/*.tbl")