notebook.community

Edit and run



In [1]:

    
%matplotlib inline



In [2]:

    
import pandas as pd
import re
import itertools
import matplotlib_venn



In [3]:

    
#Load blast best hits
filt_blast_hits = pd.read_csv("filtered_blast_hits.csv")
print(filt_blast_hits.shape)
#Load hmmer best hits
filt_hmmer_hits = pd.read_csv("filtered_hmmer_hits.csv")
print(filt_hmmer_hits.shape)

all_filt_hits = pd.concat([filt_blast_hits,filt_hmmer_hits],axis=0)
#Load cluster stats
clusters = pd.read_csv("cluster_stats.csv")









    



(66, 11)
(29, 9)



In [4]:

    
print(all_filt_hits.shape)
all_filt_hits.head()









    



(95, 12)






    Out[4]:






  
    
      
      best_dmn_e_value
      bitscore
      cluster
      db
      e_value
      pct_id
      q_cov
      q_len
      query_id
      s_description
      subject_id
      tool
    
  
  
    
      0
      NaN
      479.0
      1073
      env_nt
      1.000000e-131
      98.180
      99.0
      276.0
      GB3LKKR01A150I
      gut metagenome genome assembly P2E0-k21-2014-0...
      gi|936108378|emb|CEAB01076172.1|
      blastn
    
    
      1
      NaN
      444.0
      1073
      metahit_cds
      1.010000e-122
      95.971
      96.0
      285.0
      contig07331
      GL0080651_MH0011_[Complete]_[mRNA]_locus=scaff...
      GL0080651_MH0011_[Complete]_[mRNA]_locus=scaff...
      blastn
    
    
      2
      NaN
      159.0
      1073
      metahit_pep
      3.240000e-48
      97.727
      96.0
      92.0
      contig07331
      GL0080651_MH0011_[Complete]_[mRNA]_locus=scaff...
      GL0080651_MH0011_[Complete]_[mRNA]_locus=scaff...
      blastp
    
    
      3
      NaN
      165.0
      113b
      metahit_cds
      2.240000e-39
      98.913
      100.0
      92.0
      GB3LKKR01C5IFF
      GL0006538_MH0023_[Complete]_[mRNA]_locus=C1324...
      GL0006538_MH0023_[Complete]_[mRNA]_locus=C1324...
      blastn
    
    
      4
      NaN
      100.0
      113b
      env_nt
      3.000000e-18
      87.360
      89.0
      98.0
      GB3LKKR02F10X4
      Gut metagenome Scaffold6187_1, whole genome sh...
      gi|557595202|gb|AVOA01007617.1|
      blastn



In [5]:

    
print(clusters.shape)
clusters.head()









    



(32, 23)






    Out[5]:






  
    
      
      Cluster
      # nr seqs
      # raw seqs
      Score
      log RNAcode score
      P
      ORF length (aa)
      log ORF length
      complexity
      min
      ...
      min.1
      avg.1
      max.1
      # seqs with max length
      composite score
      DNA alignment
      protein alignment
      RNAcode figure
      cluster_id
      sample_origin
    
  
  
    
      0
      1217
      6
      6
      41.974
      1.62
      7.704e–10
      91
      1.96
      hi
      3.79
      ...
      38
      73.67
      87
      2
      7.57
      <a href="all_nt_aln/cluster1217_DNA.aln.txt">D...
      <a href="all_aa_aln/cluster1217_AA.aln.txt">AA...
      <a href="rnacode_all/cluster1217_eps/hss-0.eps...
      1217
      feces
    
    
      1
      339b
      7
      11
      33.153
      1.52
      1.069e–06
      110
      2.04
      hi
      3.89
      ...
      63
      79.14
      106
      2
      7.52
      <a href="all_nt_aln/cluster339b_DNA.aln.txt">D...
      <a href="all_aa_aln/cluster339b_AA.aln.txt">AA...
      <a href="rnacode_all/cluster339b_eps/hss-0.eps...
      339b
      feces
    
    
      2
      211b
      7
      22
      42.949
      1.63
      3.364e–14
      80
      1.90
      hi
      3.72
      ...
      31
      62.00
      73
      2
      7.49
      <a href="all_nt_aln/cluster211b_DNA.aln.txt">D...
      <a href="all_aa_aln/cluster211b_AA.aln.txt">AA...
      <a href="rnacode_all/cluster211b_eps/hss-0.eps...
      211b
      feces
    
    
      3
      562
      7
      9
      39.417
      1.60
      4.592e–07
      61
      1.79
      hi
      3.78
      ...
      29
      49.57
      59
      8
      7.39
      <a href="all_nt_aln/cluster562_DNA.aln.txt">DN...
      <a href="all_aa_aln/cluster562_AA.aln.txt">AA</a>
      <a href="rnacode_all/cluster562_eps/hss-0.eps"...
      562
      CSF,serum
    
    
      4
      297a
      5
      14
      26.891
      1.43
      0.001
      106
      2.03
      hi
      3.90
      ...
      59
      96.60
      106
      8
      7.38
      <a href="all_nt_aln/cluster297a_DNA.aln.txt">D...
      <a href="all_aa_aln/cluster297a_AA.aln.txt">AA...
      <a href="rnacode_all/cluster297a_eps/hss-0.eps...
      297a
      feces
    
  

5 rows × 23 columns

Stats

Hits per cluster



In [8]:

    
tmp = [(cluster,hits.shape[0],";".join(hits.db)) for cluster,hits in ( all_filt_hits[["cluster","db"]].drop_duplicates().groupby("cluster") )]
hits_x_cluster = pd.DataFrame.from_records(tmp,columns=["cluster","db_hit_count","dbs"])
print("Cluster families with no annotation: {}".format(clusters.shape[0]-hits_x_cluster.shape[0]))
hits_x_cluster = hits_x_cluster.merge(clusters,left_on="cluster",right_on="Cluster")[["cluster","db_hit_count","dbs","sample_origin"]]
hits_x_cluster.sort_values(by="db_hit_count",ascending=False).head()
hits_x_cluster.to_csv("hits_per_cluster.csv",index=False)









    



Cluster families with no annotation: 3

Hits per database



In [7]:

    
all_filt_hits[["cluster","db"]].drop_duplicates().groupby("db").count()









    Out[7]:






  
    
      
      cluster
    
    
      db
      
    
  
  
    
      env_nr
      2
    
    
      env_nt
      26
    
    
      hmp_pep
      1
    
    
      metahit_cds
      18
    
    
      metahit_pep
      22
    
    
      nr
      4



In [23]:

    
metahit_cds_list = set(all_filt_hits[all_filt_hits.db == "metahit_cds"]["cluster"].drop_duplicates().tolist())
metahit_pep_list = set(all_filt_hits[all_filt_hits.db == "metahit_pep"]["cluster"].drop_duplicates().tolist())
env_nt_list = set(all_filt_hits[all_filt_hits.db == "env_nt"]["cluster"].drop_duplicates().tolist())



In [31]:

    
for x in env_nt_list:
    print(x)



In [ ]:

	best_dmn_e_value	bitscore	cluster	db	e_value	pct_id	q_cov	q_len	query_id	s_description	subject_id	tool
0	NaN	479.0	1073	env_nt	1.000000e-131	98.180	99.0	276.0	GB3LKKR01A150I	gut metagenome genome assembly P2E0-k21-2014-0...	gi\|936108378\|emb\|CEAB01076172.1\|	blastn
1	NaN	444.0	1073	metahit_cds	1.010000e-122	95.971	96.0	285.0	contig07331	GL0080651_MH0011_[Complete]_[mRNA]_locus=scaff...	GL0080651_MH0011_[Complete]_[mRNA]_locus=scaff...	blastn
2	NaN	159.0	1073	metahit_pep	3.240000e-48	97.727	96.0	92.0	contig07331	GL0080651_MH0011_[Complete]_[mRNA]_locus=scaff...	GL0080651_MH0011_[Complete]_[mRNA]_locus=scaff...	blastp
3	NaN	165.0	113b	metahit_cds	2.240000e-39	98.913	100.0	92.0	GB3LKKR01C5IFF	GL0006538_MH0023_[Complete]_[mRNA]_locus=C1324...	GL0006538_MH0023_[Complete]_[mRNA]_locus=C1324...	blastn
4	NaN	100.0	113b	env_nt	3.000000e-18	87.360	89.0	98.0	GB3LKKR02F10X4	Gut metagenome Scaffold6187_1, whole genome sh...	gi\|557595202\|gb\|AVOA01007617.1\|	blastn

	Cluster	# nr seqs	# raw seqs	Score	log RNAcode score	P	ORF length (aa)	log ORF length	complexity	min	...	min.1	avg.1	max.1	# seqs with max length	composite score	DNA alignment	protein alignment	RNAcode figure	cluster_id	sample_origin
0	1217	6	6	41.974	1.62	7.704e–10	91	1.96	hi	3.79	...	38	73.67	87	2	7.57	<a href="all_nt_aln/cluster1217_DNA.aln.txt">D...	<a href="all_aa_aln/cluster1217_AA.aln.txt">AA...	<a href="rnacode_all/cluster1217_eps/hss-0.eps...	1217	feces
1	339b	7	11	33.153	1.52	1.069e–06	110	2.04	hi	3.89	...	63	79.14	106	2	7.52	<a href="all_nt_aln/cluster339b_DNA.aln.txt">D...	<a href="all_aa_aln/cluster339b_AA.aln.txt">AA...	<a href="rnacode_all/cluster339b_eps/hss-0.eps...	339b	feces
2	211b	7	22	42.949	1.63	3.364e–14	80	1.90	hi	3.72	...	31	62.00	73	2	7.49	<a href="all_nt_aln/cluster211b_DNA.aln.txt">D...	<a href="all_aa_aln/cluster211b_AA.aln.txt">AA...	<a href="rnacode_all/cluster211b_eps/hss-0.eps...	211b	feces
3	562	7	9	39.417	1.60	4.592e–07	61	1.79	hi	3.78	...	29	49.57	59	8	7.39	<a href="all_nt_aln/cluster562_DNA.aln.txt">DN...	<a href="all_aa_aln/cluster562_AA.aln.txt">AA</a>	<a href="rnacode_all/cluster562_eps/hss-0.eps"...	562	CSF,serum
4	297a	5	14	26.891	1.43	0.001	106	2.03	hi	3.90	...	59	96.60	106	8	7.38	<a href="all_nt_aln/cluster297a_DNA.aln.txt">D...	<a href="all_aa_aln/cluster297a_AA.aln.txt">AA...	<a href="rnacode_all/cluster297a_eps/hss-0.eps...	297a	feces