In [1]:

    
import pandas as pd
import re
import itertools
import matplotlib_venn
import numpy as np

1. Load blast hits , hmmer hits for MetaHIT and cluster stats



In [2]:

    
#Load blast best hits
filt_blast_hits = pd.read_csv("1_out/filtered_blast_all_hits.csv")
print(filt_blast_hits.shape)
#Load hmmer best hits
filt_hmmer_hits = pd.read_csv("1_out/filtered_hmmer_all_hits.csv")
print(filt_hmmer_hits.shape)

all_filt_hits = pd.concat([filt_blast_hits,filt_hmmer_hits],axis=0)
#Load cluster stats
clusters = pd.read_csv("1_out/cluster_stats.csv")
clusters["sample_origin"] = clusters["sample_origin"].apply(lambda x:x.replace(",",";"))









    



(3400, 23)
(1882, 22)



In [3]:

    
#Get MetaHIT 2014 IGC hits
metahit_2014 = all_filt_hits[all_filt_hits.db.apply(lambda x: ("metahit" in x) and ("2014" in x))]
print(metahit_2014.shape)
metahit_2014.head()









    



(3389, 37)






    Out[3]:






  
    
      
      ali_len
      best_dmn_bias
      best_dmn_bitscore
      best_dmn_e_value
      bias
      bitscore
      cluster
      db
      dne_clu
      dne_dom
      ...
      s_description
      s_end
      s_gi
      s_len
      s_names
      s_scinames
      s_start
      s_taxids
      subject_id
      tool
    
  
  
    
      1566
      270.0
      NaN
      NaN
      NaN
      NaN
      431.0
      1073
      metahit_2014_cds
      NaN
      NaN
      ...
      MH0019_GL0030364  [gene]  locus=scaffold39654_...
      35.0
      0.0
      549.0
      NaN
      NaN
      304.0
      NaN
      MH0019_GL0030364
      blastn
    
    
      1567
      270.0
      NaN
      NaN
      NaN
      NaN
      420.0
      1073
      metahit_2014_cds
      NaN
      NaN
      ...
      765620695-stool1_revised_scaffold38964_2_gene1...
      35.0
      0.0
      546.0
      NaN
      NaN
      304.0
      NaN
      765620695-stool1_revised_scaffold38964_2_gene1...
      blastn
    
    
      1568
      263.0
      NaN
      NaN
      NaN
      NaN
      254.0
      1073
      metahit_2014_cds
      NaN
      NaN
      ...
      MH0048_GL0036718  [gene]  locus=scaffold33886_...
      35.0
      0.0
      537.0
      NaN
      NaN
      294.0
      NaN
      MH0048_GL0036718
      blastn
    
    
      1569
      274.0
      NaN
      NaN
      NaN
      NaN
      409.0
      1073
      metahit_2014_cds
      NaN
      NaN
      ...
      V1.CD14-0_GL0044222  [gene]  locus=scaffold182...
      45.0
      0.0
      567.0
      NaN
      NaN
      318.0
      NaN
      V1.CD14-0_GL0044222
      blastn
    
    
      1570
      259.0
      NaN
      NaN
      NaN
      NaN
      401.0
      1073
      metahit_2014_cds
      NaN
      NaN
      ...
      V1.CD6-0-PT_GL0032215  [gene]  locus=scaffold1...
      42.0
      0.0
      564.0
      NaN
      NaN
      300.0
      NaN
      V1.CD6-0-PT_GL0032215
      blastn
    
  

5 rows × 37 columns



In [4]:

    
#Get MetaHIT 2010 hits
metahit_2010 = all_filt_hits[all_filt_hits.db.apply(lambda x: ("metahit" in x) and ("2014" not in x))]
print(metahit_2010.shape)
metahit_2010.head()









    



(1206, 37)






    Out[4]:






  
    
      
      ali_len
      best_dmn_bias
      best_dmn_bitscore
      best_dmn_e_value
      bias
      bitscore
      cluster
      db
      dne_clu
      dne_dom
      ...
      s_description
      s_end
      s_gi
      s_len
      s_names
      s_scinames
      s_start
      s_taxids
      subject_id
      tool
    
  
  
    
      536
      270.0
      NaN
      NaN
      NaN
      NaN
      431.0
      1073
      metahit_cds
      NaN
      NaN
      ...
      GL0053633_MH0019_[Complete]_[mRNA]_locus=scaff...
      35.0
      0.0
      549.0
      NaN
      NaN
      304.0
      NaN
      GL0053633_MH0019_[Complete]_[mRNA]_locus=scaff...
      blastn
    
    
      537
      259.0
      NaN
      NaN
      NaN
      NaN
      412.0
      1073
      metahit_cds
      NaN
      NaN
      ...
      GL0080651_MH0011_[Complete]_[mRNA]_locus=scaff...
      45.0
      0.0
      555.0
      NaN
      NaN
      303.0
      NaN
      GL0080651_MH0011_[Complete]_[mRNA]_locus=scaff...
      blastn
    
    
      538
      270.0
      NaN
      NaN
      NaN
      NaN
      243.0
      1073
      metahit_cds
      NaN
      NaN
      ...
      GL0144339_V1_[Complete]_[mRNA]_locus=scaffold3...
      51.0
      0.0
      561.0
      NaN
      NaN
      315.0
      NaN
      GL0144339_V1_[Complete]_[mRNA]_locus=scaffold3...
      blastn
    
    
      539
      273.0
      NaN
      NaN
      NaN
      NaN
      444.0
      1073
      metahit_cds
      NaN
      NaN
      ...
      GL0080651_MH0011_[Complete]_[mRNA]_locus=scaff...
      31.0
      0.0
      555.0
      NaN
      NaN
      303.0
      NaN
      GL0080651_MH0011_[Complete]_[mRNA]_locus=scaff...
      blastn
    
    
      540
      283.0
      NaN
      NaN
      NaN
      NaN
      272.0
      1073
      metahit_cds
      NaN
      NaN
      ...
      GL0144339_V1_[Complete]_[mRNA]_locus=scaffold3...
      38.0
      0.0
      561.0
      NaN
      NaN
      315.0
      NaN
      GL0144339_V1_[Complete]_[mRNA]_locus=scaffold3...
      blastn
    
  

5 rows × 37 columns



In [5]:

    
print(clusters.shape)
clusters.head()









    



(32, 23)






    Out[5]:






  
    
      
      Cluster
      # nr seqs
      # raw seqs
      Score
      log RNAcode score
      P
      ORF length (aa)
      log ORF length
      complexity
      min
      ...
      min.1
      avg.1
      max.1
      # seqs with max length
      composite score
      DNA alignment
      protein alignment
      RNAcode figure
      cluster_id
      sample_origin
    
  
  
    
      0
      1217
      6
      6
      41.974
      1.62
      7.704e–10
      91
      1.96
      hi
      3.79
      ...
      38
      73.67
      87
      2
      7.57
      <a href="all_nt_aln/cluster1217_DNA.aln.txt">D...
      <a href="all_aa_aln/cluster1217_AA.aln.txt">AA...
      <a href="rnacode_all/cluster1217_eps/hss-0.eps...
      1217
      feces
    
    
      1
      339b
      7
      11
      33.153
      1.52
      1.069e–06
      110
      2.04
      hi
      3.89
      ...
      63
      79.14
      106
      2
      7.52
      <a href="all_nt_aln/cluster339b_DNA.aln.txt">D...
      <a href="all_aa_aln/cluster339b_AA.aln.txt">AA...
      <a href="rnacode_all/cluster339b_eps/hss-0.eps...
      339b
      feces
    
    
      2
      211b
      7
      22
      42.949
      1.63
      3.364e–14
      80
      1.90
      hi
      3.72
      ...
      31
      62.00
      73
      2
      7.49
      <a href="all_nt_aln/cluster211b_DNA.aln.txt">D...
      <a href="all_aa_aln/cluster211b_AA.aln.txt">AA...
      <a href="rnacode_all/cluster211b_eps/hss-0.eps...
      211b
      feces
    
    
      3
      562
      7
      9
      39.417
      1.60
      4.592e–07
      61
      1.79
      hi
      3.78
      ...
      29
      49.57
      59
      8
      7.39
      <a href="all_nt_aln/cluster562_DNA.aln.txt">DN...
      <a href="all_aa_aln/cluster562_AA.aln.txt">AA</a>
      <a href="rnacode_all/cluster562_eps/hss-0.eps"...
      562
      CSF;serum
    
    
      4
      297a
      5
      14
      26.891
      1.43
      0.001
      106
      2.03
      hi
      3.90
      ...
      59
      96.60
      106
      8
      7.38
      <a href="all_nt_aln/cluster297a_DNA.aln.txt">D...
      <a href="all_aa_aln/cluster297a_AA.aln.txt">AA...
      <a href="rnacode_all/cluster297a_eps/hss-0.eps...
      297a
      feces
    
  

5 rows × 23 columns

Load Metahit annotations



In [6]:

    
metahit_2014_cols = ["gene_id","gene_name","gene_length","gene_completness",
                     "cohort_origin","phylum","genus","kegg","eggNOG",
                    "sample_freq","individual_freq","kegg_categories","eggnog_fx_categories","cohort_assembled"]

metahit_2014_annot = pd.read_csv("~/resources/IGC.annotation_OF.summary",sep="\t",header=None,names=metahit_2014_cols)

print(metahit_2014_annot.shape)
metahit_2014_annot.head()









    



(9879896, 14)






    Out[6]:






  
    
      
      gene_id
      gene_name
      gene_length
      gene_completness
      cohort_origin
      phylum
      genus
      kegg
      eggNOG
      sample_freq
      individual_freq
      kegg_categories
      eggnog_fx_categories
      cohort_assembled
    
  
  
    
      0
      1
      T2D-6A_GL0083352
      88230
      Complete
      CHN
      unknown
      unknown
      K01824
      COG5184
      0.224152
      0.236449
      Lipid Metabolism
      Cell cycle control, cell division, chromosome ...
      EUR;CHN;USA
    
    
      1
      2
      V1.UC4-5_GL0154511
      88086
      Complete
      EUR
      unknown
      unknown
      K01824
      COG5184
      0.352802
      0.351402
      Lipid Metabolism
      Cell cycle control, cell division, chromosome ...
      EUR;CHN;USA
    
    
      2
      3
      MH0198_GL0136826
      67464
      Lack 5'-end
      EUR
      unknown
      unknown
      K01824
      COG5184
      0.281768
      0.293458
      Lipid Metabolism
      Cell cycle control, cell division, chromosome ...
      EUR;CHN
    
    
      3
      4
      V1.FI28_GL0106390
      45267
      Complete
      EUR
      unknown
      unknown
      K03924
      NOG12793
      0.278611
      0.267290
      Metabolism
      Function unknown
      EUR;USA
    
    
      4
      5
      MH0318_GL0144550
      42243
      Complete
      EUR
      unknown
      unknown
      K13730
      COG4886
      0.001579
      0.001869
      Infectious Diseases
      Function unknown
      EUR

3. Annotate hits with metadata



In [7]:

    
#MetaHIT 2014
metahit_2014_df = pd.merge(metahit_2014, metahit_2014_annot, left_on="subject_id", right_on="gene_name",how="left")
print(metahit_2014_df.shape)
metahit_2014_df.head()









    



(3389, 51)






    Out[7]:






  
    
      
      ali_len
      best_dmn_bias
      best_dmn_bitscore
      best_dmn_e_value
      bias
      bitscore
      cluster
      db
      dne_clu
      dne_dom
      ...
      cohort_origin
      phylum
      genus
      kegg
      eggNOG
      sample_freq
      individual_freq
      kegg_categories
      eggnog_fx_categories
      cohort_assembled
    
  
  
    
      0
      270.0
      NaN
      NaN
      NaN
      NaN
      431.0
      1073
      metahit_2014_cds
      NaN
      NaN
      ...
      EUR
      unknown
      unknown
      unknown
      unknown
      0.041831
      0.045794
      unknown
      unknown
      EUR;CHN;USA
    
    
      1
      270.0
      NaN
      NaN
      NaN
      NaN
      420.0
      1073
      metahit_2014_cds
      NaN
      NaN
      ...
      USA
      unknown
      unknown
      unknown
      unknown
      0.014996
      0.015888
      unknown
      unknown
      USA
    
    
      2
      263.0
      NaN
      NaN
      NaN
      NaN
      254.0
      1073
      metahit_2014_cds
      NaN
      NaN
      ...
      EUR
      unknown
      unknown
      unknown
      unknown
      0.116811
      0.111215
      unknown
      unknown
      EUR
    
    
      3
      274.0
      NaN
      NaN
      NaN
      NaN
      409.0
      1073
      metahit_2014_cds
      NaN
      NaN
      ...
      EUR
      unknown
      unknown
      unknown
      unknown
      0.014996
      0.017757
      unknown
      unknown
      EUR;CHN;USA
    
    
      4
      259.0
      NaN
      NaN
      NaN
      NaN
      401.0
      1073
      metahit_2014_cds
      NaN
      NaN
      ...
      EUR
      unknown
      unknown
      unknown
      unknown
      0.021310
      0.024299
      unknown
      unknown
      EUR
    
  

5 rows × 51 columns

MetaHIT 2014

a. Tax annotation

Apparently there is no tax annotation for the hits we have in MetaHIT :D



In [8]:

    
metahit_2014_df.shape
print(metahit_2014_df[ metahit_2014_df.phylum != "unknown" ].shape)
print(metahit_2014_df[ metahit_2014_df.genus != "unknown" ].shape)









    



(0, 51)
(0, 51)

b. KEGG



In [9]:

    
metahit_2014_df[metahit_2014_df.kegg != "unknown"][["cluster","tool","db","kegg","kegg_categories"]].drop_duplicates()









    Out[9]:






  
    
      
      cluster
      tool
      db
      kegg
      kegg_categories
    
  
  
    
      619
      348
      blastn
      metahit_2014_cds
      K02314
      Replication and Repair
    
    
      1504
      348
      blastp
      metahit_2014_pep
      K02314
      Replication and Repair
    
    
      2897
      348
      hmmsearch
      metahit_2014_pep
      K02314
      Replication and Repair
    
    
      3234
      532
      hmmsearch
      metahit_2014_pep
      K06915
      Poorly Characterized

c. eggNOG



In [10]:

    
metahit_2014_df[metahit_2014_df.eggNOG != "unknown"][["cluster","tool","db","eggNOG","eggnog_fx_categories"]].drop_duplicates()









    Out[10]:






  
    
      
      cluster
      tool
      db
      eggNOG
      eggnog_fx_categories
    
  
  
    
      619
      348
      blastn
      metahit_2014_cds
      COG0305
      Replication, recombination and repair
    
    
      1504
      348
      blastp
      metahit_2014_pep
      COG0305
      Replication, recombination and repair
    
    
      1882
      1085
      hmmsearch
      metahit_2014_pep
      NOG269643
      unknown
    
    
      2897
      348
      hmmsearch
      metahit_2014_pep
      COG0305
      Replication, recombination and repair
    
    
      3234
      532
      hmmsearch
      metahit_2014_pep
      COG0433
      General function prediction only

d. Individual and sample frequency in the MetaHIT samples



In [11]:

    
gb = metahit_2014_df[["cluster","individual_freq"]].groupby("cluster")
max_individual_freq = pd.DataFrame([ df.ix[ df.individual_freq.idxmax() ] for _,df in gb ] )

gb = metahit_2014_df[["cluster","sample_freq"]].groupby("cluster")
max_sample_freq = pd.DataFrame([ df.ix[ df.sample_freq.idxmax() ] for _,df in gb ] )

pd.merge(max_individual_freq,max_sample_freq,on="cluster",how="outer")









    Out[11]:






  
    
      
      cluster
      individual_freq
      sample_freq
    
  
  
    
      0
      1073
      0.295327
      0.298343
    
    
      1
      1085
      0.004673
      0.004736
    
    
      2
      113b
      0.029907
      0.025257
    
    
      3
      1217
      0.228972
      0.220205
    
    
      4
      179a
      0.024299
      0.025257
    
    
      5
      179b
      0.024299
      0.025257
    
    
      6
      182a
      0.027103
      0.026835
    
    
      7
      182b
      0.027103
      0.026835
    
    
      8
      211a
      0.024299
      0.023678
    
    
      9
      211b
      0.024299
      0.023678
    
    
      10
      241a
      0.024299
      0.023678
    
    
      11
      241b
      0.034579
      0.033938
    
    
      12
      258
      0.035514
      0.037885
    
    
      13
      297a
      0.043925
      0.052092
    
    
      14
      297b
      0.043925
      0.052092
    
    
      15
      321b
      0.018692
      0.022099
    
    
      16
      339b
      0.019626
      0.022099
    
    
      17
      348
      0.210280
      0.217048
    
    
      18
      375a
      0.024299
      0.025257
    
    
      19
      406b
      0.168224
      0.161800
    
    
      20
      502
      0.024299
      0.023678
    
    
      21
      532
      0.121495
      0.127072
    
    
      22
      589
      0.017757
      0.018942
    
    
      23
      612a
      0.252336
      0.257301
    
    
      24
      787b
      0.044860
      0.056038



In [ ]:

	ali_len	best_dmn_bias	best_dmn_bitscore	best_dmn_e_value	bias	bitscore	cluster	db	dne_clu	dne_dom	...	s_description	s_end	s_len	s_names	s_scinames	s_start	s_taxids	subject_id	tool
1566	270.0	NaN	NaN	NaN	NaN	431.0	1073	metahit_2014_cds	NaN	NaN	...	MH0019_GL0030364 [gene] locus=scaffold39654_...	35.0	549.0	NaN	NaN	304.0	NaN	MH0019_GL0030364	blastn
1567	270.0	NaN	NaN	NaN	NaN	420.0	1073	metahit_2014_cds	NaN	NaN	...	765620695-stool1_revised_scaffold38964_2_gene1...	35.0	546.0	NaN	NaN	304.0	NaN	765620695-stool1_revised_scaffold38964_2_gene1...	blastn
1568	263.0	NaN	NaN	NaN	NaN	254.0	1073	metahit_2014_cds	NaN	NaN	...	MH0048_GL0036718 [gene] locus=scaffold33886_...	35.0	537.0	NaN	NaN	294.0	NaN	MH0048_GL0036718	blastn
1569	274.0	NaN	NaN	NaN	NaN	409.0	1073	metahit_2014_cds	NaN	NaN	...	V1.CD14-0_GL0044222 [gene] locus=scaffold182...	45.0	567.0	NaN	NaN	318.0	NaN	V1.CD14-0_GL0044222	blastn
1570	259.0	NaN	NaN	NaN	NaN	401.0	1073	metahit_2014_cds	NaN	NaN	...	V1.CD6-0-PT_GL0032215 [gene] locus=scaffold1...	42.0	564.0	NaN	NaN	300.0	NaN	V1.CD6-0-PT_GL0032215	blastn

	Cluster	# nr seqs	# raw seqs	Score	log RNAcode score	P	ORF length (aa)	log ORF length	complexity	min	...	min.1	avg.1	max.1	# seqs with max length	composite score	DNA alignment	protein alignment	RNAcode figure	cluster_id	sample_origin
0	1217	6	6	41.974	1.62	7.704e–10	91	1.96	hi	3.79	...	38	73.67	87	2	7.57	<a href="all_nt_aln/cluster1217_DNA.aln.txt">D...	<a href="all_aa_aln/cluster1217_AA.aln.txt">AA...	<a href="rnacode_all/cluster1217_eps/hss-0.eps...	1217	feces
1	339b	7	11	33.153	1.52	1.069e–06	110	2.04	hi	3.89	...	63	79.14	106	2	7.52	<a href="all_nt_aln/cluster339b_DNA.aln.txt">D...	<a href="all_aa_aln/cluster339b_AA.aln.txt">AA...	<a href="rnacode_all/cluster339b_eps/hss-0.eps...	339b	feces
2	211b	7	22	42.949	1.63	3.364e–14	80	1.90	hi	3.72	...	31	62.00	73	2	7.49	<a href="all_nt_aln/cluster211b_DNA.aln.txt">D...	<a href="all_aa_aln/cluster211b_AA.aln.txt">AA...	<a href="rnacode_all/cluster211b_eps/hss-0.eps...	211b	feces
3	562	7	9	39.417	1.60	4.592e–07	61	1.79	hi	3.78	...	29	49.57	59	8	7.39	<a href="all_nt_aln/cluster562_DNA.aln.txt">DN...	<a href="all_aa_aln/cluster562_AA.aln.txt">AA</a>	<a href="rnacode_all/cluster562_eps/hss-0.eps"...	562	CSF;serum
4	297a	5	14	26.891	1.43	0.001	106	2.03	hi	3.90	...	59	96.60	106	8	7.38	<a href="all_nt_aln/cluster297a_DNA.aln.txt">D...	<a href="all_aa_aln/cluster297a_AA.aln.txt">AA...	<a href="rnacode_all/cluster297a_eps/hss-0.eps...	297a	feces

	gene_id	gene_name	gene_length	gene_completness	cohort_origin	phylum	genus	kegg	eggNOG	sample_freq	individual_freq	kegg_categories	eggnog_fx_categories	cohort_assembled
0	1	T2D-6A_GL0083352	88230	Complete	CHN	unknown	unknown	K01824	COG5184	0.224152	0.236449	Lipid Metabolism	Cell cycle control, cell division, chromosome ...	EUR;CHN;USA
1	2	V1.UC4-5_GL0154511	88086	Complete	EUR	unknown	unknown	K01824	COG5184	0.352802	0.351402	Lipid Metabolism	Cell cycle control, cell division, chromosome ...	EUR;CHN;USA
2	3	MH0198_GL0136826	67464	Lack 5'-end	EUR	unknown	unknown	K01824	COG5184	0.281768	0.293458	Lipid Metabolism	Cell cycle control, cell division, chromosome ...	EUR;CHN
3	4	V1.FI28_GL0106390	45267	Complete	EUR	unknown	unknown	K03924	NOG12793	0.278611	0.267290	Metabolism	Function unknown	EUR;USA
4	5	MH0318_GL0144550	42243	Complete	EUR	unknown	unknown	K13730	COG4886	0.001579	0.001869	Infectious Diseases	Function unknown	EUR

	cluster	tool	db	kegg	kegg_categories
619	348	blastn	metahit_2014_cds	K02314	Replication and Repair
1504	348	blastp	metahit_2014_pep	K02314	Replication and Repair
2897	348	hmmsearch	metahit_2014_pep	K02314	Replication and Repair
3234	532	hmmsearch	metahit_2014_pep	K06915	Poorly Characterized

	cluster	tool	db	eggNOG	eggnog_fx_categories
619	348	blastn	metahit_2014_cds	COG0305	Replication, recombination and repair
1504	348	blastp	metahit_2014_pep	COG0305	Replication, recombination and repair
1882	1085	hmmsearch	metahit_2014_pep	NOG269643	unknown
2897	348	hmmsearch	metahit_2014_pep	COG0305	Replication, recombination and repair
3234	532	hmmsearch	metahit_2014_pep	COG0433	General function prediction only

	cluster	individual_freq	sample_freq
0	1073	0.295327	0.298343
1	1085	0.004673	0.004736
2	113b	0.029907	0.025257
3	1217	0.228972	0.220205
4	179a	0.024299	0.025257
5	179b	0.024299	0.025257
6	182a	0.027103	0.026835
7	182b	0.027103	0.026835
8	211a	0.024299	0.023678
9	211b	0.024299	0.023678
10	241a	0.024299	0.023678
11	241b	0.034579	0.033938
12	258	0.035514	0.037885
13	297a	0.043925	0.052092
14	297b	0.043925	0.052092
15	321b	0.018692	0.022099
16	339b	0.019626	0.022099
17	348	0.210280	0.217048
18	375a	0.024299	0.025257
19	406b	0.168224	0.161800
20	502	0.024299	0.023678
21	532	0.121495	0.127072
22	589	0.017757	0.018942
23	612a	0.252336	0.257301
24	787b	0.044860	0.056038