In [1]:

    
import pandas as pd
import re



In [2]:

    
! mkdir -p out/



In [3]:

    
blast_hits = pd.DataFrame.from_csv("d9539_asm_v1.2_orf_filt_blastp.csv")
print(blast_hits.shape)
hmmer_hits = pd.DataFrame.from_csv("d9539_asm_v1.2_orf_filt_hmmsearch_dom.csv")
print(hmmer_hits.shape)









    



(200, 28)
(6, 23)

1. Analyze nr and hmp_pep hits



In [5]:

    
other_blastp_hits = blast_hits[ (blast_hits.db == "nr") | (blast_hits.db == "hmp_pep")]
other_blastp_hits = other_blastp_hits.drop(["tool","q_id","mism","q_cov_calc","s_scinames","s_names","s_gi","s_taxids"],1)
#hmp database seesm to have duplicated seqs. Remove duplicated hits
print("Duplicate hits")
print(other_blastp_hits[other_blastp_hits.duplicated()][["query_id","subject_id","db"]])
other_blastp_hits = other_blastp_hits.drop_duplicates()









    



Duplicate hits
              query_id        subject_id       db
25    d9539_asm_v1.2_5   HMPREF9136_2685  hmp_pep
175  d9539_asm_v1.2_29  HMPREF1059_03257  hmp_pep
186  d9539_asm_v1.2_29   HMPREF9141_0985  hmp_pep
192  d9539_asm_v1.2_29      BACEGG_02721  hmp_pep
194  d9539_asm_v1.2_29  HMPREF1018_02255  hmp_pep
196  d9539_asm_v1.2_29    BACCOPRO_03325  hmp_pep
221  d9539_asm_v1.2_22  HMPREF1059_03256  hmp_pep
238  d9539_asm_v1.2_22   HMPREF9141_0984  hmp_pep
240  d9539_asm_v1.2_22  HMPREF0991_01395  hmp_pep
294  d9539_asm_v1.2_11      BACEGG_02723  hmp_pep
301  d9539_asm_v1.2_11   HMPREF0645_1833  hmp_pep
306  d9539_asm_v1.2_11   HMPREF9141_0987  hmp_pep
309  d9539_asm_v1.2_11   HMPREF9145_2738  hmp_pep
311  d9539_asm_v1.2_11   HMPREF0860_1608  hmp_pep

2. Get blast hits for nr & hmp, getting the best hit for each ORF and any other hit within 5 bits of the best score for the ORF



In [6]:

    
other_blastp_hits_filt = other_blastp_hits[other_blastp_hits.e_value < 1]

filtered_hits = []
for _,hits in other_blastp_hits_filt.groupby(["query_id","db"]):
    top_bitscore_hit = hits.ix[hits["bitscore"].idxmax()]
    filtered_hits.append ( hits[hits["bitscore"] > top_bitscore_hit["bitscore"] - 5 ])
                                        

filtered_hits_df = pd.concat(filtered_hits)
print(filtered_hits_df.groupby(["query_id","db"])["subject_id"].count())
best_blast_hits = filtered_hits_df[["query_id","db","s_description","pct_id","q_cov","s_description","q_len","s_len"]]
best_blast_hits.to_csv("out/filt_orf_best_blast_hits.csv",index=False)
best_blast_hits









    



query_id           db     
d9539_asm_v1.2_11  hmp_pep    1
                   nr         2
d9539_asm_v1.2_22  hmp_pep    1
                   nr         1
d9539_asm_v1.2_29  hmp_pep    1
                   nr         2
d9539_asm_v1.2_30  nr         3
Name: subject_id, dtype: int64






    Out[6]:






  
    
      
      query_id
      db
      s_description
      pct_id
      q_cov
      s_description
      q_len
      s_len
    
  
  
    
      293
      d9539_asm_v1.2_11
      hmp_pep
      BACEGG_02723 hypothetical protein [Bacteroides...
      40.323
      15
      BACEGG_02723 hypothetical protein [Bacteroides...
      421
      368
    
    
      292
      d9539_asm_v1.2_11
      nr
      hypothetical protein [Eel River basin pequenov...
      27.404
      48
      hypothetical protein [Eel River basin pequenov...
      421
      277
    
    
      295
      d9539_asm_v1.2_11
      nr
      hypothetical protein [Bacteroides eggerthii]
      40.323
      15
      hypothetical protein [Bacteroides eggerthii]
      421
      368
    
    
      220
      d9539_asm_v1.2_22
      hmp_pep
      HMPREF1059_03256 hypothetical protein [Parabac...
      28.931
      40
      HMPREF1059_03256 hypothetical protein [Parabac...
      361
      284
    
    
      215
      d9539_asm_v1.2_22
      nr
      putative replication initiation protein [Alist...
      27.386
      61
      putative replication initiation protein [Alist...
      361
      320
    
    
      174
      d9539_asm_v1.2_29
      hmp_pep
      HMPREF1059_03257 hypothetical protein [Parabac...
      26.214
      31
      HMPREF1059_03257 hypothetical protein [Parabac...
      647
      538
    
    
      167
      d9539_asm_v1.2_29
      nr
      putative major capsid protein [Eel River basin...
      28.796
      29
      putative major capsid protein [Eel River basin...
      647
      523
    
    
      168
      d9539_asm_v1.2_29
      nr
      putative uncharacterized protein [Alistipes fi...
      31.746
      27
      putative uncharacterized protein [Alistipes fi...
      647
      338
    
    
      134
      d9539_asm_v1.2_30
      nr
      hypothetical protein [Ruegeria sp. ANG-S4]
      29.545
      87
      hypothetical protein [Ruegeria sp. ANG-S4]
      97
      308
    
    
      135
      d9539_asm_v1.2_30
      nr
      OPT family small oligopeptide transporter, par...
      37.313
      64
      OPT family small oligopeptide transporter, par...
      97
      725
    
    
      136
      d9539_asm_v1.2_30
      nr
      cysteinyl-tRNA synthetase [Chlamydia trachomatis]
      31.522
      93
      cysteinyl-tRNA synthetase [Chlamydia trachomatis]
      97
      531

2. Analyze env_nr hits



In [7]:

    
env_nr_hits = blast_hits[ (blast_hits.db == "env_nr") & (blast_hits.e_value < 1) ]
env_nr_hits["s_description"].apply(lambda x: x.split("[")[1]).drop_duplicates()









    Out[7]:





169    marine metagenome]
Name: s_description, dtype: object

Considering all hits are from marine metagenomes in env_nr, just keep the one with highest bitscore



In [8]:

    
env_nr_hits_filt = pd.DataFrame(hits.ix[hits.bitscore.idxmax()] for _,hits in env_nr_hits.groupby("query_id")).sort_values(by="q_cds_start")
env_nr_hits_filt.drop(["q_start","q_end","s_start","s_end","q_id","s_names","s_scinames","tool","mism","gap_open"],axis=1)









    Out[8]:






  
    
      
      query_id
      subject_id
      pct_id
      ali_len
      e_value
      bitscore
      q_len
      s_len
      s_gi
      s_taxids
      q_cov
      s_description
      db
      orf_len
      q_cds_start
      q_cds_end
      strand
      q_cov_calc
    
  
  
    
      312
      d9539_asm_v1.2_11
      gi|135497532|gb|EBI02878.1|
      29.752
      121
      9.000000e-01
      36.2
      421
      126
      135497532
      408172.0
      29
      hypothetical protein GOS_9161653, partial [mar...
      env_nr
      421.0
      393
      1655
      +
      28.741093
    
    
      272
      d9539_asm_v1.2_14
      gi|138220043|gb|EBY50077.1|
      32.222
      90
      7.100000e-01
      36.2
      174
      315
      138220043
      408172.0
      51
      hypothetical protein GOS_5591846, partial [mar...
      env_nr
      174.0
      1603
      2124
      +
      51.149425
    
    
      225
      d9539_asm_v1.2_22
      gi|142009119|gb|ECU79481.1|
      25.000
      252
      3.000000e-07
      57.8
      361
      295
      142009119
      408172.0
      64
      hypothetical protein GOS_10492, partial [marin...
      env_nr
      361.0
      2351
      3433
      +
      63.711911
    
    
      169
      d9539_asm_v1.2_29
      gi|142008696|gb|ECU79166.1|
      24.706
      255
      6.490000e-08
      62.0
      647
      505
      142008696
      408172.0
      39
      hypothetical protein GOS_10803 [marine metagen...
      env_nr
      647.0
      3429
      5369
      +
      38.639876

3. Analyse MetaHIT hits



In [22]:

    
metahit_2014 = blast_hits[blast_hits.db == "metahit_2014_pep"]
metahit_2014 = metahit_2014.drop(["q_start","q_end","s_start","s_end","q_id","s_names","s_scinames","s_gi","s_taxids","tool","mism","gap_open"],axis=1)
print(metahit_2014.shape)

a. Load MetaHIT 2014 annotation



In [13]:

    
metahit_2014_cols = ["gene_id","gene_name","gene_length","gene_completness",
                     "cohort_origin","phylum","genus","kegg","eggNOG",
                    "sample_freq","individual_freq","kegg_categories","eggnog_fx_categories","cohort_assembled"]

metahit_2014_annot = pd.read_csv("~/tmp/metahit_reanalysis/IGC.annotation_OF.summary",sep="\t",header=None,names=metahit_2014_cols)

print(metahit_2014_annot.shape)
metahit_2014_annot.head()









    



(9879896, 14)






    Out[13]:






  
    
      
      gene_id
      gene_name
      gene_length
      gene_completness
      cohort_origin
      phylum
      genus
      kegg
      eggNOG
      sample_freq
      individual_freq
      kegg_categories
      eggnog_fx_categories
      cohort_assembled
    
  
  
    
      0
      1
      T2D-6A_GL0083352
      88230
      Complete
      CHN
      unknown
      unknown
      K01824
      COG5184
      0.224152
      0.236449
      Lipid Metabolism
      Cell cycle control, cell division, chromosome ...
      EUR;CHN;USA
    
    
      1
      2
      V1.UC4-5_GL0154511
      88086
      Complete
      EUR
      unknown
      unknown
      K01824
      COG5184
      0.352802
      0.351402
      Lipid Metabolism
      Cell cycle control, cell division, chromosome ...
      EUR;CHN;USA
    
    
      2
      3
      MH0198_GL0136826
      67464
      Lack 5'-end
      EUR
      unknown
      unknown
      K01824
      COG5184
      0.281768
      0.293458
      Lipid Metabolism
      Cell cycle control, cell division, chromosome ...
      EUR;CHN
    
    
      3
      4
      V1.FI28_GL0106390
      45267
      Complete
      EUR
      unknown
      unknown
      K03924
      NOG12793
      0.278611
      0.267290
      Metabolism
      Function unknown
      EUR;USA
    
    
      4
      5
      MH0318_GL0144550
      42243
      Complete
      EUR
      unknown
      unknown
      K13730
      COG4886
      0.001579
      0.001869
      Infectious Diseases
      Function unknown
      EUR

b. Annotate hits



In [14]:

    
#MetaHIT 2014
metahit_2014_df = pd.merge(metahit_2014, metahit_2014_annot, left_on="subject_id", right_on="gene_name",how="left")
print(metahit_2014_df.shape)
metahit_2014_df.head()









    



(81, 30)






    Out[14]:






  
    
      
      query_id
      subject_id
      pct_id
      ali_len
      e_value
      bitscore
      q_len
      s_len
      q_cov
      s_description
      ...
      cohort_origin
      phylum
      genus
      kegg
      eggNOG
      sample_freq
      individual_freq
      kegg_categories
      eggnog_fx_categories
      cohort_assembled
    
  
  
    
      0
      d9539_asm_v1.2_5
      MH0409_GL0061975
      91.379
      58
      1.220000e-31
      113.0
      58
      86
      100
      MH0409_GL0061975  [gene]  locus=scaffold117357...
      ...
      EUR
      unknown
      unknown
      unknown
      unknown
      0.005525
      0.005607
      unknown
      unknown
      EUR
    
    
      1
      d9539_asm_v1.2_5
      675950834-stool1_revised_C814893_1_gene71376
      89.655
      58
      1.970000e-31
      112.0
      58
      58
      100
      675950834-stool1_revised_C814893_1_gene71376 s...
      ...
      USA
      unknown
      unknown
      unknown
      unknown
      0.018942
      0.014953
      unknown
      unknown
      USA
    
    
      2
      d9539_asm_v1.2_5
      MH0105_GL0047475
      89.655
      58
      2.880000e-31
      112.0
      58
      86
      100
      MH0105_GL0047475  [gene]  locus=scaffold1014_1...
      ...
      EUR
      unknown
      unknown
      unknown
      unknown
      0.007103
      0.008411
      unknown
      unknown
      EUR
    
    
      3
      d9539_asm_v1.2_5
      MH0203_GL0004312
      89.655
      58
      3.250000e-31
      112.0
      58
      86
      100
      MH0203_GL0004312  [gene]  locus=C2075218_1:153...
      ...
      EUR
      unknown
      unknown
      unknown
      unknown
      0.007893
      0.009346
      unknown
      unknown
      EUR
    
    
      4
      d9539_asm_v1.2_5
      MH0033_GL0005948
      89.655
      58
      4.350000e-31
      112.0
      58
      86
      100
      MH0033_GL0005948  [gene]  locus=scaffold47970_...
      ...
      EUR
      unknown
      unknown
      unknown
      unknown
      0.011050
      0.012150
      unknown
      unknown
      EUR
    
  

5 rows × 30 columns

c. Tax annotation

No results



In [17]:

    
print(metahit_2014_df.shape)
print(metahit_2014_df[ metahit_2014_df.phylum != "unknown" ].shape)
print(metahit_2014_df[ metahit_2014_df.genus != "unknown" ].shape)









    



(81, 30)
(0, 30)
(0, 30)

d. KEGG



In [20]:

    
metahit_2014_df[metahit_2014_df.kegg != "unknown"][["query_id","db","kegg","kegg_categories"]].drop_duplicates()









    Out[20]:






  
    
      
      query_id
      db
      kegg
      kegg_categories
    
  
  
    
      28
      d9539_asm_v1.2_3
      metahit_2014_pep
      K00936
      Metabolism
    
    
      29
      d9539_asm_v1.2_3
      metahit_2014_pep
      K02058
      Membrane Transport

e. eggNOG



In [21]:

    
metahit_2014_df[metahit_2014_df.eggNOG != "unknown"][["query_id","db","eggNOG","eggnog_fx_categories"]].drop_duplicates()









    Out[21]:






  
    
      
      query_id
      db
      eggNOG
      eggnog_fx_categories
    
  
  
    
      28
      d9539_asm_v1.2_3
      metahit_2014_pep
      COG0642
      Signal transduction mechanisms
    
    
      29
      d9539_asm_v1.2_3
      metahit_2014_pep
      COG1879
      Carbohydrate transport and metabolism

f. Max observed freqs of ORFs 'homologues' in MetaHIT samples



In [23]:

    
gb = metahit_2014_df[["query_id","individual_freq"]].groupby("query_id")
max_individual_freq = pd.DataFrame([ df.ix[ df.individual_freq.idxmax() ] for _,df in gb ] )

gb = metahit_2014_df[["query_id","sample_freq"]].groupby("query_id")
max_sample_freq = pd.DataFrame([ df.ix[ df.sample_freq.idxmax() ] for _,df in gb ] )

pd.merge(max_individual_freq,max_sample_freq,on="query_id",how="outer")









    Out[23]:






  
    
      
      query_id
      individual_freq
      sample_freq
    
  
  
    
      0
      d9539_asm_v1.2_1
      0.035514
      0.037885
    
    
      1
      d9539_asm_v1.2_11
      0.018692
      0.022099
    
    
      2
      d9539_asm_v1.2_14
      0.014953
      0.019732
    
    
      3
      d9539_asm_v1.2_16
      0.006542
      0.007103
    
    
      4
      d9539_asm_v1.2_22
      0.043925
      0.052092
    
    
      5
      d9539_asm_v1.2_29
      0.009346
      0.011050
    
    
      6
      d9539_asm_v1.2_3
      0.066355
      0.063141
    
    
      7
      d9539_asm_v1.2_30
      0.037383
      0.037885
    
    
      8
      d9539_asm_v1.2_5
      0.024299
      0.024467

f. Write out best MetaHIT hit for each ORF



In [24]:

    
metahit_df = pd.DataFrame(hits.ix[hits.bitscore.idxmax()] for _,hits in metahit_hits.groupby("query_id")).sort_values(by="q_cds_start")
metahit_df[["query_id","pct_id","q_cov"]]
metahit_df[["query_id","pct_id","q_cov"]].to_csv("out/filt_orf_best_metahit_hits.csv",index=False)



In [ ]:

	query_id	db	s_description	pct_id	q_cov	s_description	q_len	s_len
293	d9539_asm_v1.2_11	hmp_pep	BACEGG_02723 hypothetical protein [Bacteroides...	40.323	15	BACEGG_02723 hypothetical protein [Bacteroides...	421	368
292	d9539_asm_v1.2_11	nr	hypothetical protein [Eel River basin pequenov...	27.404	48	hypothetical protein [Eel River basin pequenov...	421	277
295	d9539_asm_v1.2_11	nr	hypothetical protein [Bacteroides eggerthii]	40.323	15	hypothetical protein [Bacteroides eggerthii]	421	368
220	d9539_asm_v1.2_22	hmp_pep	HMPREF1059_03256 hypothetical protein [Parabac...	28.931	40	HMPREF1059_03256 hypothetical protein [Parabac...	361	284
215	d9539_asm_v1.2_22	nr	putative replication initiation protein [Alist...	27.386	61	putative replication initiation protein [Alist...	361	320
174	d9539_asm_v1.2_29	hmp_pep	HMPREF1059_03257 hypothetical protein [Parabac...	26.214	31	HMPREF1059_03257 hypothetical protein [Parabac...	647	538
167	d9539_asm_v1.2_29	nr	putative major capsid protein [Eel River basin...	28.796	29	putative major capsid protein [Eel River basin...	647	523
168	d9539_asm_v1.2_29	nr	putative uncharacterized protein [Alistipes fi...	31.746	27	putative uncharacterized protein [Alistipes fi...	647	338
134	d9539_asm_v1.2_30	nr	hypothetical protein [Ruegeria sp. ANG-S4]	29.545	87	hypothetical protein [Ruegeria sp. ANG-S4]	97	308
135	d9539_asm_v1.2_30	nr	OPT family small oligopeptide transporter, par...	37.313	64	OPT family small oligopeptide transporter, par...	97	725
136	d9539_asm_v1.2_30	nr	cysteinyl-tRNA synthetase [Chlamydia trachomatis]	31.522	93	cysteinyl-tRNA synthetase [Chlamydia trachomatis]	97	531

	query_id	subject_id	pct_id	ali_len	e_value	bitscore	q_len	s_len	s_gi	s_taxids	q_cov	s_description	db	orf_len	q_cds_start	q_cds_end	strand	q_cov_calc
312	d9539_asm_v1.2_11	gi\|135497532\|gb\|EBI02878.1\|	29.752	121	9.000000e-01	36.2	421	126	135497532	408172.0	29	hypothetical protein GOS_9161653, partial [mar...	env_nr	421.0	393	1655	+	28.741093
272	d9539_asm_v1.2_14	gi\|138220043\|gb\|EBY50077.1\|	32.222	90	7.100000e-01	36.2	174	315	138220043	408172.0	51	hypothetical protein GOS_5591846, partial [mar...	env_nr	174.0	1603	2124	+	51.149425
225	d9539_asm_v1.2_22	gi\|142009119\|gb\|ECU79481.1\|	25.000	252	3.000000e-07	57.8	361	295	142009119	408172.0	64	hypothetical protein GOS_10492, partial [marin...	env_nr	361.0	2351	3433	+	63.711911
169	d9539_asm_v1.2_29	gi\|142008696\|gb\|ECU79166.1\|	24.706	255	6.490000e-08	62.0	647	505	142008696	408172.0	39	hypothetical protein GOS_10803 [marine metagen...	env_nr	647.0	3429	5369	+	38.639876

	gene_id	gene_name	gene_length	gene_completness	cohort_origin	phylum	genus	kegg	eggNOG	sample_freq	individual_freq	kegg_categories	eggnog_fx_categories	cohort_assembled
0	1	T2D-6A_GL0083352	88230	Complete	CHN	unknown	unknown	K01824	COG5184	0.224152	0.236449	Lipid Metabolism	Cell cycle control, cell division, chromosome ...	EUR;CHN;USA
1	2	V1.UC4-5_GL0154511	88086	Complete	EUR	unknown	unknown	K01824	COG5184	0.352802	0.351402	Lipid Metabolism	Cell cycle control, cell division, chromosome ...	EUR;CHN;USA
2	3	MH0198_GL0136826	67464	Lack 5'-end	EUR	unknown	unknown	K01824	COG5184	0.281768	0.293458	Lipid Metabolism	Cell cycle control, cell division, chromosome ...	EUR;CHN
3	4	V1.FI28_GL0106390	45267	Complete	EUR	unknown	unknown	K03924	NOG12793	0.278611	0.267290	Metabolism	Function unknown	EUR;USA
4	5	MH0318_GL0144550	42243	Complete	EUR	unknown	unknown	K13730	COG4886	0.001579	0.001869	Infectious Diseases	Function unknown	EUR

	query_id	subject_id	pct_id	ali_len	e_value	bitscore	q_len	s_len	q_cov	s_description	...	cohort_origin	phylum	genus	kegg	eggNOG	sample_freq	individual_freq	kegg_categories	eggnog_fx_categories	cohort_assembled
0	d9539_asm_v1.2_5	MH0409_GL0061975	91.379	58	1.220000e-31	113.0	58	86	100	MH0409_GL0061975 [gene] locus=scaffold117357...	...	EUR	unknown	unknown	unknown	unknown	0.005525	0.005607	unknown	unknown	EUR
1	d9539_asm_v1.2_5	675950834-stool1_revised_C814893_1_gene71376	89.655	58	1.970000e-31	112.0	58	58	100	675950834-stool1_revised_C814893_1_gene71376 s...	...	USA	unknown	unknown	unknown	unknown	0.018942	0.014953	unknown	unknown	USA
2	d9539_asm_v1.2_5	MH0105_GL0047475	89.655	58	2.880000e-31	112.0	58	86	100	MH0105_GL0047475 [gene] locus=scaffold1014_1...	...	EUR	unknown	unknown	unknown	unknown	0.007103	0.008411	unknown	unknown	EUR
3	d9539_asm_v1.2_5	MH0203_GL0004312	89.655	58	3.250000e-31	112.0	58	86	100	MH0203_GL0004312 [gene] locus=C2075218_1:153...	...	EUR	unknown	unknown	unknown	unknown	0.007893	0.009346	unknown	unknown	EUR
4	d9539_asm_v1.2_5	MH0033_GL0005948	89.655	58	4.350000e-31	112.0	58	86	100	MH0033_GL0005948 [gene] locus=scaffold47970_...	...	EUR	unknown	unknown	unknown	unknown	0.011050	0.012150	unknown	unknown	EUR

	query_id	db	kegg	kegg_categories
28	d9539_asm_v1.2_3	metahit_2014_pep	K00936	Metabolism
29	d9539_asm_v1.2_3	metahit_2014_pep	K02058	Membrane Transport

	query_id	db	eggNOG	eggnog_fx_categories
28	d9539_asm_v1.2_3	metahit_2014_pep	COG0642	Signal transduction mechanisms
29	d9539_asm_v1.2_3	metahit_2014_pep	COG1879	Carbohydrate transport and metabolism

	query_id	individual_freq	sample_freq
0	d9539_asm_v1.2_1	0.035514	0.037885
1	d9539_asm_v1.2_11	0.018692	0.022099
2	d9539_asm_v1.2_14	0.014953	0.019732
3	d9539_asm_v1.2_16	0.006542	0.007103
4	d9539_asm_v1.2_22	0.043925	0.052092
5	d9539_asm_v1.2_29	0.009346	0.011050
6	d9539_asm_v1.2_3	0.066355	0.063141
7	d9539_asm_v1.2_30	0.037383	0.037885
8	d9539_asm_v1.2_5	0.024299	0.024467