In [1]:
import pandas as pd
import re
import itertools
import matplotlib_venn
import numpy as np

1. Load blast hits , hmmer hits for MetaHIT and cluster stats


In [2]:
#Load blast best hits
filt_blast_hits = pd.read_csv("1_out/filtered_blast_all_hits.csv")
print(filt_blast_hits.shape)
#Load hmmer best hits
filt_hmmer_hits = pd.read_csv("1_out/filtered_hmmer_all_hits.csv")
print(filt_hmmer_hits.shape)

all_filt_hits = pd.concat([filt_blast_hits,filt_hmmer_hits],axis=0)
#Load cluster stats
clusters = pd.read_csv("1_out/cluster_stats.csv")
clusters["sample_origin"] = clusters["sample_origin"].apply(lambda x:x.replace(",",";"))


(3400, 23)
(1882, 22)

In [3]:
#Get MetaHIT 2014 IGC hits
metahit_2014 = all_filt_hits[all_filt_hits.db.apply(lambda x: ("metahit" in x) and ("2014" in x))]
print(metahit_2014.shape)
metahit_2014.head()


(3389, 37)
Out[3]:
ali_len best_dmn_bias best_dmn_bitscore best_dmn_e_value bias bitscore cluster db dne_clu dne_dom ... s_description s_end s_gi s_len s_names s_scinames s_start s_taxids subject_id tool
1566 270.0 NaN NaN NaN NaN 431.0 1073 metahit_2014_cds NaN NaN ... MH0019_GL0030364 [gene] locus=scaffold39654_... 35.0 0.0 549.0 NaN NaN 304.0 NaN MH0019_GL0030364 blastn
1567 270.0 NaN NaN NaN NaN 420.0 1073 metahit_2014_cds NaN NaN ... 765620695-stool1_revised_scaffold38964_2_gene1... 35.0 0.0 546.0 NaN NaN 304.0 NaN 765620695-stool1_revised_scaffold38964_2_gene1... blastn
1568 263.0 NaN NaN NaN NaN 254.0 1073 metahit_2014_cds NaN NaN ... MH0048_GL0036718 [gene] locus=scaffold33886_... 35.0 0.0 537.0 NaN NaN 294.0 NaN MH0048_GL0036718 blastn
1569 274.0 NaN NaN NaN NaN 409.0 1073 metahit_2014_cds NaN NaN ... V1.CD14-0_GL0044222 [gene] locus=scaffold182... 45.0 0.0 567.0 NaN NaN 318.0 NaN V1.CD14-0_GL0044222 blastn
1570 259.0 NaN NaN NaN NaN 401.0 1073 metahit_2014_cds NaN NaN ... V1.CD6-0-PT_GL0032215 [gene] locus=scaffold1... 42.0 0.0 564.0 NaN NaN 300.0 NaN V1.CD6-0-PT_GL0032215 blastn

5 rows × 37 columns


In [4]:
#Get MetaHIT 2010 hits
metahit_2010 = all_filt_hits[all_filt_hits.db.apply(lambda x: ("metahit" in x) and ("2014" not in x))]
print(metahit_2010.shape)
metahit_2010.head()


(1206, 37)
Out[4]:
ali_len best_dmn_bias best_dmn_bitscore best_dmn_e_value bias bitscore cluster db dne_clu dne_dom ... s_description s_end s_gi s_len s_names s_scinames s_start s_taxids subject_id tool
536 270.0 NaN NaN NaN NaN 431.0 1073 metahit_cds NaN NaN ... GL0053633_MH0019_[Complete]_[mRNA]_locus=scaff... 35.0 0.0 549.0 NaN NaN 304.0 NaN GL0053633_MH0019_[Complete]_[mRNA]_locus=scaff... blastn
537 259.0 NaN NaN NaN NaN 412.0 1073 metahit_cds NaN NaN ... GL0080651_MH0011_[Complete]_[mRNA]_locus=scaff... 45.0 0.0 555.0 NaN NaN 303.0 NaN GL0080651_MH0011_[Complete]_[mRNA]_locus=scaff... blastn
538 270.0 NaN NaN NaN NaN 243.0 1073 metahit_cds NaN NaN ... GL0144339_V1_[Complete]_[mRNA]_locus=scaffold3... 51.0 0.0 561.0 NaN NaN 315.0 NaN GL0144339_V1_[Complete]_[mRNA]_locus=scaffold3... blastn
539 273.0 NaN NaN NaN NaN 444.0 1073 metahit_cds NaN NaN ... GL0080651_MH0011_[Complete]_[mRNA]_locus=scaff... 31.0 0.0 555.0 NaN NaN 303.0 NaN GL0080651_MH0011_[Complete]_[mRNA]_locus=scaff... blastn
540 283.0 NaN NaN NaN NaN 272.0 1073 metahit_cds NaN NaN ... GL0144339_V1_[Complete]_[mRNA]_locus=scaffold3... 38.0 0.0 561.0 NaN NaN 315.0 NaN GL0144339_V1_[Complete]_[mRNA]_locus=scaffold3... blastn

5 rows × 37 columns


In [5]:
print(clusters.shape)
clusters.head()


(32, 23)
Out[5]:
Cluster # nr seqs # raw seqs Score log RNAcode score P ORF length (aa) log ORF length complexity min ... min.1 avg.1 max.1 # seqs with max length composite score DNA alignment protein alignment RNAcode figure cluster_id sample_origin
0 1217 6 6 41.974 1.62 7.704e–10 91 1.96 hi 3.79 ... 38 73.67 87 2 7.57 <a href="all_nt_aln/cluster1217_DNA.aln.txt">D... <a href="all_aa_aln/cluster1217_AA.aln.txt">AA... <a href="rnacode_all/cluster1217_eps/hss-0.eps... 1217 feces
1 339b 7 11 33.153 1.52 1.069e–06 110 2.04 hi 3.89 ... 63 79.14 106 2 7.52 <a href="all_nt_aln/cluster339b_DNA.aln.txt">D... <a href="all_aa_aln/cluster339b_AA.aln.txt">AA... <a href="rnacode_all/cluster339b_eps/hss-0.eps... 339b feces
2 211b 7 22 42.949 1.63 3.364e–14 80 1.90 hi 3.72 ... 31 62.00 73 2 7.49 <a href="all_nt_aln/cluster211b_DNA.aln.txt">D... <a href="all_aa_aln/cluster211b_AA.aln.txt">AA... <a href="rnacode_all/cluster211b_eps/hss-0.eps... 211b feces
3 562 7 9 39.417 1.60 4.592e–07 61 1.79 hi 3.78 ... 29 49.57 59 8 7.39 <a href="all_nt_aln/cluster562_DNA.aln.txt">DN... <a href="all_aa_aln/cluster562_AA.aln.txt">AA</a> <a href="rnacode_all/cluster562_eps/hss-0.eps"... 562 CSF;serum
4 297a 5 14 26.891 1.43 0.001 106 2.03 hi 3.90 ... 59 96.60 106 8 7.38 <a href="all_nt_aln/cluster297a_DNA.aln.txt">D... <a href="all_aa_aln/cluster297a_AA.aln.txt">AA... <a href="rnacode_all/cluster297a_eps/hss-0.eps... 297a feces

5 rows × 23 columns

Load Metahit annotations


In [6]:
metahit_2014_cols = ["gene_id","gene_name","gene_length","gene_completness",
                     "cohort_origin","phylum","genus","kegg","eggNOG",
                    "sample_freq","individual_freq","kegg_categories","eggnog_fx_categories","cohort_assembled"]

metahit_2014_annot = pd.read_csv("~/resources/IGC.annotation_OF.summary",sep="\t",header=None,names=metahit_2014_cols)

print(metahit_2014_annot.shape)
metahit_2014_annot.head()


(9879896, 14)
Out[6]:
gene_id gene_name gene_length gene_completness cohort_origin phylum genus kegg eggNOG sample_freq individual_freq kegg_categories eggnog_fx_categories cohort_assembled
0 1 T2D-6A_GL0083352 88230 Complete CHN unknown unknown K01824 COG5184 0.224152 0.236449 Lipid Metabolism Cell cycle control, cell division, chromosome ... EUR;CHN;USA
1 2 V1.UC4-5_GL0154511 88086 Complete EUR unknown unknown K01824 COG5184 0.352802 0.351402 Lipid Metabolism Cell cycle control, cell division, chromosome ... EUR;CHN;USA
2 3 MH0198_GL0136826 67464 Lack 5'-end EUR unknown unknown K01824 COG5184 0.281768 0.293458 Lipid Metabolism Cell cycle control, cell division, chromosome ... EUR;CHN
3 4 V1.FI28_GL0106390 45267 Complete EUR unknown unknown K03924 NOG12793 0.278611 0.267290 Metabolism Function unknown EUR;USA
4 5 MH0318_GL0144550 42243 Complete EUR unknown unknown K13730 COG4886 0.001579 0.001869 Infectious Diseases Function unknown EUR

3. Annotate hits with metadata


In [7]:
#MetaHIT 2014
metahit_2014_df = pd.merge(metahit_2014, metahit_2014_annot, left_on="subject_id", right_on="gene_name",how="left")
print(metahit_2014_df.shape)
metahit_2014_df.head()


(3389, 51)
Out[7]:
ali_len best_dmn_bias best_dmn_bitscore best_dmn_e_value bias bitscore cluster db dne_clu dne_dom ... cohort_origin phylum genus kegg eggNOG sample_freq individual_freq kegg_categories eggnog_fx_categories cohort_assembled
0 270.0 NaN NaN NaN NaN 431.0 1073 metahit_2014_cds NaN NaN ... EUR unknown unknown unknown unknown 0.041831 0.045794 unknown unknown EUR;CHN;USA
1 270.0 NaN NaN NaN NaN 420.0 1073 metahit_2014_cds NaN NaN ... USA unknown unknown unknown unknown 0.014996 0.015888 unknown unknown USA
2 263.0 NaN NaN NaN NaN 254.0 1073 metahit_2014_cds NaN NaN ... EUR unknown unknown unknown unknown 0.116811 0.111215 unknown unknown EUR
3 274.0 NaN NaN NaN NaN 409.0 1073 metahit_2014_cds NaN NaN ... EUR unknown unknown unknown unknown 0.014996 0.017757 unknown unknown EUR;CHN;USA
4 259.0 NaN NaN NaN NaN 401.0 1073 metahit_2014_cds NaN NaN ... EUR unknown unknown unknown unknown 0.021310 0.024299 unknown unknown EUR

5 rows × 51 columns

MetaHIT 2014

a. Tax annotation

Apparently there is no tax annotation for the hits we have in MetaHIT :D


In [8]:
metahit_2014_df.shape
print(metahit_2014_df[ metahit_2014_df.phylum != "unknown" ].shape)
print(metahit_2014_df[ metahit_2014_df.genus != "unknown" ].shape)


(0, 51)
(0, 51)

b. KEGG


In [9]:
metahit_2014_df[metahit_2014_df.kegg != "unknown"][["cluster","tool","db","kegg","kegg_categories"]].drop_duplicates()


Out[9]:
cluster tool db kegg kegg_categories
619 348 blastn metahit_2014_cds K02314 Replication and Repair
1504 348 blastp metahit_2014_pep K02314 Replication and Repair
2897 348 hmmsearch metahit_2014_pep K02314 Replication and Repair
3234 532 hmmsearch metahit_2014_pep K06915 Poorly Characterized

c. eggNOG


In [10]:
metahit_2014_df[metahit_2014_df.eggNOG != "unknown"][["cluster","tool","db","eggNOG","eggnog_fx_categories"]].drop_duplicates()


Out[10]:
cluster tool db eggNOG eggnog_fx_categories
619 348 blastn metahit_2014_cds COG0305 Replication, recombination and repair
1504 348 blastp metahit_2014_pep COG0305 Replication, recombination and repair
1882 1085 hmmsearch metahit_2014_pep NOG269643 unknown
2897 348 hmmsearch metahit_2014_pep COG0305 Replication, recombination and repair
3234 532 hmmsearch metahit_2014_pep COG0433 General function prediction only

d. Individual and sample frequency in the MetaHIT samples


In [11]:
gb = metahit_2014_df[["cluster","individual_freq"]].groupby("cluster")
max_individual_freq = pd.DataFrame([ df.ix[ df.individual_freq.idxmax() ] for _,df in gb ] )

gb = metahit_2014_df[["cluster","sample_freq"]].groupby("cluster")
max_sample_freq = pd.DataFrame([ df.ix[ df.sample_freq.idxmax() ] for _,df in gb ] )

pd.merge(max_individual_freq,max_sample_freq,on="cluster",how="outer")


Out[11]:
cluster individual_freq sample_freq
0 1073 0.295327 0.298343
1 1085 0.004673 0.004736
2 113b 0.029907 0.025257
3 1217 0.228972 0.220205
4 179a 0.024299 0.025257
5 179b 0.024299 0.025257
6 182a 0.027103 0.026835
7 182b 0.027103 0.026835
8 211a 0.024299 0.023678
9 211b 0.024299 0.023678
10 241a 0.024299 0.023678
11 241b 0.034579 0.033938
12 258 0.035514 0.037885
13 297a 0.043925 0.052092
14 297b 0.043925 0.052092
15 321b 0.018692 0.022099
16 339b 0.019626 0.022099
17 348 0.210280 0.217048
18 375a 0.024299 0.025257
19 406b 0.168224 0.161800
20 502 0.024299 0.023678
21 532 0.121495 0.127072
22 589 0.017757 0.018942
23 612a 0.252336 0.257301
24 787b 0.044860 0.056038

In [ ]: