In [1]:
import pandas as pd
import re
import itertools
import matplotlib_venn
import numpy as np
In [2]:
# Filtered best hits from the BLAST search
filt_blast_hits = pd.read_csv("1_out/filtered_blast_all_hits.csv")
print(filt_blast_hits.shape)
# Filtered best hits from the HMMER search
filt_hmmer_hits = pd.read_csv("1_out/filtered_hmmer_all_hits.csv")
print(filt_hmmer_hits.shape)
# Stack both hit tables row-wise; each table keeps its own row labels
all_filt_hits = pd.concat([filt_blast_hits, filt_hmmer_hits])
# Cluster statistics, with the commas inside sample_origin swapped for semicolons
clusters = pd.read_csv("1_out/cluster_stats.csv")
clusters["sample_origin"] = clusters["sample_origin"].map(
    lambda origin: ";".join(origin.split(","))
)
In [3]:
# MetaHIT 2014 IGC hits: the database name contains both "metahit" and "2014"
in_metahit_2014 = all_filt_hits["db"].map(
    lambda db_name: all(tag in db_name for tag in ("metahit", "2014"))
)
metahit_2014 = all_filt_hits[in_metahit_2014]
print(metahit_2014.shape)
metahit_2014.head()
Out[3]:
In [4]:
# MetaHIT 2010 hits: the database name contains "metahit" but not "2014"
in_metahit_2010 = all_filt_hits["db"].map(
    lambda db_name: "metahit" in db_name and "2014" not in db_name
)
metahit_2010 = all_filt_hits[in_metahit_2010]
print(metahit_2010.shape)
metahit_2010.head()
Out[4]:
In [5]:
# Dimensions of the cluster statistics table, followed by its first five rows
print(clusters.shape)
clusters.head(5)
Out[5]:
In [6]:
# Column labels applied, in order, to the headerless tab-separated IGC annotation summary
metahit_2014_cols = [
    "gene_id",
    "gene_name",
    "gene_length",
    "gene_completness",  # spelling kept as-is: this is the column label used at runtime
    "cohort_origin",
    "phylum",
    "genus",
    "kegg",
    "eggNOG",
    "sample_freq",
    "individual_freq",
    "kegg_categories",
    "eggnog_fx_categories",
    "cohort_assembled",
]
metahit_2014_annot = pd.read_csv(
    "~/resources/IGC.annotation_OF.summary",
    names=metahit_2014_cols,
    header=None,
    sep="\t",
)
print(metahit_2014_annot.shape)
metahit_2014_annot.head()
Out[6]:
In [7]:
# Attach the IGC annotation to every MetaHIT 2014 hit by matching the hit's
# subject_id against the annotation's gene_name. A left join keeps hits that
# have no annotation row.
metahit_2014_df = metahit_2014.merge(
    metahit_2014_annot,
    how="left",
    left_on="subject_id",
    right_on="gene_name",
)
print(metahit_2014_df.shape)
metahit_2014_df.head()
Out[7]:
In [8]:
# Shape of the merged table, then of the subsets that carry a real phylum /
# genus assignment.
# The bare `.shape` expression was never displayed because it was not the last
# expression of the cell, so it is printed explicitly.
print(metahit_2014_df.shape)
# Hits with no match in the annotation table hold NaN after the left merge, and
# NaN != "unknown" evaluates to True, so missing values are excluded explicitly
# to keep them out of the "annotated" counts.
has_phylum = metahit_2014_df.phylum.notnull() & (metahit_2014_df.phylum != "unknown")
has_genus = metahit_2014_df.genus.notnull() & (metahit_2014_df.genus != "unknown")
print(metahit_2014_df[has_phylum].shape)
print(metahit_2014_df[has_genus].shape)
In [9]:
# Distinct KEGG annotations per cluster / tool / database.
# Hits with no match in the annotation table hold NaN after the left merge and
# would pass a plain `!= "unknown"` test, so missing values are excluded too.
has_kegg = metahit_2014_df.kegg.notnull() & (metahit_2014_df.kegg != "unknown")
metahit_2014_df.loc[has_kegg, ["cluster", "tool", "db", "kegg", "kegg_categories"]].drop_duplicates()
Out[9]:
In [10]:
# Distinct eggNOG annotations per cluster / tool / database.
# Hits with no match in the annotation table hold NaN after the left merge and
# would pass a plain `!= "unknown"` test, so missing values are excluded too.
has_eggnog = metahit_2014_df.eggNOG.notnull() & (metahit_2014_df.eggNOG != "unknown")
metahit_2014_df.loc[has_eggnog, ["cluster", "tool", "db", "eggNOG", "eggnog_fx_categories"]].drop_duplicates()
Out[10]:
In [11]:
def _rows_with_max(frame, value_col, key_col="cluster"):
    """Return, for each `key_col` group, the row of `frame` with the largest `value_col`.

    Only the `key_col` and `value_col` columns are kept. Rows whose
    `value_col` is missing are dropped first, so a group made up entirely of
    missing values is left out instead of breaking `idxmax`. Ties resolve to
    the first occurrence, as `idxmax` does.
    """
    subset = frame[[key_col, value_col]].dropna(subset=[value_col])
    # idxmax gives one row label per group; `.loc` replaces the `.ix` indexer,
    # which no longer exists in pandas >= 1.0.
    best_labels = subset.groupby(key_col)[value_col].idxmax()
    return subset.loc[best_labels]


# Highest individual and sample frequency seen among each cluster's hits
max_individual_freq = _rows_with_max(metahit_2014_df, "individual_freq")
max_sample_freq = _rows_with_max(metahit_2014_df, "sample_freq")
pd.merge(max_individual_freq, max_sample_freq, on="cluster", how="outer")
Out[11]:
In [ ]: