In [1]:
    
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
    
In [2]:
    
import pandas as pd
import re
import itertools
import matplotlib_venn
    
In [3]:
    
# Load BLAST best hits.
filt_blast_hits = pd.read_csv("filtered_blast_hits.csv")
print(filt_blast_hits.shape)
# Load HMMER best hits.
filt_hmmer_hits = pd.read_csv("filtered_hmmer_hits.csv")
print(filt_hmmer_hits.shape)
# Stack both hit tables row-wise. ignore_index=True gives the combined frame
# a fresh 0..n-1 index; without it the row labels of the two inputs repeat,
# which makes any label-based lookup on the result ambiguous.
all_filt_hits = pd.concat([filt_blast_hits, filt_hmmer_hits], axis=0, ignore_index=True)
# Load cluster stats.
clusters = pd.read_csv("cluster_stats.csv")
    
    
In [4]:
    
# Sanity check on the combined hit table: (rows, columns), then the first rows.
print(all_filt_hits.shape)
all_filt_hits.head(n=5)
    
    
    Out[4]:
In [5]:
    
# Sanity check on the cluster stats table: (rows, columns), then the first rows.
print(clusters.shape)
clusters.head(n=5)
    
    
    Out[5]:
In [8]:
    
# For every cluster with at least one hit, record how many distinct databases
# it was found in and which ones (";"-separated).
cluster_db_pairs = all_filt_hits[["cluster", "db"]].drop_duplicates()
hit_records = [
    (cluster, hits.shape[0], ";".join(hits.db))
    for cluster, hits in cluster_db_pairs.groupby("cluster")
]
hits_x_cluster = pd.DataFrame.from_records(
    hit_records, columns=["cluster", "db_hit_count", "dbs"]
)

# Count the rows of `clusters` whose id has no hit at all. An explicit
# membership test is used instead of subtracting row counts, so the figure
# stays right even if some hit cluster is missing from `clusters`.
n_unannotated = int((~clusters["Cluster"].isin(hits_x_cluster["cluster"])).sum())
print("Cluster families with no annotation: {}".format(n_unannotated))

# Attach the sample origin of each cluster and keep only the report columns.
hits_x_cluster = hits_x_cluster.merge(clusters, left_on="cluster", right_on="Cluster")[
    ["cluster", "db_hit_count", "dbs", "sample_origin"]
]
hits_x_cluster.to_csv("hits_per_cluster.csv", index=False)

# Keep the preview as the LAST expression so the notebook actually displays it
# (previously it was followed by to_csv and its result was discarded).
hits_x_cluster.sort_values(by="db_hit_count", ascending=False).head()
    
    
In [7]:
    
# After removing duplicate (cluster, db) pairs, count the non-null cluster
# values found in each database.
(
    all_filt_hits.loc[:, ["cluster", "db"]]
    .drop_duplicates()
    .groupby("db")
    .count()
)
    
    Out[7]:
In [23]:
    
def _clusters_with_hits_in(db_name):
    """Return the set of cluster values that have a hit in database `db_name`."""
    in_db = all_filt_hits["db"] == db_name
    return set(all_filt_hits.loc[in_db, "cluster"].drop_duplicates().tolist())


# NOTE: despite the `_list` suffix, each of these is a set.
metahit_cds_list = _clusters_with_hits_in("metahit_cds")
metahit_pep_list = _clusters_with_hits_in("metahit_pep")
env_nt_list = _clusters_with_hits_in("env_nt")
    
In [31]:
    
# Print every cluster with an env_nt hit, one per line. A set has no defined
# iteration order, so sort to get the same listing on every run; key=str keeps
# the sort from failing if the ids are not all of one type.
for cluster_id in sorted(env_nt_list, key=str):
    print(cluster_id)
    
    
In [ ]: