In [1]:
%matplotlib inline
In [2]:
import pandas as pd
import re
import itertools
import matplotlib_venn
In [3]:
# Load the filtered best hits from each search tool. The paths are relative to
# the notebook's working directory.
filt_blast_hits = pd.read_csv("filtered_blast_hits.csv")  # BLAST best hits
print(filt_blast_hits.shape)
filt_hmmer_hits = pd.read_csv("filtered_hmmer_hits.csv")  # HMMER best hits
print(filt_hmmer_hits.shape)

# Stack both hit tables row-wise into a single frame. ignore_index=True gives
# the result a fresh 0..n-1 index; without it the row labels of the two inputs
# are kept as-is, so the combined frame carries duplicate index labels.
all_filt_hits = pd.concat(
    [filt_blast_hits, filt_hmmer_hits], axis=0, ignore_index=True
)

# Load cluster stats
clusters = pd.read_csv("cluster_stats.csv")
In [4]:
# Dimensions of the combined BLAST + HMMER hit table, followed by a preview of
# its first five rows.
hit_rows, hit_cols = all_filt_hits.shape
print((hit_rows, hit_cols))
all_filt_hits.head(5)
Out[4]:
In [5]:
# Dimensions of the cluster statistics table, followed by a preview of its
# first five rows.
print("{}".format(clusters.shape))
clusters.head(5)
Out[5]:
In [8]:
# Build one row per cluster: how many distinct databases produced a hit for it
# and a ";"-joined string of those database names.
cluster_db_pairs = all_filt_hits[["cluster", "db"]].drop_duplicates()
hit_records = [
    (cluster, hits.shape[0], ";".join(hits.db))
    for cluster, hits in cluster_db_pairs.groupby("cluster")
]
hits_x_cluster = pd.DataFrame.from_records(
    hit_records, columns=["cluster", "db_hit_count", "dbs"]
)

# NOTE(review): this difference equals the number of unannotated clusters only
# if `clusters` holds one row per cluster and every cluster with a hit also
# appears in it -- TODO confirm against cluster_stats.csv.
print("Cluster families with no annotation: {}".format(clusters.shape[0] - hits_x_cluster.shape[0]))

# Attach each cluster's sample origin (inner join on the cluster identifier)
# and keep only the columns that go into the report.
hits_x_cluster = hits_x_cluster.merge(clusters, left_on="cluster", right_on="Cluster")[
    ["cluster", "db_hit_count", "dbs", "sample_origin"]
]

# Write the table first and end the cell with the preview: a notebook renders
# only the value of a cell's last expression, so when the sorted head came
# before to_csv() it was computed and silently discarded.
hits_x_cluster.to_csv("hits_per_cluster.csv", index=False)
hits_x_cluster.sort_values(by="db_hit_count", ascending=False).head()
In [7]:
# Count, for each database, the distinct (cluster, db) pairs present in the
# combined hit table.
unique_cluster_db = all_filt_hits[["cluster", "db"]].drop_duplicates()
unique_cluster_db.groupby("db").count()
Out[7]:
In [23]:
def _clusters_hit_in(hits, db_name):
    """Return the set of distinct ``cluster`` values among the rows of ``hits``
    whose ``db`` column equals ``db_name``."""
    in_db = hits.db == db_name
    return set(hits[in_db]["cluster"].drop_duplicates().tolist())


# NOTE: despite the ``_list`` suffix, each of these names is bound to a set.
metahit_cds_list = _clusters_hit_in(all_filt_hits, "metahit_cds")
metahit_pep_list = _clusters_hit_in(all_filt_hits, "metahit_pep")
env_nt_list = _clusters_hit_in(all_filt_hits, "env_nt")
In [31]:
# Print every cluster that has an env_nt hit, one per line.
# env_nt_list is a set, and iterating a set directly yields an order that can
# differ between kernel sessions; sorting makes the output reproducible.
# key=str keeps the sort from raising if the identifiers are not all one type.
for cluster_id in sorted(env_nt_list, key=str):
    print(cluster_id)
In [ ]: