In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import re
import itertools
import matplotlib_venn

In [3]:
#Load blast best hits
filt_blast_hits = pd.read_csv("filtered_blast_hits.csv")
print(filt_blast_hits.shape)
#Load hmmer best hits
filt_hmmer_hits = pd.read_csv("filtered_hmmer_hits.csv")
print(filt_hmmer_hits.shape)

all_filt_hits = pd.concat([filt_blast_hits,filt_hmmer_hits],axis=0)
#Load cluster stats
clusters = pd.read_csv("cluster_stats.csv")


(66, 11)
(29, 9)

In [4]:
print(all_filt_hits.shape)
all_filt_hits.head()


(95, 12)
Out[4]:
best_dmn_e_value bitscore cluster db e_value pct_id q_cov q_len query_id s_description subject_id tool
0 NaN 479.0 1073 env_nt 1.000000e-131 98.180 99.0 276.0 GB3LKKR01A150I gut metagenome genome assembly P2E0-k21-2014-0... gi|936108378|emb|CEAB01076172.1| blastn
1 NaN 444.0 1073 metahit_cds 1.010000e-122 95.971 96.0 285.0 contig07331 GL0080651_MH0011_[Complete]_[mRNA]_locus=scaff... GL0080651_MH0011_[Complete]_[mRNA]_locus=scaff... blastn
2 NaN 159.0 1073 metahit_pep 3.240000e-48 97.727 96.0 92.0 contig07331 GL0080651_MH0011_[Complete]_[mRNA]_locus=scaff... GL0080651_MH0011_[Complete]_[mRNA]_locus=scaff... blastp
3 NaN 165.0 113b metahit_cds 2.240000e-39 98.913 100.0 92.0 GB3LKKR01C5IFF GL0006538_MH0023_[Complete]_[mRNA]_locus=C1324... GL0006538_MH0023_[Complete]_[mRNA]_locus=C1324... blastn
4 NaN 100.0 113b env_nt 3.000000e-18 87.360 89.0 98.0 GB3LKKR02F10X4 Gut metagenome Scaffold6187_1, whole genome sh... gi|557595202|gb|AVOA01007617.1| blastn

In [5]:
print(clusters.shape)
clusters.head()


(32, 23)
Out[5]:
Cluster # nr seqs # raw seqs Score log RNAcode score P ORF length (aa) log ORF length complexity min ... min.1 avg.1 max.1 # seqs with max length composite score DNA alignment protein alignment RNAcode figure cluster_id sample_origin
0 1217 6 6 41.974 1.62 7.704e–10 91 1.96 hi 3.79 ... 38 73.67 87 2 7.57 <a href="all_nt_aln/cluster1217_DNA.aln.txt">D... <a href="all_aa_aln/cluster1217_AA.aln.txt">AA... <a href="rnacode_all/cluster1217_eps/hss-0.eps... 1217 feces
1 339b 7 11 33.153 1.52 1.069e–06 110 2.04 hi 3.89 ... 63 79.14 106 2 7.52 <a href="all_nt_aln/cluster339b_DNA.aln.txt">D... <a href="all_aa_aln/cluster339b_AA.aln.txt">AA... <a href="rnacode_all/cluster339b_eps/hss-0.eps... 339b feces
2 211b 7 22 42.949 1.63 3.364e–14 80 1.90 hi 3.72 ... 31 62.00 73 2 7.49 <a href="all_nt_aln/cluster211b_DNA.aln.txt">D... <a href="all_aa_aln/cluster211b_AA.aln.txt">AA... <a href="rnacode_all/cluster211b_eps/hss-0.eps... 211b feces
3 562 7 9 39.417 1.60 4.592e–07 61 1.79 hi 3.78 ... 29 49.57 59 8 7.39 <a href="all_nt_aln/cluster562_DNA.aln.txt">DN... <a href="all_aa_aln/cluster562_AA.aln.txt">AA</a> <a href="rnacode_all/cluster562_eps/hss-0.eps"... 562 CSF,serum
4 297a 5 14 26.891 1.43 0.001 106 2.03 hi 3.90 ... 59 96.60 106 8 7.38 <a href="all_nt_aln/cluster297a_DNA.aln.txt">D... <a href="all_aa_aln/cluster297a_AA.aln.txt">AA... <a href="rnacode_all/cluster297a_eps/hss-0.eps... 297a feces

5 rows × 23 columns

Stats

Hits per cluster


In [8]:
tmp = [(cluster,hits.shape[0],";".join(hits.db)) for cluster,hits in ( all_filt_hits[["cluster","db"]].drop_duplicates().groupby("cluster") )]
hits_x_cluster = pd.DataFrame.from_records(tmp,columns=["cluster","db_hit_count","dbs"])
print("Cluster families with no annotation: {}".format(clusters.shape[0]-hits_x_cluster.shape[0]))
hits_x_cluster = hits_x_cluster.merge(clusters,left_on="cluster",right_on="Cluster")[["cluster","db_hit_count","dbs","sample_origin"]]
hits_x_cluster.sort_values(by="db_hit_count",ascending=False).head()
hits_x_cluster.to_csv("hits_per_cluster.csv",index=False)


Cluster families with no annotation: 3

Hits per database


In [7]:
all_filt_hits[["cluster","db"]].drop_duplicates().groupby("db").count()


Out[7]:
cluster
db
env_nr 2
env_nt 26
hmp_pep 1
metahit_cds 18
metahit_pep 22
nr 4

In [23]:
metahit_cds_list = set(all_filt_hits[all_filt_hits.db == "metahit_cds"]["cluster"].drop_duplicates().tolist())
metahit_pep_list = set(all_filt_hits[all_filt_hits.db == "metahit_pep"]["cluster"].drop_duplicates().tolist())
env_nt_list = set(all_filt_hits[all_filt_hits.db == "env_nt"]["cluster"].drop_duplicates().tolist())

In [31]:
for x in env_nt_list:
    print(x)


241a
540
1073
348
211b
457
612a
339b
182a
211a
532
258
321b
1217
297a
406b
589
787b
182b
179b
297b
179a
241b
502
375a
113b

In [ ]: