In [1]:
!head 454_seqs_kaiju.names.virus.txt


C	FTSPZO101CHNQ7	196894	Viruses; Siphoviridae; NA; 
C	DBA-SLE_c5570	1341019	Viruses; Parvoviridae; Parvovirus NIH-CQV; 
C	GB3LKKR01ED1RA	1792245	Viruses; Myoviridae; Bacillus virus Deepblue; 
C	GB3LKKR01DUZ77	948870	Viruses; Myoviridae; Enterobacteria phage phi92; 
C	FTSPZO101E24BN	1608451	Viruses; NA; Phytophthora parasitica virus; 
C	FTSPZO101CESZR	1685734	Viruses; NA; Lake Sarah-associated circular molecule 9; 
C	is_serum_s3990	1608451	Viruses; NA; Phytophthora parasitica virus; 
C	GB3LKKR01D8JD7	1655645	Viruses; Microviridae; Parabacteroides phage YZ-2015b; 
C	FS22EC101DGI0A	1458716	Viruses; Podoviridae; Vibrio phage CHOED; 
C	GB3LKKR01A2IV8	1655645	Viruses; Microviridae; Parabacteroides phage YZ-2015b; 

In [2]:
%matplotlib inline
import pandas as pd
import seaborn as sns
import editdistance
import itertools

In [3]:
#The file stores the annotation for each read with some Virus annotation
viruses = []
with open("454_seqs_kaiju.names.virus.txt") as fh:
    for line in fh:
        viruses.append([x.rstrip(" \t").lstrip(" \t") for x in line.rstrip("\n").split(";")[1:-1]])
        
read_annot = pd.DataFrame.from_records(viruses,columns=["family","species"])
read_annot.head()


Out[3]:
family species
0 Siphoviridae NA
1 Parvoviridae Parvovirus NIH-CQV
2 Myoviridae Bacillus virus Deepblue
3 Myoviridae Enterobacteria phage phi92
4 NA Phytophthora parasitica virus

In [4]:
read_annot["species_fillna"] = read_annot.apply(lambda r: "unclassified {}".format(r["family"]) if r["species"] == "NA" else r["species"],axis=1)
read_annot.head()


Out[4]:
family species species_fillna
0 Siphoviridae NA unclassified Siphoviridae
1 Parvoviridae Parvovirus NIH-CQV Parvovirus NIH-CQV
2 Myoviridae Bacillus virus Deepblue Bacillus virus Deepblue
3 Myoviridae Enterobacteria phage phi92 Enterobacteria phage phi92
4 NA Phytophthora parasitica virus Phytophthora parasitica virus

Combine similar annotations into a single 'clade'

E.g Enterobacteria phage phi92 and phi91 should be grouped


In [5]:
editdist_sp = [ (sp1,sp2,editdistance.eval(sp1,sp2)) for sp1,sp2 in itertools.combinations(read_annot["species_fillna"].unique(),2) ]
editdist_df = pd.DataFrame.from_records(editdist_sp,columns=["sp1","sp2","edit_distance"])
editdist_df["similarity"] = editdist_df.apply(lambda r: (max(len(r["sp1"]),len(r["sp2"]))-r["edit_distance"])/ max(len(r["sp1"]),len(r["sp2"])),axis=1)
editdist_df.head()


Out[5]:
sp1 sp2 edit_distance similarity
0 unclassified Siphoviridae Parvovirus NIH-CQV 23 0.080000
1 unclassified Siphoviridae Bacillus virus Deepblue 21 0.160000
2 unclassified Siphoviridae Enterobacteria phage phi92 22 0.153846
3 unclassified Siphoviridae Phytophthora parasitica virus 26 0.103448
4 unclassified Siphoviridae Lake Sarah-associated circular molecule 9 31 0.243902

In [6]:
cluster_membership = {}
clusters = []

#Single linkage clustering
for _,row in editdist_df[editdist_df.similarity > 0.65 ].sort_values("similarity",ascending=False).iterrows():
    if row["sp1"] not in cluster_membership and row["sp2"] not in cluster_membership:
        #Create new cluster and add them both
        clusters.append([row["sp1"],row["sp2"]])
        cluster_membership[row["sp1"]] = len(clusters)-1
        cluster_membership[row["sp2"]] = len(clusters)-1
    elif row["sp1"] in cluster_membership:
        if row["sp2"] not in cluster_membership:
            #Add sp2 to sp1 cluster
            clusters[cluster_membership[row["sp1"]]].append(row["sp2"])
            cluster_membership[row["sp2"]] = cluster_membership[row["sp1"]]
        else:
            if cluster_membership[row["sp1"]] != cluster_membership[row["sp2"]]:
                #Combine clusters!
                c1_pos = cluster_membership[row["sp1"]]
                c2_pos = cluster_membership[row["sp2"]]
                clusters[c1_pos] += clusters[c2_pos]
                for c in clusters[c2_pos]:
                    cluster_membership[c] = c1_pos
                clusters[c2_pos] = None
    else:
        #Row 2 is in cluster and row1 is not
        clusters[cluster_membership[row["sp2"]]].append(row["sp1"])
        cluster_membership[row["sp1"]] = cluster_membership[row["sp2"]]

In [7]:
cluster_names = [";".join(c) if c else None for c in clusters]

Use new clade groups


In [8]:
read_annot["sp_group"] = read_annot["species_fillna"].apply(lambda x: cluster_names[cluster_membership[x]] if x in cluster_membership and "unclassified" not in x else x)
df_counts = read_annot.groupby("sp_group").size().reset_index()
df_counts.columns = ["species","read_count"]
df_counts[df_counts.read_count >= 3].sort_values("read_count",ascending=False)


Out[8]:
species read_count
71 Parabacteroides phage YZ-2015b 134
49 Gokushovirinae Bog5712_52;Gokushovirinae Bog89... 105
41 Enterobacteria phage phi92 56
80 Phytophthora parasitica virus 55
73 Parvovirus NIH-CQV 43
132 uncultured crAssphage 43
131 unclassified Siphoviridae 36
126 unclassified NA 31
32 Croceibacter phage P2559Y 28
84 Pseudomonas phage PMG1;Pseudomonas phage PS-1;... 27
125 unclassified Myoviridae 27
112 Vibrio phage CHOED;Vibrio phage SIO-2;Vibrio p... 27
20 Burkholderia virus BcepF1 20
123 unclassified Microviridae 20
26 Chimpanzee faeces associated microphage 2;Chim... 18
105 Synechococcus phage S-CAM8;Synechococcus phage... 18
96 Sewage-associated gemycircularvirus 1;Sewage-a... 18
94 Salmonella virus SP31;Salmonella virus SPN1S;S... 15
24 Cellulophaga phage phi19:1;Cellulophaga phage ... 13
64 Mycobacterium phage Sparky;Mycobacterium phage... 12
9 Bacillus virus G;Bacillus virus GA1;Bacillus v... 11
129 unclassified Podoviridae 10
25 Changjiang sobemo-like virus 1 9
89 Rhizobium phage 16-3;Rhizobium phage RHEph10;R... 9
128 unclassified Phycodnaviridae 8
88 Rhinovirus A;Rhinovirus C;Rotavirus A 7
63 Microviridae Fen7940_21;Microviridae Fen7918_21 6
70 Pandoravirus dulcis;Pandoravirus salinus;Pando... 6
34 Cronobacter phage S13;Caulobacter phage Cr30;C... 5
62 Marine gokushovirus 4
45 Escherichia virus Rtp;Escherichia virus EC6;Es... 4
16 Brevibacillus phage Sundance 4
50 Gordonia phage GMA4;Gordonia phage GMA6;Gordon... 4
1 Acinetobacter phage phiAC-1;Acinetobacter phag... 4
90 Rhodococcus phage REQ2;Rhodococcus phage RRH1;... 4
86 Ralstonia phage RSK1;Ralstonia phage RSP15;Ral... 4
81 Podovirus Lau218 4
8 Bacillus phage BCD7;Bacillus phage vB_BhaS-171... 4
102 Streptomyces phage phiHau3 3
97 Shewanella sp. phage 1/40;Shewanella sp. phage... 3
114 Vibrio virus VC8;Vibrio virus nt1 3
44 Escherichia phage N4;Escherichia phage phAPEC8 3
3 Aeromonas phage pAh6-C;Aeromonas phage vB_AsaM... 3
65 Nitrincola phage 1M3-16 3

In [9]:
virus_count_histogram = df_counts[df_counts.read_count >= 3].groupby("read_count").size().reset_index()
sns.barplot(x=virus_count_histogram["read_count"],y=virus_count_histogram[0])


Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x116723668>

In [12]:
df_counts[df_counts.read_count >= 3].sort_values("read_count",ascending=False).to_csv("454_seqs_kaiju.filt_species.tsv",sep="\t",index=False)

In [14]:
!ls


454_seqs_kaiju.filt_species.tsv  454_seqs_kaiju.species.report
454_seqs_kaiju.genus.report      454_seqs_kaiju.txt
454_seqs_kaiju.names.txt         kaiju_summarize_virus_hits.ipynb
454_seqs_kaiju.names.virus.txt   poophage_in_kaiju.ipynb
454_seqs_kaiju.report            run_kaiju.sh

In [ ]: