In [1]:
# Show the directory layout of the project; the input files read by later
# cells appear in this listing.
!tree


.
├── 1_get_hits_from_tools.ipynb
├── 1_merged_viruses.tsv
├── 2_set_intersection_viz.R
├── README.md
├── find_viruses_in_families
│   ├── 0_virus_fasta
│   │   ├── blastdb
│   │   │   ├── viruses.fasta
│   │   │   ├── viruses.nhr
│   │   │   ├── viruses.nin
│   │   │   └── viruses.nsq
│   │   ├── crassphage.fasta
│   │   ├── enterobacteria_phage_phi92.fasta
│   │   └── parvovirus-nih_cqv.fasta
│   ├── 1_blastn
│   │   ├── asn
│   │   │   └── orfan_to_viruses.asn
│   │   ├── plots
│   │   └── tsv
│   │       ├── blast_tsv_columns.txt
│   │       ├── orfan_to_viruses.tsv
│   │       └── orfan_to_viruses_filt.tsv
│   ├── 1_orfans_to_viruses.sh
│   ├── blastn.mak
│   ├── plot_blastn_hits.py
│   └── plot_blastn_tsv_hits.py
├── kaiju
│   ├── kaiju_greedy
│   │   ├── 454_seqs_kaiju_greedy.family.report
│   │   ├── 454_seqs_kaiju_greedy.filt_species.tsv
│   │   ├── 454_seqs_kaiju_greedy.genus.report
│   │   ├── 454_seqs_kaiju_greedy.names.txt
│   │   ├── 454_seqs_kaiju_greedy.names.virus.txt
│   │   ├── 454_seqs_kaiju_greedy.species.report
│   │   ├── 454_seqs_kaiju_greedy.txt
│   │   ├── kaiju_greedy_summarize_virus_hits.ipynb
│   │   └── summarize_virus_hits.ipynb
│   ├── kaiju_mem
│   │   ├── 454_seqs_kaiju.genus.report
│   │   ├── 454_seqs_kaiju.names.txt
│   │   ├── 454_seqs_kaiju.names.virus.txt
│   │   ├── 454_seqs_kaiju.report
│   │   ├── 454_seqs_kaiju.species.report
│   │   ├── 454_seqs_kaiju.txt
│   │   └── kaiju_mem_summarize_virus_hits.ipynb
│   ├── poophage_in_kaiju.ipynb
│   ├── run_kaiju_greedy.sh
│   └── run_kaiju_mem.sh
├── kraken
│   ├── 454_seqs_kraken.out
│   ├── 454_seqs_kraken_filt.out
│   ├── 454_seqs_kraken_filt.report
│   ├── _filt.virus.report
│   ├── kraken.log
│   └── run_kraken.sh
└── metaphlan2
    ├── 454_reads_mpa2.bowtie2.bz2
    ├── 454_reads_mpa2.sam
    ├── 454_reads_mpa2.txt
    ├── get_marker_taxon_annot.ipynb
    ├── markers_to_taxons.csv
    └── run_mpa2.sh

12 directories, 50 files

In [2]:
import pandas as pd

In [3]:
# Load the filtered, virus-only Kraken report. header=None tells pandas the
# file has no header row, so the column names are supplied here.
kraken = pd.read_csv(
    "kraken/_filt.virus.report",
    sep="\t",
    header=None,
    names=["pct_sum", "reads_sum", "reads_assigned", "tax_level", "taxid", "taxname"],
)
# Remove leading/trailing tabs and spaces from the taxon names. The vectorised
# string accessor leaves a missing name as NaN instead of raising.
kraken["taxname"] = kraken["taxname"].str.strip("\t ")
# Display only the taxa that have reads assigned directly to them.
kraken[kraken.reads_assigned > 0]


Out[3]:
pct_sum reads_sum reads_assigned tax_level taxid taxname
5 0.0 3 3 S 1341019 Parvovirus NIH-CQV
8 0.0 1 1 S 93678 TTV-like mini virus
11 0.0 2 2 S 1211417 uncultured phage crAssphage
16 0.0 1 1 S 948870 Enterobacteria phage phi92
18 0.0 1 1 - 196894 unclassified Siphoviridae

In [4]:
# Load the filtered species-level table produced from the kaiju "greedy" run.
kaiju = pd.read_csv(
    "kaiju/kaiju_greedy/454_seqs_kaiju_greedy.filt_species.tsv",
    sep="\t",
)
print(kaiju.shape)  # (rows, columns) as a quick sanity check
kaiju


(16, 2)
Out[4]:
species read_count
0 Enterobacteria phage phi92 52
1 uncultured crAssphage 34
2 Parabacteroides phage YZ-2015b;Parabacteroides... 24
3 Parvovirus NIH-CQV 18
4 Phytophthora parasitica virus 18
5 Gokushovirinae Fen672_31;Gokushovirinae Fen787... 10
6 Sewage-associated gemycircularvirus 11;Sewage-... 5
7 Chimpanzee faeces associated microphage 2;Chim... 4
8 Salmonella virus SP31 4
9 unclassified NA 4
10 Pseudomonas virus NP1 3
11 Rhizobium phage RHEph10;Rhizobium phage vB_Rgl... 3
12 Croceibacter phage P2559Y 2
13 Pseudomonas phage PAJU2;Pseudomonas phage phiP... 2
14 TTV-like mini virus 2
15 unclassified Siphoviridae 2

In [5]:
# Read the MetaPhlAn2 profile and give its two columns short names.
mpa2 = pd.read_csv(
    "metaphlan2/454_reads_mpa2.txt",
    sep="\t",
)
mpa2.columns = ["full_tax", "rel_ab"]

# The clade is the last "|"-separated field of the full taxonomy string.
mpa2["clade"] = mpa2["full_tax"].apply(
    lambda lineage: lineage.rsplit("|", 1)[-1]
)

# Keep only the rows whose clade carries the "s__" prefix.
mpa2_sp = mpa2[
    mpa2["clade"].apply(lambda clade: clade.startswith("s__"))
].copy()
mpa2_sp


Out[5]:
full_tax rel_ab clade
6 k__Viruses|p__Viruses_noname|c__Viruses_noname... 100.0 s__Parvovirus_NIH_CQV

Homogenize tool output


In [6]:
import functools
# Keyword tuple -> canonical clade name. A name is rewritten to the canonical
# form when it contains every keyword of a tuple (case-insensitive substring
# match), so the different spellings used by the tools collapse to one label.
replacements = {
    ("crassphage",): "uncultured crAssphage",
    ("parvovirus", "nih", "cqv"): "Parvovirus NIH-CQV",
}


def replacement_fx(clade_name):
    """Return the standardised name for ``clade_name``.

    The name is lower-cased once and compared against each keyword tuple in
    ``replacements``; the first tuple whose keywords all occur as substrings
    determines the result.

    Parameters
    ----------
    clade_name : str
        Clade/taxon name as reported by one of the tools.

    Returns
    -------
    str
        The canonical name from ``replacements`` when a rule matches,
        otherwise ``clade_name`` unchanged. Non-string values (for example a
        missing value coming from pandas) are returned untouched rather than
        raising ``AttributeError``.
    """
    if not isinstance(clade_name, str):
        return clade_name

    lowered = clade_name.lower()
    for keywords, canonical in replacements.items():
        if all(word in lowered for word in keywords):
            return canonical
    return clade_name

In [7]:
# Bring the MetaPhlAn2 species table into the shared
# (clade, abundance, tool) layout used for merging.
mpa2_sp = (
    mpa2_sp
    .assign(std_cladename=mpa2_sp["clade"].apply(replacement_fx))
    .loc[:, ["std_cladename", "rel_ab"]]
    .rename(columns={"std_cladename": "clade", "rel_ab": "abundance"})
    .assign(tool="metaphlan2")
)

In [8]:
# Standardise the Kraken names, keep only taxa with directly assigned reads,
# and bring the table into the shared (clade, abundance, tool) layout.
kraken = (
    kraken
    .assign(std_cladename=kraken["taxname"].apply(replacement_fx))
    .loc[lambda report: report.reads_assigned > 0, ["std_cladename", "reads_assigned"]]
    .rename(columns={"std_cladename": "clade", "reads_assigned": "abundance"})
    .assign(tool="kraken")
)

In [9]:
# Standardise the kaiju species names and bring the table into the shared
# (clade, abundance, tool) layout.
kaiju = (
    kaiju
    .assign(std_cladename=kaiju["species"].apply(replacement_fx))
    .loc[:, ["std_cladename", "read_count"]]
    .rename(columns={"std_cladename": "clade", "read_count": "abundance"})
    .assign(tool="kaiju")
)

In [10]:
# Hits attributed to the ORFan approach, entered by hand as
# [clade, abundance] pairs.
orfan_hits = [
    ["TTV-like mini virus", 1],
    ["Phytophthora parasitica virus", 1],
    ["uncultured POOphage", 1],
]

# Same (clade, abundance, tool) layout as the other per-tool tables.
orfan_method = pd.DataFrame.from_records(
    orfan_hits,
    columns=["clade", "abundance"],
).assign(tool="ORFan")

In [11]:
# Stack the four per-tool tables row-wise into one long table with a fresh
# 0..n-1 index.
# NOTE(review): `abundance` holds MetaPhlAn2 `rel_ab` values next to read
# counts from the other tools, so the numbers are presumably not comparable
# across tools - confirm before using them quantitatively.
merged_viruses = pd.concat(
    [mpa2_sp, kraken, kaiju, orfan_method],
    axis=0,
    ignore_index=True,
)

# Save the merged table as a tab-separated file without the index column.
merged_viruses.to_csv(
    "1_merged_viruses.tsv",
    sep="\t",
    index=False,
)
merged_viruses


Out[11]:
clade abundance tool
0 Parvovirus NIH-CQV 100.0 metaphlan2
1 Parvovirus NIH-CQV 3.0 kraken
2 TTV-like mini virus 1.0 kraken
3 uncultured crAssphage 2.0 kraken
4 Enterobacteria phage phi92 1.0 kraken
5 unclassified Siphoviridae 1.0 kraken
6 Enterobacteria phage phi92 52.0 kaiju
7 uncultured crAssphage 34.0 kaiju
8 Parabacteroides phage YZ-2015b;Parabacteroides... 24.0 kaiju
9 Parvovirus NIH-CQV 18.0 kaiju
10 Phytophthora parasitica virus 18.0 kaiju
11 Gokushovirinae Fen672_31;Gokushovirinae Fen787... 10.0 kaiju
12 Sewage-associated gemycircularvirus 11;Sewage-... 5.0 kaiju
13 Chimpanzee faeces associated microphage 2;Chim... 4.0 kaiju
14 Salmonella virus SP31 4.0 kaiju
15 unclassified NA 4.0 kaiju
16 Pseudomonas virus NP1 3.0 kaiju
17 Rhizobium phage RHEph10;Rhizobium phage vB_Rgl... 3.0 kaiju
18 Croceibacter phage P2559Y 2.0 kaiju
19 Pseudomonas phage PAJU2;Pseudomonas phage phiP... 2.0 kaiju
20 TTV-like mini virus 2.0 kaiju
21 unclassified Siphoviridae 2.0 kaiju
22 TTV-like mini virus 1.0 ORFan
23 Phytophthora parasitica virus 1.0 ORFan
24 uncultured POOphage 1.0 ORFan

In [ ]: