In [2]:
# List the files in ../../1_find_poophage/2_hmmer/ via an IPython shell escape.
!ls ../../1_find_poophage/2_hmmer/


1_extract_poophage_reads.sh	454_seqs_poophage_PfamA.tbl
2_hmmscan_to_pfam.sh		454_seqs_poophage_pVOG.domtbl
2_hmmscan_to_pvog.sh		454_seqs_poophage_pVOG.out
2_hmmscan_to_vogdb.sh		454_seqs_poophage_pVOG.tbl
454_seqs_poophage.fa		454_seqs_poophage_vogdb.domtbl
454_seqs_poophage_PfamA.domtbl	454_seqs_poophage_vogdb.out
454_seqs_poophage_PfamA.out	454_seqs_poophage_vogdb.tbl

In [9]:
import pandas as pd

Aim:

See how many poophage sequences are classified as viral by kaiju


In [26]:
# 1. Load poophage sequence IDs from the FASTA headers.
#
# A FASTA header is ">" + ID, optionally followed by whitespace and a free-text
# description. Only the first whitespace-delimited token is kept, so the IDs
# compare equal to the bare read names listed in the Kaiju table's "seqid"
# column. Splitting on whitespace also drops a trailing "\r" from CRLF files.
# NOTE(review): headers carrying a description would previously never match a
# bare read name -- confirm against the actual FASTA headers.
poophage_seqs = []
with open("../../1_find_poophage/2_hmmer/454_seqs_poophage.fa") as fh:
    for line in fh:
        if not line.startswith(">"):
            continue  # sequence line, not a header
        header_fields = line.lstrip(">").split(None, 1)
        if header_fields:  # skip malformed headers that carry no ID at all
            poophage_seqs.append(header_fields[0])

# Report the number of header records before de-duplication.
print(len(poophage_seqs))
# Frozen set: constant-time membership tests and no accidental mutation later.
poophage_seqs = frozenset(poophage_seqs)


1147

In [23]:
# Read the Kaiju table from ./454_seqs_kaiju.names.virus.txt.
# The file is tab-separated and has no header row, so column names are
# supplied explicitly.
kaiju_virus_df = pd.read_csv(
    "./454_seqs_kaiju.names.virus.txt",
    sep="\t",
    header=None,
    names=[
        "classified",
        "seqid",
        "taxid",
        "tax",
    ],
)
# Preview the first rows.
kaiju_virus_df.head()


Out[23]:
classified seqid taxid tax
0 C FTSPZO101CHNQ7 196894 Viruses; Siphoviridae; NA;
1 C DBA-SLE_c5570 1341019 Viruses; Parvoviridae; Parvovirus NIH-CQV;
2 C GB3LKKR01ED1RA 1792245 Viruses; Myoviridae; Bacillus virus Deepblue;
3 C GB3LKKR01DUZ77 948870 Viruses; Myoviridae; Enterobacteria phage phi92;
4 C FTSPZO101E24BN 1608451 Viruses; NA; Phytophthora parasitica virus;

In [31]:
# Shape of the subset of Kaiju rows whose "seqid" is also a poophage sequence ID.
# A result with 0 rows means the two sets share no IDs; if that is unexpected,
# check that both sides use the same ID format (e.g. bare read name vs. full
# FASTA header).
# Series.isin performs the membership test vectorized instead of calling a
# Python lambda once per row.
kaiju_virus_df[kaiju_virus_df["seqid"].isin(poophage_seqs)].shape


Out[31]:
(0, 4)

In [ ]: