In [1]:
from ClassifyContigNR import *
import tempfile
import pandas as pd

Test Data


In [2]:
bed_content_lists = [["contig1","1","816","PROKKA_MOD_PART0_00001"]]

blast_result_content_lists_single = [["PROKKA_MOD_PART0_00001","WP_066153451.1","89.0","271","48","0","1","272","179","449","6.7e-131","475.7"]]

blast_result_content_lists = [["PROKKA_MOD_PART0_00001","WP_066153451.1","69.0","271","48","0","1","272","179","449","6.7e-131","475.7"],
                              ["PROKKA_MOD_PART0_00001","WP_056663091.1","69.0","271","50","0","1","272","179","449","8.7e-131","475.3"],
                              ["PROKKA_MOD_PART0_00001","WP_009516566.1","69.0","271","47","0","1","272","179","449","1.5e-130","474.6"]]

lineage_content_lists = [["545","Bacteria","Proteobacteria","Gammaproteobacteria","Enterobacterales","Enterobacteriaceae","Citrobacter","Citrobacter koseri"],
                         ["562","Bacteria","Proteobacteria","Gammaproteobacteria","Enterobacterales","Enterobacteriaceae","Escherichia","Escherichia coli"],
                         ["666","Bacteria","Proteobacteria","Gammaproteobacteria","Vibrionales","Vibrionaceae","Vibrio","Vibrio cholerae"]]

### Completely Fake ###
protein_to_taxid_lists = [["acc","ver","taxid","xx"],
                          ["WP_066153451","WP_066153451.1","545","---"],
                          ["WP_009516566","WP_056663091.1","562","---"],
                          ["WP_009516566","WP_009516566.1","666","---"]]

Methods to load test data


In [3]:
def fake_file(content):
    fh = tempfile.NamedTemporaryFile(mode='w', encoding='utf8')
    fh.write(content)
    fh.seek(0)
    return fh

In [4]:
def preprocess_data(bed_content_lists, blast_result_content_lists, lineage_content_lists, protein_to_taxid_lists):
    bed_content="\n".join("\t".join(items) for items in bed_content_lists)

    blast_result_content = "\n".join(["\t".join(items) for items in blast_result_content_lists])
    blast_result_content_single = "\n".join(["\t".join(items) for items in blast_result_content_lists])

    lineage_content = "\n".join(["\t".join(items) for items in lineage_content_lists])

    protein_to_taxid_content = "\n".join(["\t".join(items) for items in protein_to_taxid_lists])
    
    fake_bed_file = fake_file(bed_content)
    with open(fake_bed_file.name, 'r') as fh:
        fh.seek(0)
        (lengths, contigGenes, contigLengths) = read_bed_lines(fh)
        
    fake_blast_file = fake_file(blast_result_content)
    with open(fake_blast_file.name, 'r') as fh:
        fh.seek(0)
        matches, gids = read_blast_lines(fh, lengths, True, 40.0)
        
    fake_lineage_file = fake_file(lineage_content)
    with open(fake_lineage_file.name, 'r') as fh:
        fh.seek(0)
        lineages, mapBack = read_lineage_lines(fh)
        
    fake_prot_file = fake_file(protein_to_taxid_content)
    with open(fake_prot_file.name, 'r') as fh:
        fh.seek(0)
        mapping = map_accessions(gids, fh)
        
    taxa_identities = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
    
    return matches, mapping, mapBack, lineages, lengths, contigGenes, taxa_identities

Run with only one hit


In [5]:
matches, mapping, mapBack, lineages, lengths, contigGenes, taxa_identities = \
    preprocess_data(bed_content_lists, blast_result_content_lists_single, lineage_content_lists, protein_to_taxid_lists)


0.89 WP_066153451.1

In [6]:
contigAssign, geneAssign = assign_taxonomy(matches, mapping, mapBack, lineages, lengths, contigGenes, 0.5, taxa_identities)

In [7]:
pd.DataFrame.from_dict(geneAssign, orient='index')


Out[7]:
0 1 2 3 4 5 6
PROKKA_MOD_PART0_00001 (Bacteria, 1.0) (Proteobacteria, 1.0) (Gammaproteobacteria, 1.0) (Enterobacterales, 1.0) (Enterobacteriaceae, 1.0) (Unclassified, -1.0) (Unclassified, -1.0)

Run with three hits


In [8]:
matches, mapping, mapBack, lineages, lengths, contigGenes, taxa_identities = \
    preprocess_data(bed_content_lists, blast_result_content_lists, lineage_content_lists, protein_to_taxid_lists)
contigAssign, geneAssign = assign_taxonomy(matches, mapping, mapBack, lineages, lengths, contigGenes, 0.5, taxa_identities)
pd.DataFrame.from_dict(geneAssign, orient='index')


0.99 WP_066153451.1
0.99 WP_056663091.1
0.99 WP_009516566.1
Out[8]:
0 1 2 3 4 5 6
PROKKA_MOD_PART0_00001 (Bacteria, 1.0) (Proteobacteria, 1.0) (Gammaproteobacteria, 1.0) (Enterobacterales, 1.0) (Enterobacteriaceae, 0.6666666666666667) (Unclassified, -1.0) (Unclassified, -1.0)