In [1]:
from ClassifyContigNR import *
import tempfile
import pandas as pd
In [2]:
bed_content_lists = [["contig1","1","816","PROKKA_MOD_PART0_00001"]]
blast_result_content_lists_single = [["PROKKA_MOD_PART0_00001","WP_066153451.1","89.0","271","48","0","1","272","179","449","6.7e-131","475.7"]]
blast_result_content_lists = [["PROKKA_MOD_PART0_00001","WP_066153451.1","69.0","271","48","0","1","272","179","449","6.7e-131","475.7"],
                              ["PROKKA_MOD_PART0_00001","WP_056663091.1","69.0","271","50","0","1","272","179","449","8.7e-131","475.3"],
                              ["PROKKA_MOD_PART0_00001","WP_009516566.1","69.0","271","47","0","1","272","179","449","1.5e-130","474.6"]]
lineage_content_lists = [["545","Bacteria","Proteobacteria","Gammaproteobacteria","Enterobacterales","Enterobacteriaceae","Citrobacter","Citrobacter koseri"],
                         ["562","Bacteria","Proteobacteria","Gammaproteobacteria","Enterobacterales","Enterobacteriaceae","Escherichia","Escherichia coli"],
                         ["666","Bacteria","Proteobacteria","Gammaproteobacteria","Vibrionales","Vibrionaceae","Vibrio","Vibrio cholerae"]]
### Completely Fake ###
protein_to_taxid_lists = [["acc","ver","taxid","xx"],
                          ["WP_066153451","WP_066153451.1","545","---"],
                          ["WP_056663091","WP_056663091.1","562","---"],
                          ["WP_009516566","WP_009516566.1","666","---"]]
In [3]:
def fake_file(content):
    # Write the string to a named temporary file that can be re-opened by
    # path; returning the handle keeps the file from being deleted.
    fh = tempfile.NamedTemporaryFile(mode='w', encoding='utf8')
    fh.write(content)
    fh.flush()  # make sure the content is on disk before re-opening by name
    return fh
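fake_file keeps the returned NamedTemporaryFile handle alive so the file is not deleted while it is still needed, and the flush() makes the content visible when the path is re-opened. A quick illustrative round trip (hypothetical, not part of the original notebook; re-opening by name like this works on POSIX systems, but would need delete=False on Windows):

tmp = fake_file("contig1\t1\t816\tPROKKA_MOD_PART0_00001")
with open(tmp.name) as fh:
    print(fh.read())   # prints the single bed line written above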
In [4]:
def preprocess_data(bed_content_lists, blast_result_content_lists, lineage_content_lists, protein_to_taxid_lists):
    # Join the list-of-lists test data into tab-separated file contents.
    bed_content = "\n".join("\t".join(items) for items in bed_content_lists)
    blast_result_content = "\n".join("\t".join(items) for items in blast_result_content_lists)
    lineage_content = "\n".join("\t".join(items) for items in lineage_content_lists)
    protein_to_taxid_content = "\n".join("\t".join(items) for items in protein_to_taxid_lists)

    # Gene coordinates: per-gene lengths, genes per contig, contig lengths.
    fake_bed_file = fake_file(bed_content)
    with open(fake_bed_file.name, 'r') as fh:
        (lengths, contigGenes, contigLengths) = read_bed_lines(fh)

    # Parse the BLAST tabular hits for these genes (flag and 40.0 cutoff kept from the original call).
    fake_blast_file = fake_file(blast_result_content)
    with open(fake_blast_file.name, 'r') as fh:
        matches, gids = read_blast_lines(fh, lengths, True, 40.0)

    # Load the taxid lineages (and the accompanying mapBack lookup).
    fake_lineage_file = fake_file(lineage_content)
    with open(fake_lineage_file.name, 'r') as fh:
        lineages, mapBack = read_lineage_lines(fh)

    # Map the hit protein accessions (gids) to taxids.
    fake_prot_file = fake_file(protein_to_taxid_content)
    with open(fake_prot_file.name, 'r') as fh:
        mapping = map_accessions(gids, fh)

    # Identity thresholds handed to assign_taxonomy (presumably one per taxonomic rank).
    taxa_identities = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
    return matches, mapping, mapBack, lineages, lengths, contigGenes, taxa_identities
In [5]:
matches, mapping, mapBack, lineages, lengths, contigGenes, taxa_identities = \
    preprocess_data(bed_content_lists, blast_result_content_lists_single, lineage_content_lists, protein_to_taxid_lists)
In [6]:
contigAssign, geneAssign = assign_taxonomy(matches, mapping, mapBack, lineages, lengths, contigGenes, 0.5, taxa_identities)
In [7]:
pd.DataFrame.from_dict(geneAssign, orient='index')
Out[7]:
In [8]:
matches, mapping, mapBack, lineages, lengths, contigGenes, taxa_identities = \
    preprocess_data(bed_content_lists, blast_result_content_lists, lineage_content_lists, protein_to_taxid_lists)
contigAssign, geneAssign = assign_taxonomy(matches, mapping, mapBack, lineages, lengths, contigGenes, 0.5, taxa_identities)
pd.DataFrame.from_dict(geneAssign, orient='index')
Out[8]:
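assign_taxonomy also returns contig-level assignments alongside the per-gene ones shown above. Assuming contigAssign is, like geneAssign, a plain dict keyed by contig name, it can be inspected the same way:

pd.DataFrame.from_dict(contigAssign, orient='index')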