In [1]:
from os.path import join, expandvars
from joblib import Parallel, delayed
from tax_credit.framework_functions import (runtime_make_test_data,
runtime_make_commands,
clock_runtime)
In [2]:
## project_dir should be the directory where you've downloaded (or cloned) the
## tax-credit repository.
project_dir = '../..'
data_dir = join(project_dir, "data")

# All runtime outputs (results file, temporary test data) live here.
results_dir = join(project_dir, 'temp_results_runtime')
runtime_results = join(results_dir, 'runtime_results.txt')
tmpdir = join(results_dir, 'tmp')

# Greengenes 13_8 reference database. Build the path from data_dir (rather
# than hard-coding 'data/...' off project_dir) so a single change to data_dir
# relocates every reference path consistently.
ref_db_dir = join(data_dir, 'ref_dbs', 'gg_13_8_otus')
ref_seqs = join(ref_db_dir, '99_otus_clean_515f-806r_trim250.fasta')
ref_taxa = join(ref_db_dir, '99_otu_taxonomy_clean.tsv')

# Timing iterations per command, and the sampling depths to test:
# a single sequence plus 2000..10000 in steps of 2000.
num_iters = 1
sampling_depths = [1] + list(range(2000, 10001, 2000))
In [3]:
runtime_make_test_data(ref_seqs, tmpdir, sampling_depths)
Import to qiime for q2-feature-classifier methods, train scikit-learn classifiers. We do not include the training step in the runtime analysis, because under normal operating conditions a reference dataset will be trained once, then re-used many times for any datasets that use the same marker gene (e.g., 16S rRNA). Separating the training step from the classification step was a conscious decision on the part of the designers, both to make classification as quick as possible and to remove redundant training steps.
In [4]:
# Import the reference taxonomy as a QIIME 2 artifact (.qza).
# NOTE(review): assumes the QIIME 2 `qiime` CLI is on PATH — confirm environment.
! qiime tools import --input-path {ref_taxa} --output-path {ref_taxa}.qza --type "FeatureData[Taxonomy]" --source-format HeaderlessTSVTaxonomyFormat
for depth in sampling_depths:
    # Each test file was written by runtime_make_test_data as <depth>.fna in tmpdir.
    tmpfile = join(tmpdir, str(depth)) + '.fna'
    # Import the subsampled sequences as a FeatureData[Sequence] artifact.
    ! qiime tools import --input-path {tmpfile} --output-path {tmpfile}.qza --type "FeatureData[Sequence]"
    # Pre-train one naive Bayes classifier per depth; training happens here,
    # up front, so it is excluded from the timed classification runs.
    ! qiime feature-classifier fit-classifier-naive-bayes --o-classifier {tmpfile}.nb.qza --i-reference-reads {tmpfile}.qza --i-reference-taxonomy {ref_taxa}.qza
Finally, we define the method/parameter combinations that we want to test, and the command templates to execute.
Template fields must adhere to following format:
{0} = output directory
{1} = input data
{2} = reference sequences
{3} = reference taxonomy
{4} = method name
{5} = other parameters
In [5]:
# Bashrc sourced inside the QIIME 1 environment before each qiime1 command runs.
qiime1_setup = join(results_dir, '.bashrc')

# Command templates. Placeholder fields (filled by runtime_make_commands):
#   {0}=output dir, {1}=input data, {2}=reference seqs,
#   {3}=reference taxonomy, {4}=method name, {5}=method-specific parameters.
qiime1_template = ('bash -c "source activate qiime1; source ' + qiime1_setup + '; '
                   'assign_taxonomy.py -i {1} -o {0} -r {2} -t {3} -m {4} {5}"')
blast_template = ('qiime feature-classifier classify-consensus-blast --i-query {1}.qza --o-classification '
                  '{0}/assign.tmp --i-reference-reads {2}.qza --i-reference-taxonomy {3}.qza {5}')
vsearch_template = ('qiime feature-classifier classify-consensus-vsearch --i-query {1}.qza '
                    '--o-classification {0}/assign.tmp --i-reference-reads {2}.qza --i-reference-taxonomy {3}.qza {5}')
naive_bayes_template = ('qiime feature-classifier classify-sklearn '
                        '--o-classification {0}/assign.tmp --i-classifier {2}.nb.qza --i-reads {1}.qza {5}')

# {method name: (command template, method-specific params string)}
methods = {
    'rdp': (qiime1_template, '--confidence 0.5 --rdp_max_memory 16000'),
    'uclust': (qiime1_template, '--min_consensus_fraction 0.51 --similarity 0.8 --uclust_max_accepts 3'),
    'sortmerna': (qiime1_template, '--sortmerna_e_value 0.001 --min_consensus_fraction 0.51 --similarity 0.8 '
                  '--sortmerna_best_N_alignments 3 --sortmerna_coverage 0.8'),
    'blast' : (qiime1_template, '-e 0.001'),
    'blast+' : (blast_template, '--p-evalue 0.001'),
    'vsearch' : (vsearch_template, '--p-perc-identity 0.90'),
    'naive-bayes': (naive_bayes_template, '--p-confidence 0.7')
}
First we will vary the size of the reference database and search a single sequence against it.
In [6]:
commands_a = runtime_make_commands(tmpdir, tmpdir, methods, ref_taxa,
sampling_depths, num_iters=1, subsample_ref=True)
Next, we will vary the number of query seqs, and keep the number of reference seqs constant.
In [7]:
# Scaling the query: subsample_ref=False varies the number of query sequences
# against the full (constant-size) reference.
commands_b = runtime_make_commands(tmpdir, tmpdir, methods, ref_taxa,
                                   sampling_depths, num_iters=1, subsample_ref=False)
Let's look at an example command from each list, and the total number of commands, as a sanity check...
In [8]:
# Sanity check: total command count, plus one sample command from each list.
n_commands = len(commands_a) + len(commands_b)
print(n_commands)
print(commands_a[1])
print(commands_b[-1])
In [9]:
Parallel(n_jobs=23)(delayed(clock_runtime)(command, runtime_results, force=False) for command in (list(set(commands_a + commands_b))));
Out[9]: