In [1]:
from os.path import join, expandvars
from joblib import Parallel, delayed
from tax_credit.framework_functions import (runtime_make_test_data,
runtime_make_commands,
clock_runtime,
)
In [2]:
## project_dir should be the directory where you've downloaded (or cloned) the
## short-read-tax-assignment repository.
project_dir = expandvars("$HOME/Desktop/projects/short-read-tax-assignment")
data_dir = join(project_dir, "data")
results_dir = expandvars("$HOME/Desktop/projects/tax-credit-runtime")
runtime_results = join(results_dir, 'runtime_results.txt')
tmpdir = join(results_dir, 'tmp')
ref_db_dir = expandvars("$HOME/Desktop/projects/short-read-tax-assignment/data/ref_dbs/")
ref_seqs = join(ref_db_dir, 'gg_13_8_otus/99_otus_clean_515f-806r_trim250.fasta')
ref_taxa = join(ref_db_dir, 'gg_13_8_otus/99_otu_taxonomy_clean.tsv')
num_iters = 1
sampling_depths = [1, 10, 100, 1000, 10000]
In [44]:
runtime_make_test_data(ref_seqs, tmpdir, sampling_depths)
Finally we define the method, parameter combintations that we want to test and command templates to execute.
Template fields must adhere to following format:
{0} = output directory
{1} = input data
{2} = reference sequences
{3} = reference taxonomy
{4} = method name
{5} = other parameters
In [48]:
qiime1_template = 'source activate qiime1; source ~/.bashrc; assign_taxonomy.py -i {1} -o {0} -r {2} -t {3} -m {4} {5}'
blast_template = 'source activate qiime2-2017.2; qiime tools import --input-path {1} --output-path {1}.qza --type "FeatureData[Sequence]"; qiime tools import --input-path {2} --output-path {2}.qza --type "FeatureData[Sequence]"; qiime tools import --input-path {3} --output-path {3}.qza --type "FeatureData[Taxonomy]"; qiime feature-classifier blast --i-query {1}.qza --o-classification {0}/assign.tmp --i-reference-reads {2}.qza --i-reference-taxonomy {3}.qza {5}'
vsearch_template = 'source activate qiime2-2017.2; qiime tools import --input-path {1} --output-path {1}.qza --type "FeatureData[Sequence]"; qiime tools import --input-path {2} --output-path {2}.qza --type "FeatureData[Sequence]"; qiime tools import --input-path {3} --output-path {3}.qza --type "FeatureData[Taxonomy]"; qiime feature-classifier vsearch --i-query {1}.qza --o-classification {0}/assign.tmp --i-reference-reads {2}.qza --i-reference-taxonomy {3}.qza {5}'
# {method: template, method-specific params}
methods = {
'rdp': (qiime1_template, '--confidence 0.5 --rdp_max_memory 16000'),
'uclust': (qiime1_template, '--min_consensus_fraction 0.51 --similarity 0.8 --uclust_max_accepts 3'),
'sortmerna': (qiime1_template, '--sortmerna_e_value 0.001 --min_consensus_fraction 0.51 --similarity 0.8 '
'--sortmerna_best_N_alignments 3 --sortmerna_coverage 0.8'),
'blast' : (qiime1_template, '-e 0.001'),
'blast+' : (blast_template, '--p-evalue 0.001'),
'vsearch' : (vsearch_template, '--p-min-id 0.90')
}
First we will vary the size of the reference database and search a single sequence against it.
In [49]:
commands_a = runtime_make_commands(tmpdir, tmpdir, methods, ref_taxa,
sampling_depths, num_iters=1, subsample_ref=True)
Next, we will vary the number of query seqs, and keep the number of ref seqs constant
In [50]:
commands_b = runtime_make_commands(tmpdir, tmpdir, methods, ref_taxa,
sampling_depths, num_iters=1, subsample_ref=False)
Let's look at the first command in each list and the total number of commands as a sanity check...
In [51]:
print(len(commands_a + commands_b))
print(commands_a[0])
print(commands_b[4])
In [54]:
Parallel(n_jobs=4)(delayed(clock_runtime)(command, runtime_results, force=False) for command in (list(set(commands_a + commands_b))))
Out[54]:
In [ ]: