nb-extra
Mock Sweeps

This script performs parameter sweeps using the naive Bayes classifier that is used in q2-feature-classifier. The number of parameters was large, so it was useful to be able to run the commands on a larger system.

This script creates a temporary directory (results_dir) and generates two sets of shell commands, which end up in classifier_commands.sh and classify_commands.sh in that directory. The former must complete before the latter is started. The commands rely on the Python modules in https://github.com/BenKaehler/q2-extra-classifier.

Once the commands are generated in the "Classifier fitting scripts" section, they can be run anywhere by copying the whole results_dir directory. The results_dir directory should then be synchronised back to its original location. The remainder of the script then copies the results into tax-credit.
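The two command files contain one shell command per line, so any job runner can consume them. Below is a minimal sketch of running them serially with Python, assuming it is executed from inside results_dir, and remembering that classifier_commands.sh must finish before classify_commands.sh starts.

import subprocess

# Run the fitting commands first, then the classification commands.
# This is a serial sketch; on a larger system the lines would normally be
# distributed across a job scheduler instead.
for script in ('classifier_commands.sh', 'classify_commands.sh'):
    with open(script) as fh:
        for command in fh:
            command = command.strip()
            if command:
                subprocess.run(command, shell=True, check=True)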
In [1]:
import glob
from os.path import join
import os
import csv
import shutil
import json
from itertools import product
from qiime2 import Artifact
from qiime2.plugins import feature_classifier
from q2_types.feature_data import DNAIterator
from q2_feature_classifier.classifier import \
    spec_from_pipeline, pipeline_from_spec, _register_fitter
from pandas import DataFrame, Series
from tax_credit.framework_functions import \
    generate_per_method_biom_tables, move_results_to_repository
In [2]:
project_dir = join('..', '..')
analysis_name = 'mock-community'
data_dir = join(project_dir, 'data', analysis_name)
precomputed_dir = join(project_dir, 'data', 'precomputed-results', analysis_name)
ref_db_dir = join(project_dir, 'data', 'ref_dbs')
gg_db = join(ref_db_dir, 'gg_13_8_otus/99_otus.fasta')
gg_tax = join(ref_db_dir, 'gg_13_8_otus/99_otu_taxonomy.txt')
unite_db = join(ref_db_dir, 'unite_20.11.2016/sh_refs_qiime_ver7_99_20.11.2016_dev_clean.fasta')
unite_tax = join(ref_db_dir, 'unite_20.11.2016/sh_taxonomy_qiime_ver7_99_20.11.2016_dev_clean.tsv')
results_dir = join(project_dir, 'temp_results')
if not os.path.exists(results_dir):
    os.makedirs(results_dir)
mock_dirs = ['mock-' + str(m) for m in (3, 12, 18, 22, 24, '26-ITS1', '26-ITS9')]
In [3]:
ref_dest = 'ref_dbs'
refs = {m: join(ref_dest, '99_gg_seq.qza') for m in mock_dirs[:4]}
refs.update({m: join(ref_dest, '99_unite_seq.qza') for m in mock_dirs[4:]})
taxs = {m: join(ref_dest, '99_gg_tax.qza') for m in mock_dirs[:4]}
taxs.update({m: join(ref_dest, '99_unite_tax.qza') for m in mock_dirs[4:]})
if not os.path.exists(join(results_dir, ref_dest)):
    os.makedirs(join(results_dir, ref_dest))
ref = Artifact.import_data('FeatureData[Sequence]', gg_db)
ref.save(join(results_dir, refs['mock-3']))
with open(gg_tax) as fh:
    reader = csv.reader(fh, delimiter='\t')
    data = {k: v for k, v in reader}
data = Series(data, name='Taxon')
data.index.name='Feature ID'
tax = Artifact.import_data('FeatureData[Taxonomy]', data)
tax.save(join(results_dir, taxs['mock-3']))
ref = Artifact.import_data('FeatureData[Sequence]', unite_db)
ref.save(join(results_dir, refs['mock-24']))
with open(unite_tax) as fh:
    reader = csv.reader(fh, delimiter='\t')
    data = {k: v for k, v in reader}
data = Series(data, name='Taxon')
data.index.name='Feature ID'
tax = Artifact.import_data('FeatureData[Taxonomy]', data)
tax.save(join(results_dir, taxs['mock-24']));
In [4]:
def guess_read_length(seqs):
    seqs = Artifact.load(seqs)
    lengths = [len(s) for s in seqs.view(DNAIterator)]
    lengths.sort()
    return lengths[len(lengths)//2]

def load_primers(primer_file):
    with open(primer_file) as csvfile:
        data = next(csv.DictReader(csvfile, delimiter='\t'))
    return data['LinkerPrimerSequence'], data['ReversePrimer']

ref_dbs = {}
for mock in mock_dirs:
    mockdir = join(data_dir, mock)
    repseqs = join(mockdir, 'rep_seqs.qza')
    if 'gg' in refs[mock]:
        db_name = 'gg_13_8_otus_full'
    else:
        db_name = 'unite_20.11.2016_clean_full'
    ref_dbs[mock] = [(db_name, refs[mock])]
    if not os.path.exists(repseqs):
        continue
    primerfile = join(mockdir, 'sample-metadata.tsv')
    primers = list(load_primers(primerfile))
    if primers[0] == 'CCGTGCCAGCMGCCGCGGTAA':
        primers[0] = 'GTGCCAGCMGCCGCGGTAA'
    elif 'I' in primers[0]:
        primers[0] = primers[0].replace('I', 'N')
    readlength = guess_read_length(repseqs)
    print(os.path.basename(mockdir), str(readlength), *primers)

    ref = Artifact.load(join(results_dir, refs[mock]))
    db_file = '_'.join(
        [refs[mock].rsplit('.', 1)[0], str(readlength)] +
        list(primers)) + '.qza'
    if 'gg' in refs[mock]:
        db_name = 'gg_13_8_otus_read'
    else:
        db_name = 'unite_20.11.2016_clean_read'
    ref_dbs[mock].append((db_name, db_file))
    db_file = join(results_dir, db_file)
    if not os.path.exists(db_file):
        trimmed = feature_classifier.methods.extract_reads(
            sequences=ref, trunc_len=readlength,
            f_primer=primers[0], r_primer=primers[1]).reads
        trimmed.save(db_file)

    db_file = '_'.join(
        [refs[mock].rsplit('.', 1)[0]] +
        list(primers)) + '.qza'
    if 'gg' in refs[mock]:
        db_name = 'gg_13_8_otus_amplicon'
    else:
        db_name = 'unite_20.11.2016_clean_amplicon'
    ref_dbs[mock].append((db_name, db_file))
    db_file = join(results_dir, db_file)
    if not os.path.exists(db_file):
        trimmed = feature_classifier.methods.extract_reads(
            sequences=ref, f_primer=primers[0], r_primer=primers[1]).reads
        trimmed.save(db_file)
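A quick, optional sanity check (assuming the loop above found rep_seqs.qza for mock-3): the read-length-trimmed databases should report roughly the same median length as the mock reads they were trimmed to.

# Optional check: median sequence length of the trimmed reference databases.
for db_name, db_file in ref_dbs['mock-3']:
    if db_name.endswith('_read'):
        print(db_name, guess_read_length(join(results_dir, db_file)))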
In [5]:
weights_dest = 'weights'
if not os.path.exists(join(results_dir, weights_dest)):
    os.makedirs(join(results_dir, weights_dest))

priors_files = {}
for mock in mock_dirs:
    print(mock)
    mockdir = join(data_dir, mock)
    for db_name, db_file in ref_dbs[mock]:
        print(db_name)
        tax_weights = Artifact.load(join(results_dir, taxs[mock]))
        seq_ids = Artifact.load(join(results_dir, db_file))
        seq_ids = {s.metadata['id'] for s in seq_ids.view(DNAIterator)}
        tax_weights = tax_weights.view(Series)
        tax_weights = {tax_weights[sid]: 0. for sid in tax_weights.index
                       if sid in seq_ids}

        weights = Artifact.load(join(mockdir, 'feature_table.qza'))
        weights = weights.view(DataFrame)
        if len(weights.index) > 1:
            weights = {s: sum(weights.loc[s]) for s in weights.index}
            total = sum(weights.values())
            weights = {s: w/total for s, w in weights.items()}
        else:
            weights = {weights.index[0]: 1.}

        et_path = join(precomputed_dir, mock)
        if db_name.startswith('gg_13_8_otus'):
            et_path = join(et_path, 'gg_13_8_otus')
        else:
            et_path = join(et_path, 'unite_20.11.2016_clean_fullITS')
        et_path = join(et_path, 'expected', 'expected-taxonomy.tsv')
        with open(et_path) as tf:
            reader = csv.DictReader(tf, delimiter='\t')
            for row in reader:
                tax = row['Taxonomy']
                weight = sum(weights[s]*float(row[s]) for s in weights)
                try:
                    tax_weights[tax] += weight
                except KeyError:
                    species = {t for t in tax_weights if t.startswith(tax)}
                    if len(species) == 0:
                        print(tax)
                    else:
                        for s in species:
                            tax_weights[s] += weight/len(species)

        for tax in tax_weights:
            if tax_weights[tax] < 1e-9:
                tax_weights[tax] = 1e-9
        total = sum(tax_weights.values())
        weights = [tax_weights[t]/total for t in sorted(tax_weights)]

        filename = mock + '-' + db_name + '-weights.json'
        weights_file = join(weights_dest, filename)
        priors_files[mock] = priors_files.get(mock, {})
        priors_files[mock][db_name] = weights_file
        weights_file = join(results_dir, weights_file)
        with open(weights_file, 'w') as wf:
            json.dump(weights, wf)
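Each weights file is a JSON list of class-prior probabilities, one per taxonomy string in sorted order, normalised to sum to one (with a 1e-9 floor applied before normalisation). A quick check on one of the generated files:

# Inspect one priors file: it should be a list of floats that sums to ~1.
with open(join(results_dir, priors_files['mock-3']['gg_13_8_otus_full'])) as fh:
    prior = json.load(fh)
print(len(prior), sum(prior))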
In [6]:
nb_extra_sweep = \
    {'feat-ext--n-features': [1024, 8192, 65536],
     'feat-ext--ngram-range': [[4, 4], [8, 8], [16, 16], [4, 16]],
     'norm--norm': ['l1', 'l2', None],
     'norm--use-idf': [True, False],
     'classify--alpha': [0.001, 0.01, 0.1],
     'classify--class-prior': ['uniform', 'prior']}
classifier_params = {'nb-extra': nb_extra_sweep}
confidences = [0., 0.2, 0.4, 0.6, 0.8]
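For scale: the grid above yields 3 × 4 × 3 × 2 × 3 × 2 = 432 classifier configurations per reference database, and each configuration is classified at 5 confidence values, i.e. 2160 classification runs per mock community and database.

# Count the sweep: fitted classifiers and classification runs
# per (mock community, reference database) pair.
n_classifiers = 1
for values in nb_extra_sweep.values():
    n_classifiers *= len(values)
print(n_classifiers, n_classifiers * len(confidences))  # 432 2160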
In [7]:
def get_classifier_command(method, inputs, params, priors):
    cmd = ['qiime feature-classifier fit-classifier-' + method]
    cls = [method]
    for param in sorted(inputs):
        value = inputs[param]
        cmd.extend(['--i-' + param, value])
        cls.append(os.path.basename(value).split('.')[0])
    for param in sorted(params):
        value = params[param]
        if value == 'prior':
            cls.append(os.path.basename(priors).split('.')[0])
        else:
            cls.append(str(value).replace(' ', ''))
        if type(value) is bool:
            cmd.append('--p-' + ('' if value else 'no-') + param)
            continue
        if 'class-prior' not in param:
            value = json.dumps(value)
            if value[0] != '"' or value[-1] != '"':
                value = '"' + value + '"'
            cmd.extend(['--p-' + param, value])
            continue
        if value == 'uniform':
            continue
        cmd.extend(['--p-' + param, '"`cat ' + priors + '`"'])
    cls = ':'.join(cls) + '.qza'
    cls = os.path.sep + join('state', 'partition1', 'tmp', 'classifiers', cls)
    cmd.extend(['--o-classifier', '"' + cls + '"'])
    cmd = ' '.join(cmd)
    return cls, cmd

def get_classify_command(classifier, reads, params,
                         confidence, directory, results_dir):
    cmd = ['qiime feature-classifier classify-sklearn']
    cmd.extend(['--i-classifier', '"' + classifier + '"'])
    cmd.extend(['--i-reads', reads])
    cmd.extend(['--p-confidence', str(confidence)])
    parameters = [str(params[p]).replace(' ', '') for p in sorted(params)]
    parameters.append(str(confidence))
    output_directory = join(directory, ':'.join(parameters))
    if not os.path.exists(join(results_dir, output_directory)):
        os.makedirs(join(results_dir, output_directory))
    output = join(output_directory, 'rep_seqs_tax_assignments.qza')
    cmd.extend(['--o-classification', '"' + output + '"'])
    return output, ' '.join(cmd)

def get_combinations(params):
    params, values = zip(*params.items())
    for combination in product(*values):
        yield dict(zip(params, combination))
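As a small illustration, get_combinations simply expands a parameter grid into one dictionary per setting (the toy grid below is for illustration only; key order within each dictionary may vary by Python version):

# Toy example of expanding a two-parameter grid into individual settings.
toy_grid = {'classify--alpha': [0.001, 0.01], 'norm--use-idf': [True]}
for combo in get_combinations(toy_grid):
    print(combo)
# e.g. {'classify--alpha': 0.001, 'norm--use-idf': True}
#      {'classify--alpha': 0.01, 'norm--use-idf': True}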
In [8]:
if not os.path.exists(join(results_dir, 'classifiers')):
    os.makedirs(join(results_dir, 'classifiers'))

classifier_commands = set()
classify_commands = []
classifiers = set()
classifications = []
for mock in mock_dirs:
    reads = join('..', 'data', 'mock-community', mock, 'rep_seqs.qza')
    mock_directory = join('classifications', mock)
    inputs = {'reference-taxonomy': taxs[mock]}
    for db_name, db_file in ref_dbs[mock]:
        db_directory = join(mock_directory, db_name)
        inputs['reference-reads'] = db_file
        for method in classifier_params:
            method_directory = join(db_directory, method)
            for params in get_combinations(classifier_params[method]):
                priors = priors_files[mock][db_name]
                classifier, command = get_classifier_command(method, inputs, params, priors)
                classifier_commands.add(command)
                classifiers.add(classifier)
                for confidence in confidences:
                    classification, command = get_classify_command(
                        classifier, reads, params, confidence,
                        method_directory, results_dir)
                    classifications.append(classification)
                    classify_commands.append(command)

# write out the commands
with open(join(results_dir, 'classifier_commands.sh'), 'w') as cmds:
    for cmd in classifier_commands:
        cmds.write(cmd + '\n')
with open(join(results_dir, 'classify_commands.sh'), 'w') as cmds:
    for cmd in classify_commands:
        cmds.write(cmd + '\n')
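Note that classifier_commands is a set, so identical fitting commands (for example, the same full-length reference with a uniform class prior requested by more than one mock community) are written only once, while every classification run gets its own line. The relative sizes can be checked directly:

# Compare the number of unique fitting commands to classification commands.
print(len(classifier_commands), 'classifier fitting commands')
print(len(classify_commands), 'classification commands')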
In [9]:
bad_classifications = []
for classification in classifications:
    full_classification = join(results_dir, classification)
    output_dir = os.path.dirname(full_classification)
    taxonomy_map_fp = join(output_dir, 'taxonomy.tsv')
    if not os.path.exists(taxonomy_map_fp):
        try:
            Artifact.load(full_classification).export_data(output_dir)
        except ValueError:
            bad_classifications.append(classification)
In [18]:
print(len(bad_classifications), "bad classifications")
bc_combinations = None
for bc in bad_classifications:
    if '[4,16]' not in bc:
        print(bc)
        continue
    sbc = []
    for tbc in bc.split(os.path.sep):
        sbc.extend(tbc.split(':'))
    if bc_combinations is None:
        bc_combinations = [{tbc} for tbc in sbc]
    else:
        for tbc, bcc in zip(sbc, bc_combinations):
            bcc.add(tbc)
print(bc_combinations)
In [19]:
taxonomy_glob = join(results_dir, 'classifications', 'mock-*', '*', 'nb-extra', '*', 'taxonomy.tsv')
generate_per_method_biom_tables(taxonomy_glob, data_dir)
In [20]:
precomputed_results_dir = join(project_dir, "data", "precomputed-results", analysis_name)
method_dirs = glob.glob(join(results_dir, 'classifications', 'mock-*', '*', 'nb-extra', '*'))
move_results_to_repository(method_dirs, precomputed_results_dir)