In [1]:
import glob
from os.path import join
import os
import csv
import shutil
import json
from itertools import product
from qiime2 import Artifact
from qiime2.plugins import feature_classifier
from q2_types.feature_data import DNAIterator
from q2_feature_classifier.classifier import \
    spec_from_pipeline, pipeline_from_spec, _register_fitter
from pandas import DataFrame, Series
from tax_credit.framework_functions import \
    generate_per_method_biom_tables, move_results_to_repository
In [2]:
project_dir = join('..', '..')
analysis_name = 'mock-community'
data_dir = join(project_dir, 'data', analysis_name)
precomputed_dir = join(project_dir, 'data', 'precomputed-results', analysis_name)
ref_db_dir = join(project_dir, 'data', 'ref_dbs')
gg_db = join(ref_db_dir, 'gg_13_8_otus/99_otus.fasta')
gg_tax = join(ref_db_dir, 'gg_13_8_otus/99_otu_taxonomy.txt')
unite_db = join(ref_db_dir, 'unite_20.11.2016/sh_refs_qiime_ver7_99_20.11.2016_dev_clean.fasta')
unite_tax = join(ref_db_dir, 'unite_20.11.2016/sh_taxonomy_qiime_ver7_99_20.11.2016_dev_clean.tsv')
results_dir = join(project_dir, 'temp_results_narrow')
if not os.path.exists(results_dir):
    os.makedirs(results_dir)
mock_dirs = ['mock-' + str(m) for m in
             list(range(1, 11)) + list(range(12, 17)) +
             list(range(18, 26)) + ['26-ITS1', '26-ITS9']]
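As a quick optional check (not part of the original run), it is worth confirming that every expected mock community directory is present before importing anything:
In [ ]:
# optional sanity check: every expected mock community directory
# should exist under data_dir
missing = [m for m in mock_dirs if not os.path.exists(join(data_dir, m))]
print('missing mock directories:', missing or 'none')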
In [3]:
ref_dest = 'ref_dbs'
ref_16S = join(ref_dest, '99_gg_seq.qza')
ref_ITS = join(ref_dest, '99_unite_seq.qza')
tax_16S = join(ref_dest, '99_gg_tax.qza')
tax_ITS = join(ref_dest, '99_unite_tax.qza')
if not os.path.exists(join(results_dir, ref_dest)):
    os.makedirs(join(results_dir, ref_dest))
ref = Artifact.import_data('FeatureData[Sequence]', gg_db)
ref.save(join(results_dir, ref_16S))
tax = Artifact.import_data('FeatureData[Taxonomy]', gg_tax,
                           view_type='HeaderlessTSVTaxonomyFormat')
tax.save(join(results_dir, tax_16S))
ref = Artifact.import_data('FeatureData[Sequence]', unite_db)
ref.save(join(results_dir, ref_ITS))
tax = Artifact.import_data('FeatureData[Taxonomy]', unite_tax,
                           view_type='HeaderlessTSVTaxonomyFormat')
tax.save(join(results_dir, tax_ITS));
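If you want to verify that the imports round-tripped correctly, the saved artifacts can be reloaded and their semantic types inspected (an optional check, not part of the original run):
In [ ]:
# optional: reload the saved reference artifacts and confirm their types
print(Artifact.load(join(results_dir, ref_16S)).type)  # FeatureData[Sequence]
print(Artifact.load(join(results_dir, tax_16S)).type)  # FeatureData[Taxonomy]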
In [4]:
ref_dbs = {}
taxs = {}
for mock in mock_dirs:
    mockdir = join(data_dir, mock)
    primer_file = join(mockdir, 'sample-metadata.tsv')
    with open(primer_file) as csvfile:
        data = next(csv.DictReader(csvfile, delimiter='\t'))
    primers = [data['LinkerPrimerSequence'], data['ReversePrimer']]
    sample_type = 'ITS' if 'ITS' in data['PrimerName'] else '16S'
    assert sample_type == 'ITS' or '515f' in data['PrimerName']
    if sample_type == '16S':
        ref_dbs[mock] = [('gg_13_8_otus_full', ref_16S)]
        taxs[mock] = tax_16S
        ref = ref_16S
        db_name = 'gg_13_8_otus_amplicon'
    else:
        ref_dbs[mock] = [('unite_20.11.2016_clean_full', ref_ITS)]
        taxs[mock] = tax_ITS
        ref = ref_ITS
        db_name = 'unite_20.11.2016_clean_amplicon'
    # clean up the forward primer: drop leading linker bases where present
    # and replace inosine (I) with the IUPAC wildcard N
    if primers[0] == 'CCGTGCCAGCMGCCGCGGTAA':
        primers[0] = 'GTGCCAGCMGCCGCGGTAA'
    elif primers[0] == 'ATCTTGGTCATTTAGAGGAAGTAA':
        primers[0] = 'CTTGGTCATTTAGAGGAAGTAA'
    elif 'I' in primers[0]:
        primers[0] = primers[0].replace('I', 'N')
    db_file = '_'.join(
        [ref.rsplit('.', 1)[0]] +
        list(primers)) + '.qza'
    ref_dbs[mock].append((db_name, db_file))
    db_file = join(results_dir, db_file)
    if not os.path.exists(db_file):
        seqs = Artifact.load(join(results_dir, ref))
        trimmed = feature_classifier.methods.extract_reads(
            sequences=seqs, f_primer=primers[0], r_primer=primers[1]).reads
        trimmed.save(db_file)
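The trimmed database filename encodes the reference database plus the forward and reverse primers. For illustration (the primer pair below is hypothetical, shown only to make the convention concrete):
In [ ]:
# illustration of the db_file naming convention used above,
# with a hypothetical 515f/806r primer pair
example = '_'.join([ref_16S.rsplit('.', 1)[0],
                    'GTGCCAGCMGCCGCGGTAA', 'GGACTACHVGGGTWTCTAAT']) + '.qza'
print(example)  # ref_dbs/99_gg_seq_GTGCCAGCMGCCGCGGTAA_GGACTACHVGGGTWTCTAAT.qza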
In [8]:
weights_dest = 'weights'
if not os.path.exists(join(results_dir, weights_dest)):
    os.makedirs(join(results_dir, weights_dest))
priors_files = {}
for mock in mock_dirs:
    print(mock)
    mockdir = join(data_dir, mock)
    for db_name, db_file in ref_dbs[mock]:
        print(db_name)
        # zero-initialise a weight for every taxonomy that survives
        # in the (trimmed) reference database
        tax_weights = Artifact.load(join(results_dir, taxs[mock]))
        seq_ids = Artifact.load(join(results_dir, db_file))
        seq_ids = {s.metadata['id'] for s in seq_ids.view(DNAIterator)}
        tax_weights = tax_weights.view(Series)
        tax_weights = {tax_weights[sid]: 0. for sid in tax_weights.index
                       if sid in seq_ids}
        # weight each mock sample by its share of the total counts
        # in the feature table
        weights = Artifact.load(join(mockdir, 'feature_table.qza'))
        weights = weights.view(DataFrame)
        if len(weights.index) > 1:
            weights = {s: sum(weights.loc[s]) for s in weights.index}
            total = sum(weights.values())
            weights = {s: w/total for s, w in weights.items()}
        else:
            weights = {weights.index[0]: 1.}
        et_path = join(precomputed_dir, mock)
        if db_name.startswith('gg_13_8_otus'):
            et_path = join(et_path, 'gg_13_8_otus')
        else:
            et_path = join(et_path, 'unite_20.11.2016_clean_fullITS')
        et_path = join(et_path, 'expected', 'expected-taxonomy.tsv')
        # accumulate the expected abundance of each taxonomy; when an
        # expected taxonomy is absent from the reference, spread its
        # weight evenly over the reference taxonomies that extend it
        with open(et_path) as tf:
            reader = csv.DictReader(tf, delimiter='\t')
            for row in reader:
                tax = row['Taxonomy']
                weight = sum(weights[s]*float(row[s]) for s in weights)
                try:
                    tax_weights[tax] += weight
                except KeyError:
                    species = {t for t in tax_weights if t.startswith(tax)}
                    if len(species) == 0:
                        print(tax)
                    else:
                        for s in species:
                            tax_weights[s] += weight/len(species)
        # floor the weights so no class prior is exactly zero, then
        # renormalise and write them out in sorted-taxonomy order
        for tax in tax_weights:
            if tax_weights[tax] < 1e-9:
                tax_weights[tax] = 1e-9
        total = sum(tax_weights.values())
        weights = [tax_weights[t]/total for t in sorted(tax_weights)]
        filename = mock + '-' + db_name + '-weights.json'
        weights_file = join(weights_dest, filename)
        priors_files[mock] = priors_files.get(mock, {})
        priors_files[mock][db_name] = weights_file
        weights_file = join(results_dir, weights_file)
        with open(weights_file, 'w') as wf:
            json.dump(weights, wf)
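Because every taxon's weight is floored at 1e-9 and the vector is then renormalized, each JSON file should hold a list of class priors summing to approximately one. A quick optional verification:
In [ ]:
# optional check: each priors file should sum to ~1
for mock in priors_files:
    for db_name, wf in priors_files[mock].items():
        with open(join(results_dir, wf)) as fh:
            w = json.load(fh)
        assert abs(sum(w) - 1.) < 1e-6, (mock, db_name)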
In [16]:
nb_sweep = \
    {'feat-ext--analyzer': ['char'],
     'feat-ext--n-features': [8192],
     'feat-ext--ngram-range':
         [[4, 4], [6, 6], [8, 8], [16, 16], [32, 32],
          [7, 7], [9, 9], [10, 10], [11, 11],
          [12, 12], [14, 14], [18, 18]],
     'classify--alpha': [0.001]}
nb_bespoke_sweep = \
    {'feat-ext--analyzer': ['char'],
     'feat-ext--n-features': [8192],
     'feat-ext--ngram-range': [[4, 4], [6, 6], [8, 8], [16, 16], [32, 32],
                               [7, 7], [9, 9], [10, 10], [11, 11],
                               [12, 12], [14, 14], [18, 18]],
     'classify--alpha': [0.001],
     'classify--class-prior': ['prior']}
classifier_params = {'naive-bayes': nb_sweep,
                     'naive-bayes-bespoke': nb_bespoke_sweep}
confidences = [0.0, 0.5, 0.7, 0.9, 0.92, 0.94,
               0.96, 0.98, 1.0]
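Each sweep expands to one combination per ngram range (the other parameters are singletons), i.e. twelve classifiers per method, reference database, and mock community; crossed with the nine confidence values, that is 108 classifications each. A quick count to confirm:
In [ ]:
# 12 ngram ranges x 1 analyzer x 1 n-features x 1 alpha = 12 combinations
print(len(list(product(*nb_sweep.values()))))  # 12
print(12 * len(confidences))                   # 108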
In [17]:
def get_classifier_command(method, inputs, params, priors):
    cmd = ['qiime feature-classifier fit-classifier-naive-bayes']
    cls = [method]
    for param in sorted(inputs):
        value = inputs[param]
        cmd.extend(['--i-' + param, value])
        cls.append(os.path.basename(value).split('.')[0])
    for param in sorted(params):
        value = params[param]
        # record the parameter value in the classifier's file name
        if value == 'prior':
            cls.append(os.path.basename(priors).split('.')[0])
        else:
            cls.append(str(value).replace(' ', ''))
        if type(value) is bool:
            cmd.append('--p-' + ('' if value else 'no-') + param)
            continue
        if 'class-prior' not in param:
            value = json.dumps(value)
            if value[0] != '"' or value[-1] != '"':
                value = '"' + value + '"'
            cmd.extend(['--p-' + param, value])
            continue
        if value == 'uniform':
            continue
        # inline the JSON priors via shell command substitution
        cmd.extend(['--p-' + param, '"`cat ' + priors + '`"'])
    cls = ':'.join(cls) + '.qza'
    cls = os.path.sep + join('state', 'partition1', 'tmp',
                             'classifiers_narrow', cls)
    cmd.extend(['--o-classifier', '"' + cls + '"'])
    cmd = ' '.join(cmd)
    return cls, cmd

def get_classify_command(classifier, reads, params,
                         confidence, directory, results_dir):
    cmd = ['qiime feature-classifier classify-sklearn']
    cmd.extend(['--i-classifier', '"' + classifier + '"'])
    cmd.extend(['--i-reads', reads])
    cmd.extend(['--p-confidence', str(confidence)])
    parameters = [str(params[p]).replace(' ', '') for p in sorted(params)]
    parameters.append(str(confidence))
    output_directory = join(directory, ':'.join(parameters))
    if not os.path.exists(join(results_dir, output_directory)):
        os.makedirs(join(results_dir, output_directory))
    output = join(output_directory, 'rep_seqs_tax_assignments.qza')
    cmd.extend(['--o-classification', '"' + output + '"'])
    return output, ' '.join(cmd)

def get_combinations(params):
    params, values = zip(*params.items())
    for combination in product(*values):
        yield dict(zip(params, combination))
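For reference, get_combinations just takes the Cartesian product over the value lists and yields one flat parameter dict per combination, e.g. on a toy grid:
In [ ]:
# minimal example of get_combinations on a toy grid
list(get_combinations({'classify--alpha': [0.001],
                       'feat-ext--ngram-range': [[4, 4], [6, 6]]}))
# -> [{'classify--alpha': 0.001, 'feat-ext--ngram-range': [4, 4]},
#     {'classify--alpha': 0.001, 'feat-ext--ngram-range': [6, 6]}]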
In [19]:
if not os.path.exists(join(results_dir, 'classifiers')):
    os.makedirs(join(results_dir, 'classifiers'))
classifier_commands = set()
classify_commands = []
classifiers = set()
classifications = []
for mock in mock_dirs:
    reads = join('..', 'data', 'mock-community', mock, 'rep_seqs.qza')
    mock_directory = join('classifications', mock)
    inputs = {'reference-taxonomy': taxs[mock]}
    for db_name, db_file in ref_dbs[mock]:
        db_directory = join(mock_directory, db_name)
        inputs['reference-reads'] = db_file
        for method in classifier_params:
            method_directory = join(db_directory, method)
            for params in get_combinations(classifier_params[method]):
                priors = priors_files[mock][db_name]
                classifier, command = get_classifier_command(
                    method, inputs, params, priors)
                classifier_commands.add(command)
                classifiers.add(classifier)
                for confidence in confidences:
                    classification, command = get_classify_command(
                        classifier, reads, params, confidence,
                        method_directory, results_dir)
                    classifications.append(classification)
                    classify_commands.append(command)

# write out the commands
with open(join(results_dir, 'classifier_commands.sh'), 'w') as cmds:
    for cmd in classifier_commands:
        cmds.write(cmd + '\n')
with open(join(results_dir, 'classify_commands.sh'), 'w') as cmds:
    for cmd in classify_commands:
        cmds.write(cmd + '\n')
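Before launching the batch it can be worth eyeballing one command of each type (the exact strings depend on the sweep above):
In [ ]:
# peek at one fit command and one classify command
print(sorted(classifier_commands)[0])
print(classify_commands[0])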
In [20]:
bad_classifications = []
for classification in classifications:
    full_classification = join(results_dir, classification)
    output_dir = os.path.dirname(full_classification)
    taxonomy_map_fp = join(output_dir, 'taxonomy.tsv')
    if not os.path.exists(taxonomy_map_fp):
        try:
            # export taxonomy.tsv alongside each classification artifact;
            # artifacts that fail to load are collected for inspection
            Artifact.load(full_classification).export_data(output_dir)
        except ValueError:
            bad_classifications.append(classification)
In [21]:
print(len(bad_classifications), "bad classifications")
bc_combinations = None
for bc in bad_classifications:
    if '[4,16]' not in bc:
        print(bc)
        continue
    # split the output path into its parameter components and collect
    # the values that co-occur among the failed runs
    sbc = []
    for tbc in bc.split(os.path.sep):
        sbc.extend(tbc.split(':'))
    if bc_combinations is None:
        bc_combinations = [{tbc} for tbc in sbc]
    else:
        for tbc, bcc in zip(sbc, bc_combinations):
            bcc.add(tbc)
print(bc_combinations)
In [22]:
taxonomy_glob = join(results_dir, 'classifications', 'mock-*', '*', 'naive-bayes*', '*', 'taxonomy.tsv')
generate_per_method_biom_tables(taxonomy_glob, data_dir)
In [23]:
precomputed_results_dir = join(project_dir, "data", "precomputed-results", analysis_name)
method_dirs = glob.glob(join(results_dir, 'classifications', 'mock-*', '*', 'naive-bayes*', '*'))
move_results_to_repository(method_dirs, precomputed_results_dir)