taxonomy-assignment-nb-extra-commands


Generate Commands for nb-extra Mock Sweeps

This script is for computing performing parameter sweeps using the naive Bayes classifier that is used in q2-feature-classifier. The number of parameters was large so it was useful to be able to run the commands on a larger system.

This script creates a temporary directory (results_dir) and generates two sets of shell commands. The commands end up in classifier_commands.sh and classify_commands.sh in that directory. The former must complete before the latter is started. The commands rely on the Python modules in https://github.com/BenKaehler/q2-extra-classifier.

Once the commands are generated in the "Classifier fitting scripts" section, they can be run anywhere by copying the whole results_dir directory. The results_dir directory should then by syncronised back to its original location. The remainder of the script the copies the results into tax-credit.


In [1]:
import glob
from os.path import join
import os
import csv
import shutil
import json
from itertools import product

from qiime2 import Artifact
from qiime2.plugins import feature_classifier
from q2_types.feature_data import DNAIterator
from q2_feature_classifier.classifier import \
    spec_from_pipeline, pipeline_from_spec, _register_fitter
from pandas import DataFrame, Series

from tax_credit.framework_functions import \
    generate_per_method_biom_tables, move_results_to_repository

File Paths and Communities


In [2]:
project_dir = join('..', '..')
analysis_name = 'mock-community'
data_dir = join(project_dir, 'data', analysis_name)
precomputed_dir = join(project_dir, 'data', 'precomputed-results', analysis_name)

ref_db_dir = join(project_dir, 'data', 'ref_dbs')

gg_db = join(ref_db_dir, 'gg_13_8_otus/99_otus.fasta')
gg_tax = join(ref_db_dir, 'gg_13_8_otus/99_otu_taxonomy.txt')
unite_db = join(ref_db_dir, 'unite_20.11.2016/sh_refs_qiime_ver7_99_20.11.2016_dev_clean.fasta')
unite_tax = join(ref_db_dir, 'unite_20.11.2016/sh_taxonomy_qiime_ver7_99_20.11.2016_dev_clean.tsv')

results_dir = join(project_dir, 'temp_results')
if not os.path.exists(results_dir):
    os.makedirs(results_dir)

mock_dirs = ['mock-' + str(m) for m in (3, 12, 18, 22, 24, '26-ITS1', '26-ITS9')]

Import Reference Databases


In [3]:
ref_dest = 'ref_dbs'
refs = {m: join(ref_dest, '99_gg_seq.qza') for m in mock_dirs[:4]}
refs.update({m: join(ref_dest, '99_unite_seq.qza') for m in mock_dirs[4:]})
taxs = {m: join(ref_dest, '99_gg_tax.qza') for m in mock_dirs[:4]}
taxs.update({m: join(ref_dest, '99_unite_tax.qza') for m in mock_dirs[4:]})

if not os.path.exists(join(results_dir, ref_dest)):
    os.makedirs(join(results_dir, ref_dest))
ref = Artifact.import_data('FeatureData[Sequence]', gg_db)
ref.save(join(results_dir, refs['mock-3']))
with open(gg_tax) as fh:
    reader = csv.reader(fh, delimiter='\t')
    data = {k:v for k, v in reader}
data = Series(data, name='Taxon')
data.index.name='Feature ID'
tax = Artifact.import_data('FeatureData[Taxonomy]', data)
tax.save(join(results_dir, taxs['mock-3']))
ref = Artifact.import_data('FeatureData[Sequence]', unite_db)
ref.save(join(results_dir, refs['mock-24']))
with open(unite_tax) as fh:
    reader = csv.reader(fh, delimiter='\t')
    data = {k:v for k, v in reader}
data = Series(data, name='Taxon')
data.index.name='Feature ID'
tax = Artifact.import_data('FeatureData[Taxonomy]', data)
tax.save(join(results_dir, taxs['mock-24']));

Amplicon and Read Extraction


In [4]:
def guess_read_length(seqs):
    seqs = Artifact.load(seqs)
    lengths = [len(s) for s in seqs.view(DNAIterator)]
    lengths.sort()
    return lengths[len(lengths)//2]

def load_primers(primer_file):
    with open(primer_file) as csvfile:
        data = next(csv.DictReader(csvfile, delimiter='\t'))
        return data['LinkerPrimerSequence'], data['ReversePrimer']

ref_dbs = {}
for mock in mock_dirs:
    mockdir = join(data_dir, mock)
    repseqs = join(mockdir, 'rep_seqs.qza')
    
    if 'gg' in refs[mock]:
        db_name = 'gg_13_8_otus_full'
    else:
        db_name = 'unite_20.11.2016_clean_full'
    ref_dbs[mock] = [(db_name, refs[mock])]
    
    if not os.path.exists(repseqs):
        continue
    primerfile = join(mockdir, 'sample-metadata.tsv')
    primers = list(load_primers(primerfile))
    if primers[0] == 'CCGTGCCAGCMGCCGCGGTAA':
        primers[0] = 'GTGCCAGCMGCCGCGGTAA'
    elif 'I' in primers[0]:
        primers[0] = primers[0].replace('I', 'N')
    readlength = guess_read_length(repseqs)
    print(os.path.basename(mockdir), str(readlength), *primers)
    
    ref = Artifact.load(join(results_dir, refs[mock]))
    db_file = '_'.join(
        [refs[mock].rsplit('.',1)[0], str(readlength)] + 
        list(primers)) + '.qza'
    if 'gg' in refs[mock]:
        db_name = 'gg_13_8_otus_read'
    else:
        db_name = 'unite_20.11.2016_clean_read'
    ref_dbs[mock].append((db_name, db_file))
    db_file = join(results_dir, db_file)
    if not os.path.exists(db_file):
        trimmed = feature_classifier.methods.extract_reads(
                    sequences=ref, trunc_len=readlength,
                    f_primer=primers[0], r_primer=primers[1]).reads
        trimmed.save(db_file)
    
    db_file = '_'.join(
        [refs[mock].rsplit('.',1)[0]] + 
        list(primers)) + '.qza'
    if 'gg' in refs[mock]:
        db_name = 'gg_13_8_otus_amplicon'
    else:
        db_name = 'unite_20.11.2016_clean_amplicon'
    ref_dbs[mock].append((db_name, db_file))
    db_file = join(results_dir, db_file)
    if not os.path.exists(db_file):
        trimmed = feature_classifier.methods.extract_reads(
                    sequences=ref, f_primer=primers[0], r_primer=primers[1]).reads
        trimmed.save(db_file)


mock-3 150 GTGCCAGCMGCCGCGGTAA GGACTACHVGGGTWTCTAAT
mock-12 230 GTGCCAGCMGCCGCGGTAA GGACTACHVGGGTWTCTAAT
mock-18 212 GTGCCAGCMGCCGCGGTAA GGACTACHVGGGTWTCTAAT
mock-22 231 GTGCCAGCMGCCGCGGTAA GGACTACHVGGGTWTCTAAT
mock-24 150 AACTTTYRRCAAYGGATCWCT AGCCTCCGCTTATTGATATGCTTAART
mock-26-ITS1 290 CTTGGTCATTTAGAGGAAGTAA TCCTCCGCTTATTGATATGC
mock-26-ITS9 290 GAACGCAGCRAANNGYGA TCCTCCGCTTATTGATATGC

Find Class Weights

map taxonomies to taxonomy labels using tax-credit/data/mock-community/mock-3/expected-taxonomy.tsv, dickhead


In [5]:
weights_dest = 'weights'
if not os.path.exists(join(results_dir, weights_dest)):
    os.makedirs(join(results_dir, weights_dest))

priors_files = {}
for mock in mock_dirs:
    print(mock)
    mockdir = join(data_dir, mock)
    for db_name, db_file in ref_dbs[mock]:
        print(db_name)
        tax_weights = Artifact.load(join(results_dir, taxs[mock]))
        seq_ids = Artifact.load(join(results_dir, db_file))
        seq_ids = {s.metadata['id'] for s in seq_ids.view(DNAIterator)}
        tax_weights = tax_weights.view(Series)
        tax_weights = {tax_weights[sid]:0. for sid in tax_weights.index
                       if sid in seq_ids}

        weights = Artifact.load(join(mockdir, 'feature_table.qza'))
        weights = weights.view(DataFrame)
        if len(weights.index) > 1:
            weights = {s:sum(weights.loc[s]) for s in weights.index}
            total = sum(weights.values())
            weights = {s:w/total for s, w in weights.items()}
        else:
            weights = {weights.index[0]: 1.}

        et_path = join(precomputed_dir, mock)
        if db_name.startswith('gg_13_8_otus'):
            et_path = join(et_path, 'gg_13_8_otus')
        else:
            et_path = join(et_path, 'unite_20.11.2016_clean_fullITS')
        et_path = join(et_path, 'expected', 'expected-taxonomy.tsv')
        with open(et_path) as tf:
            reader = csv.DictReader(tf, delimiter='\t')
            for row in reader:
                tax = row['Taxonomy']
                weight = sum(weights[s]*float(row[s]) for s in weights)
                try:
                    tax_weights[tax] += weight
                except KeyError:
                    species = {t for t in tax_weights if t.startswith(tax)}
                    if len(species) == 0:
                        print(tax)
                    else:
                        for s in species:
                            tax_weights[s] += weight/len(species)

        for tax in tax_weights:
            if tax_weights[tax] < 1e-9:
                tax_weights[tax] = 1e-9
        total = sum(tax_weights.values())

        weights = [tax_weights[t]/total for t in sorted(tax_weights)]
        filename = mock + '-' + db_name + '-weights.json'
        weights_file = join(weights_dest, filename)
        priors_files[mock] = priors_files.get(mock, {})
        priors_files[mock][db_name] = weights_file
        weights_file = join(results_dir, weights_file)
        with open(weights_file, 'w') as wf:
            json.dump(weights, wf)


mock-3
gg_13_8_otus_full
gg_13_8_otus_read
gg_13_8_otus_amplicon
mock-12
gg_13_8_otus_full
gg_13_8_otus_read
gg_13_8_otus_amplicon
mock-18
gg_13_8_otus_full
gg_13_8_otus_read
gg_13_8_otus_amplicon
mock-22
gg_13_8_otus_full
gg_13_8_otus_read
gg_13_8_otus_amplicon
mock-24
unite_20.11.2016_clean_full
unite_20.11.2016_clean_read
k__Fungi;p__Chytridiomycota;c__Chytridiomycetes;o__Spizellomycetales;f__Spizellomycetaceae;g__Spizellomyces;s__Spizellomyces_punctatus
unite_20.11.2016_clean_amplicon
k__Fungi;p__Chytridiomycota;c__Chytridiomycetes;o__Spizellomycetales;f__Spizellomycetaceae;g__Spizellomyces;s__Spizellomyces_punctatus
mock-26-ITS1
unite_20.11.2016_clean_full
unite_20.11.2016_clean_read
k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Cantharellales;f__Hydnaceae;g__Sistotrema;s__Sistotrema_brinkmannii
k__Fungi;p__Ascomycota;c__Saccharomycetes;o__Saccharomycetales;f__Saccharomycetales_fam_Incertae_sedis;g__Debaryomyces;s__Debaryomyces_prosopidis
k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Agaricales;f__Amanitaceae;g__Amanita;s__Amanita_lividopallescens
k__Fungi;p__Ascomycota;c__Leotiomycetes;o__Helotiales;f__Helotiaceae;g__Hymenoscyphus;s__Hymenoscyphus_fraxineus
k__Fungi;p__Ascomycota;c__Archaeorhizomycetes;o__Archaeorhizomycetales;f__Archaeorhizomycetaceae;g__Archaeorhizomyces;s__Archaeorhizomyces_finlayi
k__Fungi;p__Ascomycota;c__Sordariomycetes;o__Hypocreales;f__Nectriaceae;g__Fusarium;s__Fusarium_poae
k__Fungi;p__Ascomycota;c__Pezizomycetes;o__Pezizales;f__Rhizinaceae;g__Rhizina;s__Rhizina_undulata
k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Agaricales;f__Cortinariaceae;g__Cortinarius;s__Cortinarius_purpurascens
unite_20.11.2016_clean_amplicon
k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Cantharellales;f__Hydnaceae;g__Sistotrema;s__Sistotrema_brinkmannii
k__Fungi;p__Ascomycota;c__Saccharomycetes;o__Saccharomycetales;f__Saccharomycetales_fam_Incertae_sedis;g__Debaryomyces;s__Debaryomyces_prosopidis
k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Agaricales;f__Amanitaceae;g__Amanita;s__Amanita_lividopallescens
k__Fungi;p__Ascomycota;c__Leotiomycetes;o__Helotiales;f__Helotiaceae;g__Hymenoscyphus;s__Hymenoscyphus_fraxineus
k__Fungi;p__Ascomycota;c__Archaeorhizomycetes;o__Archaeorhizomycetales;f__Archaeorhizomycetaceae;g__Archaeorhizomyces;s__Archaeorhizomyces_finlayi
k__Fungi;p__Ascomycota;c__Sordariomycetes;o__Hypocreales;f__Nectriaceae;g__Fusarium;s__Fusarium_poae
k__Fungi;p__Ascomycota;c__Pezizomycetes;o__Pezizales;f__Rhizinaceae;g__Rhizina;s__Rhizina_undulata
k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Agaricales;f__Cortinariaceae;g__Cortinarius;s__Cortinarius_purpurascens
mock-26-ITS9
unite_20.11.2016_clean_full
unite_20.11.2016_clean_read
k__Fungi;p__Ascomycota;c__Leotiomycetes;o__Helotiales;f__Helotiaceae;g__Hymenoscyphus;s__Hymenoscyphus_fraxineus
k__Fungi;p__Ascomycota;c__Archaeorhizomycetes;o__Archaeorhizomycetales;f__Archaeorhizomycetaceae;g__Archaeorhizomyces;s__Archaeorhizomyces_finlayi
k__Fungi;p__Ascomycota;c__Sordariomycetes;o__Hypocreales;f__Nectriaceae;g__Fusarium;s__Fusarium_poae
k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Agaricales;f__Cortinariaceae;g__Cortinarius;s__Cortinarius_purpurascens
unite_20.11.2016_clean_amplicon
k__Fungi;p__Ascomycota;c__Leotiomycetes;o__Helotiales;f__Helotiaceae;g__Hymenoscyphus;s__Hymenoscyphus_fraxineus
k__Fungi;p__Ascomycota;c__Archaeorhizomycetes;o__Archaeorhizomycetales;f__Archaeorhizomycetaceae;g__Archaeorhizomyces;s__Archaeorhizomyces_finlayi
k__Fungi;p__Ascomycota;c__Sordariomycetes;o__Hypocreales;f__Nectriaceae;g__Fusarium;s__Fusarium_poae
k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Agaricales;f__Cortinariaceae;g__Cortinarius;s__Cortinarius_purpurascens

Classifier Parameter Sweeps


In [6]:
nb_extra_sweep = \
    {'feat-ext--n-features': [1024, 8192, 65536],
     'feat-ext--ngram-range': [[4,4], [8, 8], [16, 16], [4,16]],
     'norm--norm': ['l1', 'l2', None],
     'norm--use-idf': [True, False],
     'classify--alpha': [0.001, 0.01, 0.1],
     'classify--class-prior': ['uniform', 'prior']}
        
classifier_params = {'nb-extra': nb_extra_sweep}

confidences = [0., 0.2, 0.4, 0.6, 0.8]

Classifier fitting scripts


In [7]:
def get_classifier_command(method, inputs, params, priors):
    cmd = ['qiime feature-classifier fit-classifier-' + method]
    cls = [method]
    
    for param in sorted(inputs):
        value = inputs[param]
        cmd.extend(['--i-' + param, value])
        cls.append(os.path.basename(value).split('.')[0])
    
    for param in sorted(params):
        value = params[param]
        if value == 'prior':
            cls.append(os.path.basename(priors).split('.')[0])
        else:
            cls.append(str(value).replace(' ',''))
        
        if type(value) is bool:
            cmd.append('--p-' + ('' if value else 'no-') + param)
            continue
        
        if 'class-prior' not in param:
            value = json.dumps(value)
            if value[0] != '"' or value[-1] != '"':
                value = '"' + value + '"'
            cmd.extend(['--p-' + param, value])
            continue
            
        if value == 'uniform':
            continue
            
        cmd.extend(['--p-' + param, '"`cat ' + priors + '`"'])
    
    cls = ':'.join(cls) + '.qza'
    cls = os.path.sep + join('state', 'partition1', 'tmp', 'classifiers', cls)
    
    cmd.extend(['--o-classifier', '"' + cls + '"'])
    cmd = ' '.join(cmd)
    
    return cls, cmd

def get_classify_command(classifier, reads, params, 
                         confidence, directory, results_dir):
    cmd = ['qiime feature-classifier classify-sklearn']
    cmd.extend(['--i-classifier', '"' + classifier + '"'])
    cmd.extend(['--i-reads', reads])
    cmd.extend(['--p-confidence', str(confidence)])
    
    parameters = [str(params[p]).replace(' ', '') for p in sorted(params)]
    parameters.append(str(confidence))
    output_directory = join(directory, ':'.join(parameters))
    if not os.path.exists(join(results_dir, output_directory)):
        os.makedirs(join(results_dir, output_directory))
    output = join(output_directory, 'rep_seqs_tax_assignments.qza')
    cmd.extend(['--o-classification', '"' + output + '"'])
    
    return output, ' '.join(cmd)
    
def get_combinations(params):
    params, values = zip(*params.items())
    for combination in product(*values):
        yield dict(zip(params, combination))

In [8]:
if not os.path.exists(join(results_dir, 'classifiers')):
    os.makedirs(join(results_dir, 'classifiers'))

classifier_commands = set()
classify_commands = []
classifiers = set()
classifications = []
for mock in mock_dirs:
    reads = join('..', 'data', 'mock-community', mock, 'rep_seqs.qza')
    mock_directory = join('classifications', mock)
    inputs = {'reference-taxonomy': taxs[mock]}
    for db_name, db_file in ref_dbs[mock]:
        db_directory = join(mock_directory, db_name)
        inputs['reference-reads'] = db_file
        for method in classifier_params:
            method_directory = join(db_directory, method)
            for params in get_combinations(classifier_params[method]):
                priors = priors_files[mock][db_name]
                classifier, command = get_classifier_command(method, inputs, params, priors)
                classifier_commands.add(command)
                classifiers.add(classifier)
                for confidence in confidences:
                    classification, command = get_classify_command(
                        classifier, reads, params, confidence,
                        method_directory, results_dir)
                    classifications.append(classification)
                    classify_commands.append(command)
                
# write out the commands
with open(join(results_dir, 'classifier_commands.sh'), 'w') as cmds:
    for cmd in classifier_commands:
        cmds.write(cmd + '\n')
with open(join(results_dir, 'classify_commands.sh'), 'w') as cmds:
    for cmd in classify_commands:
        cmds.write(cmd + '\n')

Additional files for tax-credit


In [9]:
bad_classifications = []
for classification in classifications:
    full_classification = join(results_dir, classification)
    output_dir = os.path.dirname(full_classification)

    taxonomy_map_fp = join(output_dir, 'taxonomy.tsv')      
    if not os.path.exists(taxonomy_map_fp):
        try:
            Artifact.load(full_classification).export_data(output_dir)
        except ValueError:
            bad_classifications.append(classification)

In [18]:
print(len(bad_classifications), "bad classifications")
bc_combinations = None
for bc in bad_classifications:
    if '[4,16]' not in bc:
        print(bc)
        continue
    sbc = []
    for tbc in bc.split(os.path.sep):
        sbc.extend(tbc.split(':'))
    if bc_combinations is None:
        bc_combinations = [{tbc} for tbc in sbc]
    else:
        for tbc, bcc in zip(sbc, bc_combinations):
            bcc.add(tbc)
print(bc_combinations)


742 bad classifications
classifications/mock-26-ITS1/unite_20.11.2016_clean_full/nb-extra/0.001:uniform:65536:[4,4]:l1:True:0.0/rep_seqs_tax_assignments.qza
classifications/mock-26-ITS1/unite_20.11.2016_clean_full/nb-extra/0.001:uniform:65536:[8,8]:l1:True:0.4/rep_seqs_tax_assignments.qza
classifications/mock-26-ITS1/unite_20.11.2016_clean_full/nb-extra/0.001:uniform:65536:[4,4]:l2:True:0.0/rep_seqs_tax_assignments.qza
classifications/mock-26-ITS1/unite_20.11.2016_clean_full/nb-extra/0.01:uniform:65536:[4,4]:l2:True:0.6/rep_seqs_tax_assignments.qza
classifications/mock-26-ITS1/unite_20.11.2016_clean_full/nb-extra/0.1:uniform:65536:[4,4]:l2:True:0.0/rep_seqs_tax_assignments.qza
classifications/mock-26-ITS1/unite_20.11.2016_clean_full/nb-extra/0.001:uniform:65536:[16,16]:l2:False:0.8/rep_seqs_tax_assignments.qza
classifications/mock-26-ITS1/unite_20.11.2016_clean_full/nb-extra/0.01:uniform:65536:[4,4]:None:True:0.2/rep_seqs_tax_assignments.qza
classifications/mock-26-ITS1/unite_20.11.2016_clean_full/nb-extra/0.01:uniform:65536:[4,4]:None:True:0.4/rep_seqs_tax_assignments.qza
classifications/mock-26-ITS1/unite_20.11.2016_clean_full/nb-extra/0.001:uniform:65536:[8,8]:None:True:0.4/rep_seqs_tax_assignments.qza
classifications/mock-26-ITS1/unite_20.11.2016_clean_full/nb-extra/0.001:uniform:65536:[16,16]:None:False:0.4/rep_seqs_tax_assignments.qza
classifications/mock-26-ITS9/unite_20.11.2016_clean_full/nb-extra/0.001:uniform:65536:[8,8]:l1:True:0.0/rep_seqs_tax_assignments.qza
classifications/mock-26-ITS9/unite_20.11.2016_clean_full/nb-extra/0.1:uniform:65536:[8,8]:l1:True:0.4/rep_seqs_tax_assignments.qza
[{'classifications'}, {'mock-22', 'mock-3', 'mock-12', 'mock-18'}, {'gg_13_8_otus_full'}, {'nb-extra'}, {'0.001', '0.01', '0.1'}, {'uniform', 'prior'}, {'65536', '8192'}, {'[4,16]'}, {'l2', 'None', 'l1'}, {'True', 'False'}, {'0.6', '0.8', '0.2', '0.4', '0.0'}, {'rep_seqs_tax_assignments.qza'}]

In [19]:
taxonomy_glob = join(results_dir, 'classifications', 'mock-*', '*', 'nb-extra', '*', 'taxonomy.tsv')
generate_per_method_biom_tables(taxonomy_glob, data_dir)

In [20]:
precomputed_results_dir = join(project_dir, "data", "precomputed-results", analysis_name)
method_dirs = glob.glob(join(results_dir, 'classifications', 'mock-*', '*', 'nb-extra', '*'))
move_results_to_repository(method_dirs, precomputed_results_dir)

In [ ]: