In [1]:
import glob
from os.path import join
import os
import csv
import shutil
import json
from itertools import product

from qiime2 import Artifact
from qiime2.plugins import feature_classifier
from q2_types.feature_data import DNAIterator
from q2_feature_classifier.classifier import \
    spec_from_pipeline, pipeline_from_spec, _register_fitter
from pandas import DataFrame, Series

from tax_credit.framework_functions import \
    generate_per_method_biom_tables, move_results_to_repository

File Paths and Communities


In [2]:
project_dir = join('..', '..')
analysis_name = 'mock-community'
data_dir = join(project_dir, 'data', analysis_name)
precomputed_dir = join(project_dir, 'data', 'precomputed-results', analysis_name)

ref_db_dir = join(project_dir, 'data', 'ref_dbs')

gg_db = join(ref_db_dir, 'gg_13_8_otus/99_otus.fasta')
gg_tax = join(ref_db_dir, 'gg_13_8_otus/99_otu_taxonomy.txt')
unite_db = join(ref_db_dir, 'unite_20.11.2016/sh_refs_qiime_ver7_99_20.11.2016_dev_clean.fasta')
unite_tax = join(ref_db_dir, 'unite_20.11.2016/sh_taxonomy_qiime_ver7_99_20.11.2016_dev_clean.tsv')

results_dir = join(project_dir, 'temp_results_narrow')
if not os.path.exists(results_dir):
    os.makedirs(results_dir)

# note: mock-11 and mock-17 are excluded from this analysis
mock_dirs = ['mock-' + str(m) for m in
             list(range(1, 11)) + list(range(12, 17)) +
             list(range(18, 26)) + ['26-ITS1', '26-ITS9']]

Import Reference Databases


In [3]:
ref_dest = 'ref_dbs'

ref_16S = join(ref_dest, '99_gg_seq.qza')
ref_ITS = join(ref_dest, '99_unite_seq.qza')
tax_16S = join(ref_dest, '99_gg_tax.qza')
tax_ITS = join(ref_dest, '99_unite_tax.qza')

if not os.path.exists(join(results_dir, ref_dest)):
    os.makedirs(join(results_dir, ref_dest))
ref = Artifact.import_data('FeatureData[Sequence]', gg_db)
ref.save(join(results_dir, ref_16S))
tax = Artifact.import_data('FeatureData[Taxonomy]', gg_tax,
                           view_type='HeaderlessTSVTaxonomyFormat')
tax.save(join(results_dir, tax_16S))
ref = Artifact.import_data('FeatureData[Sequence]', unite_db)
ref.save(join(results_dir, ref_ITS))
tax = Artifact.import_data('FeatureData[Taxonomy]', unite_tax,
                           view_type='HeaderlessTSVTaxonomyFormat')
tax.save(join(results_dir, tax_ITS));

Amplicon Extraction


In [4]:
ref_dbs = {}
taxs = {}
for mock in mock_dirs:
    # Pull the primer pair from the first row of each community's sample
    # metadata and infer the sample type from the primer name
    mockdir = join(data_dir, mock)
    primer_file = join(mockdir, 'sample-metadata.tsv')
    with open(primer_file) as csvfile:
        data = next(csv.DictReader(csvfile, delimiter='\t'))
    primers = [data['LinkerPrimerSequence'], data['ReversePrimer']]
    sample_type = 'ITS' if 'ITS' in data['PrimerName'] else '16S'
    assert sample_type == 'ITS' or '515f' in data['PrimerName']
    
    if sample_type == '16S':
        ref_dbs[mock] = [('gg_13_8_otus_full', ref_16S)]
        taxs[mock] = tax_16S
        ref = ref_16S
        db_name = 'gg_13_8_otus_amplicon'
    else:
        ref_dbs[mock] = [('unite_20.11.2016_clean_full', ref_ITS)]
        taxs[mock] = tax_ITS
        ref = ref_ITS
        db_name = 'unite_20.11.2016_clean_amplicon'
    
    # Normalize primers: use the shorter canonical variant where the
    # metadata stores an extended sequence, and replace inosine (I) with
    # the IUPAC wildcard N
    if primers[0] == 'CCGTGCCAGCMGCCGCGGTAA':
        primers[0] = 'GTGCCAGCMGCCGCGGTAA'
    elif primers[0] == 'ATCTTGGTCATTTAGAGGAAGTAA':
        primers[0] = 'CTTGGTCATTTAGAGGAAGTAA'
    elif 'I' in primers[0]:
        primers[0] = primers[0].replace('I', 'N')
    
    # Name the amplicon database after the reference plus both primers,
    # and extract the reads only if the artifact does not already exist
    db_file = '_'.join(
        [ref.rsplit('.', 1)[0]] +
        list(primers)) + '.qza'
    ref_dbs[mock].append((db_name, db_file))
    db_file = join(results_dir, db_file)
    if not os.path.exists(db_file):
        seqs = Artifact.load(join(results_dir, ref))
        trimmed = feature_classifier.methods.extract_reads(
                    sequences=seqs, f_primer=primers[0], r_primer=primers[1]).reads
        trimmed.save(db_file)
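
A quick, optional sanity check on the extraction step: compare the size of the full reference against its amplicon-trimmed counterpart (a sketch; loading the full database is slow, and `ref_dbs['mock-1']` is assumed to hold the Greengenes full/amplicon pair built above).


In [ ]:
# Compare full vs. amplicon-trimmed reference sizes for one database
# (illustrative only)
full_name, full_db = ref_dbs['mock-1'][0]
trim_name, trim_db = ref_dbs['mock-1'][1]
full_n = sum(1 for _ in Artifact.load(join(results_dir, full_db)).view(DNAIterator))
trim_n = sum(1 for _ in Artifact.load(join(results_dir, trim_db)).view(DNAIterator))
print(trim_name, ':', trim_n, 'of', full_n, 'sequences retained')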

Find Class Weights

Map the observed mock community abundances onto reference taxonomy labels using each community's expected-taxonomy.tsv (e.g. tax-credit/data/mock-community/mock-3/expected-taxonomy.tsv); the file format is sketched below.
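
These are wide tables: a Taxonomy column of reference labels plus one relative-abundance column per sample. A minimal peek at one of them (a sketch; the path mirrors the et_path construction in the weights cell below):


In [ ]:
# Show the header and the first row's taxonomy label of one
# expected-taxonomy table
et_path = join(precomputed_dir, 'mock-3', 'gg_13_8_otus',
               'expected', 'expected-taxonomy.tsv')
with open(et_path) as tf:
    reader = csv.DictReader(tf, delimiter='\t')
    first = next(reader)
print(reader.fieldnames)
print(first['Taxonomy'])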


In [8]:
weights_dest = 'weights'
if not os.path.exists(join(results_dir, weights_dest)):
    os.makedirs(join(results_dir, weights_dest))

priors_files = {}
for mock in mock_dirs:
    print(mock)
    mockdir = join(data_dir, mock)
    for db_name, db_file in ref_dbs[mock]:
        print(db_name)
        # Start every reference taxonomy label whose sequence survives
        # amplicon extraction at weight zero
        tax_weights = Artifact.load(join(results_dir, taxs[mock]))
        seq_ids = Artifact.load(join(results_dir, db_file))
        seq_ids = {s.metadata['id'] for s in seq_ids.view(DNAIterator)}
        tax_weights = tax_weights.view(Series)
        tax_weights = {tax_weights[sid]: 0. for sid in tax_weights.index
                       if sid in seq_ids}

        # Weight each sample by its share of the total reads in the mock
        # community feature table
        weights = Artifact.load(join(mockdir, 'feature_table.qza'))
        weights = weights.view(DataFrame)
        if len(weights.index) > 1:
            weights = {s: sum(weights.loc[s]) for s in weights.index}
            total = sum(weights.values())
            weights = {s: w/total for s, w in weights.items()}
        else:
            weights = {weights.index[0]: 1.}

        # Accumulate each expected abundance onto its reference label; when
        # an expected label has no exact match (it is broader than the
        # reference labels), spread its weight evenly across the matching
        # species, and print labels that match nothing
        et_path = join(precomputed_dir, mock)
        if db_name.startswith('gg_13_8_otus'):
            et_path = join(et_path, 'gg_13_8_otus')
        else:
            et_path = join(et_path, 'unite_20.11.2016_clean_fullITS')
        et_path = join(et_path, 'expected', 'expected-taxonomy.tsv')
        with open(et_path) as tf:
            reader = csv.DictReader(tf, delimiter='\t')
            for row in reader:
                tax = row['Taxonomy']
                weight = sum(weights[s]*float(row[s]) for s in weights)
                try:
                    tax_weights[tax] += weight
                except KeyError:
                    species = {t for t in tax_weights if t.startswith(tax)}
                    if len(species) == 0:
                        print(tax)
                    else:
                        for s in species:
                            tax_weights[s] += weight/len(species)

        # Floor tiny weights so every class keeps a nonzero prior
        for tax in tax_weights:
            if tax_weights[tax] < 1e-9:
                tax_weights[tax] = 1e-9
        total = sum(tax_weights.values())

        # Normalize and serialize in sorted-label order
        weights = [tax_weights[t]/total for t in sorted(tax_weights)]
        filename = mock + '-' + db_name + '-weights.json'
        weights_file = join(weights_dest, filename)
        priors_files[mock] = priors_files.get(mock, {})
        priors_files[mock][db_name] = weights_file
        weights_file = join(results_dir, weights_file)
        with open(weights_file, 'w') as wf:
            json.dump(weights, wf)


mock-1
gg_13_8_otus_full
gg_13_8_otus_amplicon
mock-2
gg_13_8_otus_full
gg_13_8_otus_amplicon
mock-3
gg_13_8_otus_full
gg_13_8_otus_amplicon
mock-4
gg_13_8_otus_full
gg_13_8_otus_amplicon
mock-5
gg_13_8_otus_full
gg_13_8_otus_amplicon
mock-6
gg_13_8_otus_full
gg_13_8_otus_amplicon
mock-7
gg_13_8_otus_full
gg_13_8_otus_amplicon
mock-8
gg_13_8_otus_full
gg_13_8_otus_amplicon
mock-9
unite_20.11.2016_clean_full
unite_20.11.2016_clean_amplicon
k__Fungi;p__Ascomycota;c__Saccharomycetes;o__Saccharomycetales;f__Saccharomycetaceae;g__Kluyveromyces;s__Kluyveromyces_lactis
k__Fungi;p__Ascomycota;c__Saccharomycetes;o__Saccharomycetales;f__Saccharomycetaceae;g__Zygosaccharomyces;s__Zygosaccharomyces_rouxii
k__Fungi;p__Ascomycota;c__Saccharomycetes;o__Saccharomycetales;f__Pichiaceae;g__Hyphopichia;s__Hyphopichia_burtonii
k__Fungi;p__Ascomycota;c__Saccharomycetes;o__Saccharomycetales;f__Saccharomycetales_fam_Incertae_sedis;g__Candida;s__Candida_catenulata
k__Fungi;p__Ascomycota;c__Saccharomycetes;o__Saccharomycetales;f__Saccharomycetales_fam_Incertae_sedis;g__Cyberlindnera;s__Cyberlindnera_jadinii
k__Fungi;p__Ascomycota;c__Saccharomycetes;o__Saccharomycetales;f__Saccharomycetales_fam_Incertae_sedis;g__Debaryomyces;s__Debaryomyces_hansenii
k__Fungi;p__Ascomycota;c__Sordariomycetes;o__Microascales;f__Microascaceae;g__Scopulariopsis
k__Fungi;p__Ascomycota;c__Eurotiomycetes;o__Eurotiales;f__Trichocomaceae;g__Penicillium;s__Penicillium_commune
mock-10
unite_20.11.2016_clean_full
unite_20.11.2016_clean_amplicon
k__Fungi;p__Ascomycota;c__Saccharomycetes;o__Saccharomycetales;f__Saccharomycetaceae;g__Kluyveromyces;s__Kluyveromyces_lactis
k__Fungi;p__Ascomycota;c__Saccharomycetes;o__Saccharomycetales;f__Saccharomycetaceae;g__Zygosaccharomyces;s__Zygosaccharomyces_rouxii
k__Fungi;p__Ascomycota;c__Saccharomycetes;o__Saccharomycetales;f__Pichiaceae;g__Hyphopichia;s__Hyphopichia_burtonii
k__Fungi;p__Ascomycota;c__Saccharomycetes;o__Saccharomycetales;f__Saccharomycetales_fam_Incertae_sedis;g__Candida;s__Candida_catenulata
k__Fungi;p__Ascomycota;c__Saccharomycetes;o__Saccharomycetales;f__Saccharomycetales_fam_Incertae_sedis;g__Cyberlindnera;s__Cyberlindnera_jadinii
k__Fungi;p__Ascomycota;c__Saccharomycetes;o__Saccharomycetales;f__Saccharomycetales_fam_Incertae_sedis;g__Debaryomyces;s__Debaryomyces_hansenii
k__Fungi;p__Ascomycota;c__Sordariomycetes;o__Microascales;f__Microascaceae;g__Scopulariopsis
k__Fungi;p__Ascomycota;c__Eurotiomycetes;o__Eurotiales;f__Trichocomaceae;g__Penicillium;s__Penicillium_commune
mock-12
gg_13_8_otus_full
gg_13_8_otus_amplicon
mock-13
gg_13_8_otus_full
gg_13_8_otus_amplicon
mock-14
gg_13_8_otus_full
gg_13_8_otus_amplicon
mock-15
gg_13_8_otus_full
gg_13_8_otus_amplicon
mock-16
gg_13_8_otus_full
gg_13_8_otus_amplicon
mock-18
gg_13_8_otus_full
gg_13_8_otus_amplicon
mock-19
gg_13_8_otus_full
gg_13_8_otus_amplicon
mock-20
gg_13_8_otus_full
gg_13_8_otus_amplicon
mock-21
gg_13_8_otus_full
gg_13_8_otus_amplicon
mock-22
gg_13_8_otus_full
gg_13_8_otus_amplicon
mock-23
gg_13_8_otus_full
gg_13_8_otus_amplicon
mock-24
unite_20.11.2016_clean_full
unite_20.11.2016_clean_amplicon
k__Fungi;p__Chytridiomycota;c__Chytridiomycetes;o__Spizellomycetales;f__Spizellomycetaceae;g__Spizellomyces;s__Spizellomyces_punctatus
mock-25
unite_20.11.2016_clean_full
unite_20.11.2016_clean_amplicon
k__Fungi;p__Chytridiomycota;c__Chytridiomycetes;o__Spizellomycetales;f__Spizellomycetaceae;g__Spizellomyces;s__Spizellomyces_punctatus
mock-26-ITS1
unite_20.11.2016_clean_full
unite_20.11.2016_clean_amplicon
k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Cantharellales;f__Hydnaceae;g__Sistotrema;s__Sistotrema_brinkmannii
k__Fungi;p__Ascomycota;c__Saccharomycetes;o__Saccharomycetales;f__Saccharomycetales_fam_Incertae_sedis;g__Debaryomyces;s__Debaryomyces_prosopidis
k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Agaricales;f__Amanitaceae;g__Amanita;s__Amanita_lividopallescens
k__Fungi;p__Ascomycota;c__Leotiomycetes;o__Helotiales;f__Helotiaceae;g__Hymenoscyphus;s__Hymenoscyphus_fraxineus
k__Fungi;p__Ascomycota;c__Archaeorhizomycetes;o__Archaeorhizomycetales;f__Archaeorhizomycetaceae;g__Archaeorhizomyces;s__Archaeorhizomyces_finlayi
k__Fungi;p__Ascomycota;c__Sordariomycetes;o__Hypocreales;f__Nectriaceae;g__Fusarium;s__Fusarium_poae
k__Fungi;p__Ascomycota;c__Pezizomycetes;o__Pezizales;f__Rhizinaceae;g__Rhizina;s__Rhizina_undulata
k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Agaricales;f__Cortinariaceae;g__Cortinarius;s__Cortinarius_purpurascens
mock-26-ITS9
unite_20.11.2016_clean_full
unite_20.11.2016_clean_amplicon
k__Fungi;p__Ascomycota;c__Leotiomycetes;o__Helotiales;f__Helotiaceae;g__Hymenoscyphus;s__Hymenoscyphus_fraxineus
k__Fungi;p__Ascomycota;c__Archaeorhizomycetes;o__Archaeorhizomycetales;f__Archaeorhizomycetaceae;g__Archaeorhizomyces;s__Archaeorhizomyces_finlayi
k__Fungi;p__Ascomycota;c__Sordariomycetes;o__Hypocreales;f__Nectriaceae;g__Fusarium;s__Fusarium_poae
k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Agaricales;f__Cortinariaceae;g__Cortinarius;s__Cortinarius_purpurascens

Classifier Parameter Sweeps


In [16]:
nb_sweep = \
    {'feat-ext--analyzer': ['char'],
     'feat-ext--n-features': [8192],
     'feat-ext--ngram-range': [[4, 4], [6, 6], [8, 8], [16, 16], [32, 32],
                               [7, 7], [9, 9], [10, 10], [11, 11],
                               [12, 12], [14, 14], [18, 18]],
     'classify--alpha': [0.001]}
nb_bespoke_sweep = \
    {'feat-ext--analyzer': ['char'],
     'feat-ext--n-features': [8192],
     'feat-ext--ngram-range': [[4, 4], [6, 6], [8, 8], [16, 16], [32, 32],
                               [7, 7], [9, 9], [10, 10], [11, 11],
                               [12, 12], [14, 14], [18, 18]],
     'classify--alpha': [0.001],
     'classify--class-prior': ['prior']}
    
classifier_params = {'naive-bayes': nb_sweep,
                     'naive-bayes-bespoke': nb_bespoke_sweep}

confidences = [0.0, 0.5, 0.7, 0.9, 0.92, 0.94,
               0.96, 0.98, 1.0]
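
Each sweep dictionary expands to the Cartesian product of its value lists, so the size of the run can be tallied up front (a sketch; get_combinations in the next cell does the actual expansion):


In [ ]:
# Tally the sweep: classifier fits per (mock, database) pair, and
# classifications once the confidence grid is applied
for method, params in classifier_params.items():
    n_fits = len(list(product(*params.values())))
    print(method, ':', n_fits, 'fits,',
          n_fits * len(confidences), 'classifications')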

Classifier Fitting Scripts


In [17]:
def get_classifier_command(method, inputs, params, priors):
    """Build the CLI call that fits one classifier, returning the
    classifier's output path (which encodes its parameters) and the command."""
    cmd = ['qiime feature-classifier fit-classifier-naive-bayes']
    cls = [method]
    
    for param in sorted(inputs):
        value = inputs[param]
        cmd.extend(['--i-' + param, value])
        cls.append(os.path.basename(value).split('.')[0])
    
    for param in sorted(params):
        value = params[param]
        # parameter values also become part of the classifier's filename;
        # 'prior' is a placeholder for the per-mock weights file
        if value == 'prior':
            cls.append(os.path.basename(priors).split('.')[0])
        else:
            cls.append(str(value).replace(' ', ''))

        # boolean parameters become --p-<param> / --p-no-<param> flags
        if type(value) is bool:
            cmd.append('--p-' + ('' if value else 'no-') + param)
            continue
        
        if 'class-prior' not in param:
            # JSON-encode and quote ordinary parameters for the shell
            value = json.dumps(value)
            if value[0] != '"' or value[-1] != '"':
                value = '"' + value + '"'
            cmd.extend(['--p-' + param, value])
            continue

        if value == 'uniform':
            continue

        # class-prior: splice the precomputed weights JSON in at run time
        cmd.extend(['--p-' + param, '"`cat ' + priors + '`"'])
    
    cls = ':'.join(cls) + '.qza'
    # absolute path where the fitted classifier artifact will be written
    cls = os.path.sep + join('state', 'partition1', 'tmp', 'classifiers_narrow', cls)
    
    cmd.extend(['--o-classifier', '"' + cls + '"'])
    cmd = ' '.join(cmd)
    
    return cls, cmd

def get_classify_command(classifier, reads, params,
                         confidence, directory, results_dir):
    """Build the CLI call that classifies one mock community's reads."""
    cmd = ['qiime feature-classifier classify-sklearn']
    cmd.extend(['--i-classifier', '"' + classifier + '"'])
    cmd.extend(['--i-reads', reads])
    cmd.extend(['--p-confidence', str(confidence)])
    
    parameters = [str(params[p]).replace(' ', '') for p in sorted(params)]
    parameters.append(str(confidence))
    output_directory = join(directory, ':'.join(parameters))
    if not os.path.exists(join(results_dir, output_directory)):
        os.makedirs(join(results_dir, output_directory))
    output = join(output_directory, 'rep_seqs_tax_assignments.qza')
    cmd.extend(['--o-classification', '"' + output + '"'])
    
    return output, ' '.join(cmd)
    
def get_combinations(params):
    """Yield one {parameter: value} dict per point on the parameter grid."""
    params, values = zip(*params.items())
    for combination in product(*values):
        yield dict(zip(params, combination))
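
A small usage example for get_combinations (toy values, not part of the sweep):


In [ ]:
# get_combinations yields one {parameter: value} dict per grid point
toy = {'classify--alpha': [0.001],
       'feat-ext--ngram-range': [[6, 6], [8, 8]]}
for combo in get_combinations(toy):
    print(combo)
# {'classify--alpha': 0.001, 'feat-ext--ngram-range': [6, 6]}
# {'classify--alpha': 0.001, 'feat-ext--ngram-range': [8, 8]}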

In [19]:
if not os.path.exists(join(results_dir, 'classifiers')):
    os.makedirs(join(results_dir, 'classifiers'))

classifier_commands = set()
classify_commands = []
classifiers = set()
classifications = []
for mock in mock_dirs:
    reads = join('..', 'data', 'mock-community', mock, 'rep_seqs.qza')
    mock_directory = join('classifications', mock)
    inputs = {'reference-taxonomy': taxs[mock]}
    for db_name, db_file in ref_dbs[mock]:
        db_directory = join(mock_directory, db_name)
        inputs['reference-reads'] = db_file
        for method in classifier_params:
            method_directory = join(db_directory, method)
            for params in get_combinations(classifier_params[method]):
                priors = priors_files[mock][db_name]
                classifier, command = get_classifier_command(method, inputs, params, priors)
                classifier_commands.add(command)
                classifiers.add(classifier)
                for confidence in confidences:
                    classification, command = get_classify_command(
                        classifier, reads, params, confidence,
                        method_directory, results_dir)
                    classifications.append(classification)
                    classify_commands.append(command)
                
# write out the commands
with open(join(results_dir, 'classifier_commands.sh'), 'w') as cmds:
    for cmd in classifier_commands:
        cmds.write(cmd + '\n')
with open(join(results_dir, 'classify_commands.sh'), 'w') as cmds:
    for cmd in classify_commands:
        cmds.write(cmd + '\n')
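
Before shipping the scripts off to run, it is worth eyeballing one command of each kind (illustrative; the exact output depends on the sweep above):


In [ ]:
# Spot-check one fit command and one classify command
print(sorted(classifier_commands)[0])
print(classify_commands[0])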

Additional Files for tax-credit


In [20]:
bad_classifications = []
for classification in classifications:
    full_classification = join(results_dir, classification)
    output_dir = os.path.dirname(full_classification)

    # Export taxonomy.tsv next to each classification artifact, recording
    # any artifact that fails to load
    taxonomy_map_fp = join(output_dir, 'taxonomy.tsv')
    if not os.path.exists(taxonomy_map_fp):
        try:
            Artifact.load(full_classification).export_data(output_dir)
        except ValueError:
            bad_classifications.append(classification)

In [21]:
print(len(bad_classifications), "bad classifications")
# Summarize which parameter values co-occur among the failed runs
bc_combinations = None
for bc in bad_classifications:
    if '[4,16]' not in bc:
        print(bc)
        continue
    sbc = []
    for tbc in bc.split(os.path.sep):
        sbc.extend(tbc.split(':'))
    if bc_combinations is None:
        bc_combinations = [{tbc} for tbc in sbc]
    else:
        for tbc, bcc in zip(sbc, bc_combinations):
            bcc.add(tbc)
print(bc_combinations)


0 bad classifications
None

In [22]:
taxonomy_glob = join(results_dir, 'classifications', 'mock-*', '*', 'naive-bayes*', '*', 'taxonomy.tsv')
generate_per_method_biom_tables(taxonomy_glob, data_dir)

In [23]:
precomputed_results_dir = join(project_dir, "data", "precomputed-results", analysis_name)
method_dirs = glob.glob(join(results_dir, 'classifications', 'mock-*', '*', 'naive-bayes*', '*'))
move_results_to_repository(method_dirs, precomputed_results_dir)
