In [1]:
f = open('test.biom','w')
f.write("""{
    "columns": [
        {
            "id": "Sample1",
            "metadata": {
                "BarcodeSequence": "AGCACGAGCCTA",
                "DOB": 20060805
            }
        },
        {
            "id": "Sample2",
            "metadata": {
                "BarcodeSequence": "AACTCGTCGATG",
                "DOB": 20060216
            }
        },
        {
            "id": "Sample3",
            "metadata": {
                "BarcodeSequence": "ACAGACCACTCA",
                "DOB": 20060109
            }
        },
        {
            "id": "Sample4",
            "metadata": {
                "BarcodeSequence": "ACCAGCGACTAG",
                "DOB": 20070530
            }
        },
        {
            "id": "Sample5",
            "metadata": {
                "BarcodeSequence": "AGCAGCACTTGT",
                "DOB": 20070101
            }
        },
        {
            "id": "Sample6",
            "metadata": {
                "BarcodeSequence": "AGCAGCACAACT",
                "DOB": 20070716
            }
        }
    ],
    "data": [
        [0, 2, 1.0],
        [1, 0, 5.0],
        [1, 1, 1.0],
        [1, 3, 2.0],
        [1, 4, 3.0],
        [1, 5, 1.0],
        [2, 2, 1.0],
        [2, 3, 4.0],
        [2, 5, 2.0],
        [3, 0, 2.0],
        [3, 1, 1.0],
        [3, 2, 1.0],
        [3, 5, 1.0],
        [4, 1, 1.0],
        [4, 2, 1.0]
    ],
    "date": "2012-12-11T07:30:29.870689",
    "format": "Biological Observation Matrix 1.0.0",
    "format_url": "http://biom-format.org",
    "generated_by": "some software package",
    "id": null,
    "matrix_element_type": "float",
    "matrix_type": "sparse",
    "rows": [
        {
            "id": "GG_OTU_1",
            "metadata": {
                "confidence": 0.665,
                "taxonomy": ["Root", "k__Bacteria", "p__Firmicutes", "c__Clostridia", "o__Clostridiales", "f__Lachnospiraceae"]
            }
        },
        {
            "id": "GG_OTU_2",
            "metadata": {
                "confidence": 0.98,
                "taxonomy": ["Root", "k__Bacteria", "p__Firmicutes", "c__Clostridia", "o__Clostridiales", "f__OnlyOnce1"]
            }
        },
        {
            "id": "GG_OTU_3",
            "metadata": {
                "confidence": 1.0,
                "taxonomy": ["Root", "k__Bacteria", "p__Firmicutes", "c__Clostridia", "o__Clostridiales", "f__Lachnospiraceae"]
            }
        },
        {
            "id": "GG_OTU_4",
            "metadata": {
                "confidence": 0.842,
                "taxonomy": ["Root", "k__Bacteria", "p__Firmicutes", "c__Clostridia", "o__Clostridiales", "f__Lachnospiraceae"]
            }
        },
        {
            "id": "GG_OTU_5",
            "metadata": {
                "confidence": 1.0,
                "taxonomy": ["Root", "k__Bacteria", "p__Firmicutes", "c__Clostridia", "o__Clostridiales", "f__OnlyOnce2"]
            }
        }
    ],
    "shape": [5, 6],
    "type": "OTU table"
}""")
f.close()

In [2]:
from biom import load_table

t = load_table('test.biom')
for e in t.observation_metadata:
    print e['taxonomy']


[u'Root', u'k__Bacteria', u'p__Firmicutes', u'c__Clostridia', u'o__Clostridiales', u'f__Lachnospiraceae']
[u'Root', u'k__Bacteria', u'p__Firmicutes', u'c__Clostridia', u'o__Clostridiales', u'f__OnlyOnce1']
[u'Root', u'k__Bacteria', u'p__Firmicutes', u'c__Clostridia', u'o__Clostridiales', u'f__Lachnospiraceae']
[u'Root', u'k__Bacteria', u'p__Firmicutes', u'c__Clostridia', u'o__Clostridiales', u'f__Lachnospiraceae']
[u'Root', u'k__Bacteria', u'p__Firmicutes', u'c__Clostridia', u'o__Clostridiales', u'f__OnlyOnce2']

When collapsing with Table.collapse, there is only one resulting OTU. The two families that were associated with only one OTU each are dropped.


In [3]:
def collapse_on_family(id_, md): 
    return ';'.join(md['taxonomy'][:6])
t_biom_collapsed = t.collapse(collapse_on_family, axis='observation')

print t_biom_collapsed


# Constructed from biom file
#OTU ID	Sample1	Sample2	Sample3	Sample4	Sample5	Sample6
Root;k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae	0.666666666667	0.333333333333	1.0	1.33333333333	0.0	1.0

However, when we collapse with QIIME those are retained, which is the expected behavior.


In [4]:
!summarize_taxa.py -i test.biom -o summarize_taxa_out/
t_qiime_collapsed = load_table('summarize_taxa_out/test_L6.biom')

print t_qiime_collapsed


# Constructed from biom file
#OTU ID	Sample1	Sample2	Sample3	Sample4	Sample5	Sample6
Root;k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae	0.285714285714	0.333333333333	0.75	0.666666666667	0.0	0.75
Root;k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__OnlyOnce1	0.714285714286	0.333333333333	0.0	0.333333333333	1.0	0.25
Root;k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__OnlyOnce2	0.0	0.333333333333	0.25	0.0	0.0	0.0

Confirming that with an alternative collapsing function we observe the same incorrect result. Here I define collapse_f as in the BIOM docs here though note that there are some unrelated issues with that approach.


In [5]:
collapse_f = lambda id_, md: md['taxonomy'][5]
alt_t_biom_collapsed = t.collapse(collapse_f, axis='observation')
print alt_t_biom_collapsed


# Constructed from biom file
#OTU ID	Sample1	Sample2	Sample3	Sample4	Sample5	Sample6
f__Lachnospiraceae	0.666666666667	0.333333333333	1.0	1.33333333333	0.0	1.0

Solution

The solution here is to pass min_group_size=1. I think this should be the default (see here for further discussion).


In [6]:
collapse_f = lambda id_, md: md['taxonomy'][5]
alt_t_biom_collapsed = t.collapse(collapse_f, axis='observation', min_group_size=1)
print alt_t_biom_collapsed


# Constructed from biom file
#OTU ID	Sample1	Sample2	Sample3	Sample4	Sample5	Sample6
f__OnlyOnce2	0.0	1.0	1.0	0.0	0.0	0.0
f__Lachnospiraceae	0.666666666667	0.333333333333	1.0	1.33333333333	0.0	1.0
f__OnlyOnce1	5.0	1.0	0.0	2.0	3.0	1.0

For the records:


In [7]:
!print_qiime_config.py


System information
==================
         Platform:	darwin
   Python version:	2.7.1 (r271:86832, Aug 30 2012, 10:07:33)  [GCC 4.2.1 (Based on Apple Inc. build 5658) (LLVM build 2336.11.00)]
Python executable:	/Users/caporaso/.virtualenvs/qiime/bin/python

Dependency versions
===================
                        NumPy version:	1.8.1
                        SciPy version:	0.14.0
                   matplotlib version:	1.3.1
                  biom-format version:	2.0.1
                         qcli version:	0.1.0
                         pyqi version:	0.3.2
                   scikit-bio version:	0.1.3
                QIIME library version:	1.8.0-dev, master@0668ba7
                 QIIME script version:	1.8.0-dev
        PyNAST version (if installed):	1.2.2
                      Emperor version:	0.9.3
RDP Classifier version (if installed):	rdp_classifier-2.2.jar
          Java version (if installed):	1.6.0_65

QIIME config values
===================
                     blastmat_dir:	/Applications/blast-2.2.22/data/
                         sc_queue:	all.q
   template_alignment_lanemask_fp:	/Users/caporaso/data/greengenes_core_sets/lanemask_in_1s_and_0s.txt
     pynast_template_alignment_fp:	/Users/caporaso/data/greengenes_core_sets/core_set_aligned_imputed.fasta_11_8_07.no_dots
                 seconds_to_sleep:	1
pynast_template_alignment_blastdb:	None
assign_taxonomy_reference_seqs_fp:	/Users/caporaso/data/gg_13_8_otus/rep_set/97_otus.fasta
                     torque_queue:	friendlyq
      topiaryexplorer_project_dir:	/Users/caporaso/code/TopiaryExplorer-0.9.1/
                    jobs_to_start:	4
            denoiser_min_per_core:	50
                  cluster_jobs_fp:	start_parallel_jobs.py
assign_taxonomy_id_to_taxonomy_fp:	/Users/caporaso/data/gg_13_8_otus/taxonomy/97_otu_taxonomy.txt
                         temp_dir:	/Users/caporaso/temp
                      blastall_fp:	blastall

In [7]: