In [1]:
# Write a small BIOM 1.0.0 (JSON, sparse) OTU table to test.biom.
# The table is 5 observations x 6 samples. Three OTUs (GG_OTU_1/3/4) share the
# family f__Lachnospiraceae, while GG_OTU_2 and GG_OTU_5 each carry a family
# that occurs exactly once (f__OnlyOnce1, f__OnlyOnce2).
# The `with` block guarantees the handle is flushed and closed even if the
# write raises, which the previous explicit open()/close() pair did not.
with open('test.biom', 'w') as f:
    f.write("""{
"columns": [
{
"id": "Sample1",
"metadata": {
"BarcodeSequence": "AGCACGAGCCTA",
"DOB": 20060805
}
},
{
"id": "Sample2",
"metadata": {
"BarcodeSequence": "AACTCGTCGATG",
"DOB": 20060216
}
},
{
"id": "Sample3",
"metadata": {
"BarcodeSequence": "ACAGACCACTCA",
"DOB": 20060109
}
},
{
"id": "Sample4",
"metadata": {
"BarcodeSequence": "ACCAGCGACTAG",
"DOB": 20070530
}
},
{
"id": "Sample5",
"metadata": {
"BarcodeSequence": "AGCAGCACTTGT",
"DOB": 20070101
}
},
{
"id": "Sample6",
"metadata": {
"BarcodeSequence": "AGCAGCACAACT",
"DOB": 20070716
}
}
],
"data": [
[0, 2, 1.0],
[1, 0, 5.0],
[1, 1, 1.0],
[1, 3, 2.0],
[1, 4, 3.0],
[1, 5, 1.0],
[2, 2, 1.0],
[2, 3, 4.0],
[2, 5, 2.0],
[3, 0, 2.0],
[3, 1, 1.0],
[3, 2, 1.0],
[3, 5, 1.0],
[4, 1, 1.0],
[4, 2, 1.0]
],
"date": "2012-12-11T07:30:29.870689",
"format": "Biological Observation Matrix 1.0.0",
"format_url": "http://biom-format.org",
"generated_by": "some software package",
"id": null,
"matrix_element_type": "float",
"matrix_type": "sparse",
"rows": [
{
"id": "GG_OTU_1",
"metadata": {
"confidence": 0.665,
"taxonomy": ["Root", "k__Bacteria", "p__Firmicutes", "c__Clostridia", "o__Clostridiales", "f__Lachnospiraceae"]
}
},
{
"id": "GG_OTU_2",
"metadata": {
"confidence": 0.98,
"taxonomy": ["Root", "k__Bacteria", "p__Firmicutes", "c__Clostridia", "o__Clostridiales", "f__OnlyOnce1"]
}
},
{
"id": "GG_OTU_3",
"metadata": {
"confidence": 1.0,
"taxonomy": ["Root", "k__Bacteria", "p__Firmicutes", "c__Clostridia", "o__Clostridiales", "f__Lachnospiraceae"]
}
},
{
"id": "GG_OTU_4",
"metadata": {
"confidence": 0.842,
"taxonomy": ["Root", "k__Bacteria", "p__Firmicutes", "c__Clostridia", "o__Clostridiales", "f__Lachnospiraceae"]
}
},
{
"id": "GG_OTU_5",
"metadata": {
"confidence": 1.0,
"taxonomy": ["Root", "k__Bacteria", "p__Firmicutes", "c__Clostridia", "o__Clostridiales", "f__OnlyOnce2"]
}
}
],
"shape": [5, 6],
"type": "OTU table"
}""")
In [2]:
from biom import load_table
# Load the BIOM file written in the first cell and show the lineage attached
# to each observation, so the family-level labels being collapsed are visible.
t = load_table('test.biom')
for e in t.observation_metadata:
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print(e['taxonomy'])
When collapsing with Table.collapse, there is only one resulting OTU. The two families that were associated with only one OTU each are dropped.
In [3]:
def collapse_on_family(id_, md, level=6):
    """Build the collapse key for one observation from its taxonomy.

    Written to match the ``f(id_, md)`` callback form passed to
    ``Table.collapse`` in this notebook.

    Parameters
    ----------
    id_ : str
        Observation identifier. Unused; present only to satisfy the callback
        signature.
    md : dict
        Observation metadata; must contain a ``'taxonomy'`` list of strings.
    level : int, optional
        Number of leading taxonomy ranks to keep. The default of 6 keeps
        ``Root`` through the family (``f__``) entry for the lineages used in
        this notebook.

    Returns
    -------
    str
        The retained ranks joined with ``';'``. Lineages shorter than
        ``level`` are returned in full.

    Raises
    ------
    KeyError
        If ``md`` has no ``'taxonomy'`` entry.
    """
    return ';'.join(md['taxonomy'][:level])
# Collapse observations by family using Table.collapse with its default
# settings. This call is intentionally left without min_group_size: the point
# of this cell is to show that families represented by a single OTU are
# missing from the printed result.
t_biom_collapsed = t.collapse(collapse_on_family, axis='observation')
# Single-argument print(...) produces the same output on Python 2 and 3.
print(t_biom_collapsed)
However, when we collapse with QIIME, the two single-OTU families are retained, which is the expected behavior.
In [4]:
!summarize_taxa.py -i test.biom -o summarize_taxa_out/
t_qiime_collapsed = load_table('summarize_taxa_out/test_L6.biom')
print t_qiime_collapsed
To confirm, an alternative collapsing function produces the same incorrect result. Here I define collapse_f as in the BIOM docs here, though note that there are some unrelated issues with that approach.
In [5]:
def collapse_f(id_, md):
    """Return the sixth taxonomy entry (index 5) as the collapse key.

    For the lineages used in this notebook that entry is the family
    (``f__...``) label. ``id_`` is unused and exists only to satisfy the
    ``f(id_, md)`` callback form passed to ``Table.collapse``.

    Raises ``IndexError`` if the lineage has fewer than six entries and
    ``KeyError`` if ``md`` has no ``'taxonomy'`` entry.
    """
    return md['taxonomy'][5]
# Repeat the collapse with the alternative key function, still using the
# default settings of Table.collapse, to check whether the single-OTU
# families are dropped here as well.
alt_t_biom_collapsed = t.collapse(collapse_f, axis='observation')
# Single-argument print(...) produces the same output on Python 2 and 3.
print(alt_t_biom_collapsed)
The solution here is to pass min_group_size=1. I think this should be the default (see here for further discussion).
In [6]:
# collapse_f is reused from the previous cell; redefining it verbatim here
# added nothing and risked the two copies drifting apart.
# Passing min_group_size=1 keeps groups that contain a single observation, so
# the families that occur only once are expected to appear in the output.
alt_t_biom_collapsed = t.collapse(collapse_f, axis='observation', min_group_size=1)
# Single-argument print(...) produces the same output on Python 2 and 3.
print(alt_t_biom_collapsed)
For the record:
In [7]:
# Shell out to QIIME's print_qiime_config.py so the environment used for the
# results above is captured in the notebook output.
# NOTE(review): presumably this reports installed versions and config values; confirm.
!print_qiime_config.py
In [7]: