Using this table from the short-read-tax-assignment repo, Jenya noticed that summarize_taxa.py gives different results from BIOM 1.x.x's collapseObservationsByMetadata. I previously confirmed that, but now also observe the issue with BIOM 2.0.1.
In [1]:
from biom import load_table
t1 = load_table('table.biom')
In [2]:
level = 6
def collapse_f(id_, md):
return ';'.join(md['taxonomy'][:level])
collapsed_t1 = t1.collapse(collapse_f, axis='observation')
len(collapsed_t1.observation_ids)
Out[2]:
In [3]:
!summarize_taxa.py -i table.biom -o summarize_taxa_out/
t2 = load_table('summarize_taxa_out/table_L6.biom')
len(t2.observation_ids)
Out[3]:
It turns out that you need to pass min_group_size=1 to get this to work as expected.
In [4]:
level = 6
def collapse_f(id_, md):
return ';'.join(md['taxonomy'][:level])
collapsed_t1 = t1.collapse(collapse_f, axis='observation', min_group_size=1)
len(collapsed_t1.observation_ids)
Out[4]:
Everything below here was experiments to try to figure this out...
In [5]:
from os.path import exists
if not exists('table.from_biom_w_taxonomy.txt'):
!biom convert -i table.biom -o table.from_biom_w_taxonomy.txt --to-tsv --header-key taxonomy
import pandas as pd
t3 = pd.read_csv('table.from_biom_w_taxonomy.txt', sep='\t', skiprows=2, index_col="otu-id", names=['otu-id', 'count', 'taxonomy'])
In [6]:
# All counts are greater than zero
len(t3[t3["count"] == 0])
Out[6]:
In [7]:
t3_taxa = []
for e in t3['taxonomy']:
t = [x.strip() for x in e.split(';')]
t3_taxa.append(';'.join(t[:level]))
print len(set(t3_taxa))
In [8]:
print set(t3_taxa) - set(t2.observation_ids)
print set(t2.observation_ids) - set(t3_taxa)
In [9]:
print set(collapsed_t1.observation_ids) - set(t2.observation_ids)
print set(t2.observation_ids) - set(collapsed_t1.observation_ids)
BIOM 2.0.1's collapseObservationsByMetadata is dropping some low abundance observations, while summarize_taxa.py keeps them. (Note: this behavior was the same in BIOM 1.x.x - data not shown, but ran this same analysis a couple of weeks ago with previous BIOM version.)
In [10]:
set(collapsed_t1.observation_ids) - set(t2.observation_ids)
Out[10]:
In [11]:
missing_ids = set(t2.observation_ids) - set(collapsed_t1.observation_ids)
missing_ids
Out[11]:
When taxa show up only one time, they are not collapsed correctly:
In [12]:
from collections import defaultdict
tax_to_ids = defaultdict(list)
for e in t3.index:
tax = t3['taxonomy'][e].replace(' ','')
try:
s_index = tax.index(';s__')
except ValueError:
continue
tax = tax[:s_index]
tax_to_ids[tax].append(e)
singles = []
for tax, ids in tax_to_ids.items():
if len(ids) == 1:
singles.append(tax)
print set(singles) - set(missing_ids)
print len(singles)
print tax_to_ids['k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Anaerostipes']
print len(t3.index)
print len(set(t3.index))
In [13]:
print t2
In [14]:
r = []
for m in missing_ids:
obs_idx = t2.index(m,axis='observation')
r.append((t2.get_value_by_ids(m,'MockMiSeq.even'), t2.observation_ids[obs_idx]))
r.sort()
for e in r:
print e
In [15]:
r = []
for e in list(t2.iter(axis='observation')):
r.append((e[0][0], e[1]))
r.sort()
for e in r:
print e
In [16]:
!print_qiime_config.py
In [16]:
In [17]:
f = open('test.biom','w')
f.write("""{
"columns": [
{
"id": "Sample1",
"metadata": {
"BarcodeSequence": "AGCACGAGCCTA",
"DOB": 20060805
}
},
{
"id": "Sample2",
"metadata": {
"BarcodeSequence": "AACTCGTCGATG",
"DOB": 20060216
}
},
{
"id": "Sample3",
"metadata": {
"BarcodeSequence": "ACAGACCACTCA",
"DOB": 20060109
}
},
{
"id": "Sample4",
"metadata": {
"BarcodeSequence": "ACCAGCGACTAG",
"DOB": 20070530
}
},
{
"id": "Sample5",
"metadata": {
"BarcodeSequence": "AGCAGCACTTGT",
"DOB": 20070101
}
},
{
"id": "Sample6",
"metadata": {
"BarcodeSequence": "AGCAGCACAACT",
"DOB": 20070716
}
}
],
"data": [
[0, 2, 1.0],
[1, 0, 5.0],
[1, 1, 1.0],
[1, 3, 2.0],
[1, 4, 3.0],
[1, 5, 1.0],
[2, 2, 1.0],
[2, 3, 4.0],
[2, 5, 2.0],
[3, 0, 2.0],
[3, 1, 1.0],
[3, 2, 1.0],
[3, 5, 1.0],
[4, 1, 1.0],
[4, 2, 1.0]
],
"date": "2012-12-11T07:30:29.870689",
"format": "Biological Observation Matrix 1.0.0",
"format_url": "http://biom-format.org",
"generated_by": "some software package",
"id": null,
"matrix_element_type": "float",
"matrix_type": "sparse",
"rows": [
{
"id": "GG_OTU_1",
"metadata": {
"confidence": 0.665,
"taxonomy": ["Root", "k__Bacteria", "p__Firmicutes", "c__Clostridia", "o__Clostridiales", "f__Lachnospiraceae"]
}
},
{
"id": "GG_OTU_2",
"metadata": {
"confidence": 0.98,
"taxonomy": ["Root", "k__Bacteria"]
}
},
{
"id": "GG_OTU_3",
"metadata": {
"confidence": 1.0,
"taxonomy": ["Root", "k__Bacteria", "p__Firmicutes", "c__Clostridia", "o__Clostridiales", "f__Lachnospiraceae"]
}
},
{
"id": "GG_OTU_4",
"metadata": {
"confidence": 0.842,
"taxonomy": ["Root", "k__Bacteria", "p__Firmicutes", "c__Clostridia", "o__Clostridiales", "f__Lachnospiraceae"]
}
},
{
"id": "GG_OTU_5",
"metadata": {
"confidence": 1.0,
"taxonomy": ["Root", "k__Bacteria", "p__Firmicutes", "c__Clostridia", "o__Clostridiales", "f__OnlyOnce"]
}
}
],
"shape": [5, 6],
"type": "OTU table"
}""")
f.close()
t4 = load_table('test.biom')
for e in t4.observation_metadata:
print e['taxonomy']
In [18]:
def collapse_on_family(id_, md):
return ';'.join(md['taxonomy'][:5])
collapsed_t4 = t4.collapse(collapse_f, axis='observation')
print collapsed_t4
In [18]: