In [1]:
from __future__ import division
from os.path import join, expandvars
import gzip

import pandas as pd
from skbio.parse.sequences import parse_fasta

In [2]:
# Filesystem layout for this analysis. NOTE(review): these are absolute
# $HOME-based paths — they assume the short-read-tax-assignment repo and the
# reference databases are checked out under $HOME/data; adjust if relocated.
project_dir = expandvars("$HOME/data/short-read-tax-assignment")
data_dir = join(project_dir, "data")  # simulated-community inputs live here
reference_database_dir = expandvars("$HOME/data/")  # parent dir of gg_13_8_otus/ and unite-14.11/

In [3]:
num_iterations = 5

# Pair each simulated community with its reference database: the two
# bacterial communities (B1, B2) use Greengenes, the two fungal communities
# (F1, F2) use UNITE. One entry per community per iteration, ordered
# B1, B2, F1, F2 within each iteration.
dataset_reference_combinations = [
    ('%s-iter%d' % (dataset_id, iteration), reference_id)
    for iteration in range(num_iterations)
    for dataset_id, reference_id in (('B1', 'gg_13_8_otus'),
                                     ('B2', 'gg_13_8_otus'),
                                     ('F1', 'unite-97-rep-set'),
                                     ('F2', 'unite-97-rep-set'))
]

# Map reference db id -> (representative sequences fasta, taxonomy map).
reference_dbs = {
    'gg_13_8_otus': (join(reference_database_dir, 'gg_13_8_otus/rep_set/97_otus.fasta'),
                     join(reference_database_dir, 'gg_13_8_otus/taxonomy/97_otu_taxonomy.txt')),
    'unite-97-rep-set': (join(reference_database_dir, 'unite-14.11/97_otus.fasta'),
                         join(reference_database_dir, 'unite-14.11/97_otu_taxonomy.txt')),
}

In [4]:
def get_tax_ids(tax_fp, level=7, ids_to_keep=None):
    """Return the set of unique taxonomy tuples in a taxonomy map file.

    Parameters
    ----------
    tax_fp : str
        Path to a tab-separated taxonomy map: one ``sequence id<TAB>taxonomy``
        line, where taxonomy is a semicolon-delimited lineage string.
    level : int, optional
        Number of leading taxonomic levels to retain (e.g. 1 = domain only,
        7 = full species-level lineage).
    ids_to_keep : iterable of str, optional
        If provided, only lines whose sequence id is in this collection
        contribute to the result; otherwise all lines are used.

    Returns
    -------
    set of tuple of str
        Unique taxonomy tuples, each truncated to ``level`` entries with
        surrounding whitespace stripped from every entry.
    """
    if ids_to_keep is not None:
        # Normalize to a set so per-line membership tests are O(1); callers
        # may pass a (large) list of sequence ids.
        ids_to_keep = set(ids_to_keep)
    result = set()
    # `with` ensures the file handle is closed (the original leaked it).
    with open(tax_fp) as tax_f:
        for line in tax_f:
            if not line.strip():
                continue  # tolerate blank/trailing lines
            id_, tax = line.strip().split('\t')
            if ids_to_keep is not None and id_ not in ids_to_keep:
                continue
            result.add(tuple(e.strip() for e in tax.split(';')[:level]))
    return result

In [6]:
# Build one row per (dataset, iteration, taxonomic level): the number of
# unique taxonomy strings at that level in the full reference vs. in the
# subset of reference sequences included in the simulated community, and the
# fraction of taxa missing from that partial reference.
data = []
# Full-reference tax id sets depend only on (reference db, level), not on the
# dataset/iteration, so cache them instead of re-parsing the taxonomy file
# for every one of the dataset x iteration combinations.
_all_tax_id_cache = {}
for level in range(1, 8):
    for dataset_id, reference_id in dataset_reference_combinations:
        tax_fp = reference_dbs[reference_id][1]
        cache_key = (reference_id, level)
        if cache_key not in _all_tax_id_cache:
            _all_tax_id_cache[cache_key] = get_tax_ids(tax_fp, level=level)
        all_tax_ids = _all_tax_id_cache[cache_key]
        zipped_refseqs_fp = join(data_dir, 'simulated-community', dataset_id,
                                 'ref.fna.gz')
        # Use a set for O(1) membership in get_tax_ids, and `with` so the
        # gzip handle is closed (the original leaked one per combination).
        with gzip.open(zipped_refseqs_fp, 'rb') as refseqs_f:
            ids_to_keep = {rec[0] for rec in parse_fasta(refseqs_f)}
        ref_tax_ids = get_tax_ids(tax_fp, level=level, ids_to_keep=ids_to_keep)
        dataset, iter_num = dataset_id.split('-')
        data.append([dataset, iter_num, level, len(all_tax_ids),
                     len(ref_tax_ids),
                     1. - (len(ref_tax_ids) / len(all_tax_ids))])

df = pd.DataFrame(data, columns=['Dataset', 'Iteration', 'Taxonomic level', 'All taxonomic ids', 'Partial reference taxonomic ids', 'Fraction of taxonomic ids not represented in reference'])

In [7]:
df


Out[7]:
Dataset Iteration Taxonomic level All taxonomic ids Partial reference taxonomic ids Fraction of taxonomic ids not represented in reference
0 B1 iter0 1 2 2 0.000000
1 B2 iter0 1 2 2 0.000000
2 F1 iter0 1 1 1 0.000000
3 F2 iter0 1 1 1 0.000000
4 B1 iter1 1 2 2 0.000000
5 B2 iter1 1 2 2 0.000000
6 F1 iter1 1 1 1 0.000000
7 F2 iter1 1 1 1 0.000000
8 B1 iter2 1 2 2 0.000000
9 B2 iter2 1 2 2 0.000000
10 F1 iter2 1 1 1 0.000000
11 F2 iter2 1 1 1 0.000000
12 B1 iter3 1 2 2 0.000000
13 B2 iter3 1 2 2 0.000000
14 F1 iter3 1 1 1 0.000000
15 F2 iter3 1 1 1 0.000000
16 B1 iter4 1 2 2 0.000000
17 B2 iter4 1 2 2 0.000000
18 F1 iter4 1 1 1 0.000000
19 F2 iter4 1 1 1 0.000000
20 B1 iter0 2 91 91 0.000000
21 B2 iter0 2 91 90 0.010989
22 F1 iter0 2 9 9 0.000000
23 F2 iter0 2 9 9 0.000000
24 B1 iter1 2 91 91 0.000000
25 B2 iter1 2 91 90 0.010989
26 F1 iter1 2 9 9 0.000000
27 F2 iter1 2 9 9 0.000000
28 B1 iter2 2 91 91 0.000000
29 B2 iter2 2 91 91 0.000000
... ... ... ... ... ... ...
110 F1 iter2 6 2771 2703 0.024540
111 F2 iter2 6 2771 2690 0.029231
112 B1 iter3 6 2930 2867 0.021502
113 B2 iter3 6 2930 2871 0.020137
114 F1 iter3 6 2771 2700 0.025623
115 F2 iter3 6 2771 2693 0.028149
116 B1 iter4 6 2930 2861 0.023549
117 B2 iter4 6 2930 2860 0.023891
118 F1 iter4 6 2771 2687 0.030314
119 F2 iter4 6 2771 2692 0.028510
120 B1 iter0 7 4163 4008 0.037233
121 B2 iter0 7 4163 4004 0.038194
122 F1 iter0 7 19611 18442 0.059609
123 F2 iter0 7 19611 18451 0.059150
124 B1 iter1 7 4163 4014 0.035791
125 B2 iter1 7 4163 3987 0.042277
126 F1 iter1 7 19611 18447 0.059354
127 F2 iter1 7 19611 18432 0.060119
128 B1 iter2 7 4163 4010 0.036752
129 B2 iter2 7 4163 4011 0.036512
130 F1 iter2 7 19611 18519 0.055683
131 F2 iter2 7 19611 18519 0.055683
132 B1 iter3 7 4163 4003 0.038434
133 B2 iter3 7 4163 4020 0.034350
134 F1 iter3 7 19611 18459 0.058743
135 F2 iter3 7 19611 18487 0.057315
136 B1 iter4 7 4163 3995 0.040356
137 B2 iter4 7 4163 4019 0.034590
138 F1 iter4 7 19611 18458 0.058794
139 F2 iter4 7 19611 18509 0.056193

140 rows × 6 columns


In [12]:
df.groupby(['Dataset', 'Taxonomic level']).mean()


Out[12]:
All taxonomic ids Partial reference taxonomic ids Fraction of taxonomic ids not represented in reference
Dataset Taxonomic level
B1 1 2 2.0 0.000000
2 91 91.0 0.000000
3 319 318.8 0.000627
4 664 662.2 0.002711
5 1116 1112.0 0.003584
6 2930 2862.6 0.023003
7 4163 4006.0 0.037713
B2 1 2 2.0 0.000000
2 91 90.6 0.004396
3 319 318.0 0.003135
4 664 662.2 0.002711
5 1116 1111.8 0.003763
6 2930 2864.4 0.022389
7 4163 4008.2 0.037185
F1 1 1 1.0 0.000000
2 9 9.0 0.000000
3 43 42.0 0.023256
4 158 155.6 0.015190
5 540 529.4 0.019630
6 2771 2689.8 0.029304
7 19611 18465.0 0.058437
F2 1 1 1.0 0.000000
2 9 9.0 0.000000
3 43 42.6 0.009302
4 158 157.4 0.003797
5 540 533.2 0.012593
6 2771 2693.0 0.028149
7 19611 18479.6 0.057692

In [8]:
df.to_csv(join(project_dir, 'ipynb', 'tables', 'partial-reference-database-content-summaries.csv'))

In [ ]: