In [1]:
from __future__ import division
from os.path import join, expandvars
import gzip
import pandas as pd
from skbio.parse.sequences import parse_fasta
In [2]:
project_dir = expandvars("$HOME/data/short-read-tax-assignment")
data_dir = join(project_dir, "data")
reference_database_dir = expandvars("$HOME/data/")
In [3]:
num_iterations = 5
dataset_reference_combinations = []
for iteration in range(num_iterations):
dataset_reference_combinations.append(('B1-iter%d' % iteration, 'gg_13_8_otus'))
dataset_reference_combinations.append(('B2-iter%d' % iteration, 'gg_13_8_otus'))
dataset_reference_combinations.append(('F1-iter%d' % iteration, 'unite-97-rep-set'))
dataset_reference_combinations.append(('F2-iter%d' % iteration, 'unite-97-rep-set'))
reference_dbs = {'gg_13_8_otus' : (join(reference_database_dir, 'gg_13_8_otus/rep_set/97_otus.fasta'),
join(reference_database_dir, 'gg_13_8_otus/taxonomy/97_otu_taxonomy.txt')),
'unite-97-rep-set' : (join(reference_database_dir, 'unite-14.11/97_otus.fasta'),
join(reference_database_dir, 'unite-14.11/97_otu_taxonomy.txt'))}
In [4]:
def get_tax_ids(tax_fp, level=7, ids_to_keep=None):
result = set()
for line in open(tax_fp):
id_, tax = line.strip().split('\t')
if ids_to_keep is not None and id_ not in ids_to_keep:
continue
else:
t = tuple([e.strip() for e in tax.split(';')[:level]])
result.add(t)
return result
In [6]:
data = []
for level in range(1,8):
for e in dataset_reference_combinations:
all_tax_ids = get_tax_ids(reference_dbs[e[1]][1], level=level)
zipped_refseqs_fp = join(data_dir, 'simulated-community', e[0], 'ref.fna.gz')
ids_to_keep = []
for rec in parse_fasta(gzip.open(zipped_refseqs_fp, 'rb')):
ids_to_keep.append(rec[0])
ref_tax_ids = get_tax_ids(reference_dbs[e[1]][1], level=level, ids_to_keep=ids_to_keep)
dataset, iter_num = e[0].split('-')
data.append([dataset, iter_num, level, len(all_tax_ids), len(ref_tax_ids), 1. - (len(ref_tax_ids)/len(all_tax_ids))])
df = pd.DataFrame(data, columns=['Dataset', 'Iteration', 'Taxonomic level', 'All taxonomic ids', 'Partial reference taxonomic ids', 'Fraction of taxonomic ids not represented in reference'])
In [7]:
df
Out[7]:
In [12]:
df.groupby(['Dataset', 'Taxonomic level']).mean()
Out[12]:
In [8]:
df.to_csv(join(project_dir, 'ipynb', 'tables', 'partial-reference-database-content-summaries.csv'))
In [ ]: