In [15]:
from os.path import join, expandvars
from glob import glob
from qiime.util import count_seqs
import pandas as pd
In [16]:
# Location of the project checkout; "$HOME" is substituted from the environment
# by expandvars (and left as-is if the variable is not set).
_project_root = "$HOME/data/short-read-tax-assignment"
project_dir = expandvars(_project_root)

# The cells below glob for their input files underneath <project_dir>/data.
data_dir = join(project_dir, "data")
In [17]:
# Build a one-row-per-data-set summary of the mock-community representative
# sequence files (<data_dir>/mock-community/<data set>/rep_set.fna).
fasta_fps = glob(join(data_dir, 'mock-community', '*', 'rep_set.fna'))
data = []
for fp in fasta_fps:
    # The data set name is the directory that directly contains rep_set.fna.
    data_set = fp.split('/')[-2]
    # count_seqs is unpacked as (count, mean length, std of length) --
    # NOTE(review): confirm against the installed qiime.util version.
    seq_count, mean_length, std_length = count_seqs(fp)
    data.append([data_set, seq_count, mean_length, std_length])
df = pd.DataFrame(
    data,
    columns=['Dataset', 'Sequence count', 'Mean sequence length',
             'Standard deviation sequence length'])
# DataFrame.sort was removed from pandas (0.20); sort_values is its replacement.
# Only the displayed copy is sorted -- df keeps the order returned by glob.
df.sort_values('Dataset')
Out[17]:
In [18]:
# Build a one-row-per-data-set summary of the simulated-community representative
# sequence files (<data_dir>/simulated-community/<data set>/rep_set.fna).
fasta_fps = glob(join(data_dir, 'simulated-community', '*', 'rep_set.fna'))
data = []
for fp in fasta_fps:
    # The data set name is the directory that directly contains rep_set.fna.
    data_set = fp.split('/')[-2]
    # count_seqs is unpacked as (count, mean length, std of length) --
    # NOTE(review): confirm against the installed qiime.util version.
    seq_count, mean_length, std_length = count_seqs(fp)
    data.append([data_set, seq_count, mean_length, std_length])
df = pd.DataFrame(
    data,
    columns=['Dataset', 'Sequence count', 'Mean sequence length',
             'Standard deviation sequence length'])
# DataFrame.sort was removed from pandas (0.20); sort_values is its replacement.
# Only the displayed copy is sorted -- df keeps the order returned by glob.
df.sort_values('Dataset')
Out[18]: