In [15]:
from glob import glob
from os.path import basename, dirname, expandvars, join

import pandas as pd
from qiime.util import count_seqs

In [16]:
# Location of the short-read-tax-assignment project directory; expandvars
# substitutes $HOME from the environment.
project_root_template = "$HOME/data/short-read-tax-assignment"
project_dir = expandvars(project_root_template)

# "data" subdirectory of the project directory.
data_dir = join(project_dir, "data")

In [17]:
# Build a per-data-set summary table from the rep_set.fna file found in each
# sub-directory of <data_dir>/mock-community.
fasta_fps = glob(join(data_dir, 'mock-community', '*', 'rep_set.fna'))

data = []
for fp in fasta_fps:
    # The data set ID is the name of the directory that contains rep_set.fna;
    # basename(dirname(...)) avoids hard-coding '/' as the path separator.
    data_set = basename(dirname(fp))
    # count_seqs' result is unpacked as (count, mean length, std of length).
    seq_count, mean_length, std_length = count_seqs(fp)
    data.append([data_set, seq_count, mean_length, std_length])

df = pd.DataFrame(data, columns=['Dataset', 'Sequence count', 'Mean sequence length', 'Standard deviation sequence length'])
# DataFrame.sort() was deprecated and later removed from pandas;
# sort_values() is its replacement and also returns a sorted copy, so `df`
# itself keeps the glob order and only the displayed table is sorted.
df.sort_values('Dataset')


Out[17]:
Dataset Sequence count Mean sequence length Standard deviation sequence length
9 B1 1569 98.621415 6.299267
0 B2 3545 149.989845 0.604551
4 B3 70 192.414286 2.964036
7 B4 13852 150.997329 0.314362
5 B5 7422 235.707895 17.829497
3 B6 4915 92.126755 8.404354
1 B7 2106 98.529915 4.472487
2 B8 2186 97.685270 4.947828
8 F1 94 97.414894 6.536981
6 F2 145 95.579310 7.074645

In [18]:
# Build a per-data-set summary table from the rep_set.fna file found in each
# sub-directory of <data_dir>/simulated-community.
fasta_fps = glob(join(data_dir, 'simulated-community', '*', 'rep_set.fna'))

data = []
for fp in fasta_fps:
    # The data set ID is the name of the directory that contains rep_set.fna;
    # basename(dirname(...)) avoids hard-coding '/' as the path separator.
    data_set = basename(dirname(fp))
    # count_seqs' result is unpacked as (count, mean length, std of length).
    seq_count, mean_length, std_length = count_seqs(fp)
    data.append([data_set, seq_count, mean_length, std_length])

df = pd.DataFrame(data, columns=['Dataset', 'Sequence count', 'Mean sequence length', 'Standard deviation sequence length'])
# DataFrame.sort() was deprecated and later removed from pandas;
# sort_values() is its replacement and also returns a sorted copy, so `df`
# itself keeps the glob order and only the displayed table is sorted.
df.sort_values('Dataset')


Out[18]:
Dataset Sequence count Mean sequence length Standard deviation sequence length
1 B1-iter0 9983 302.804568 88.869502
13 B1-iter1 9818 301.616623 84.223041
7 B1-iter2 9874 302.437513 87.613960
2 B1-iter3 9785 302.619724 88.376050
5 B1-iter4 9808 303.111847 89.845702
15 B2-iter0 9750 99.995487 0.445582
18 B2-iter1 9849 99.980100 1.396444
4 B2-iter2 9850 99.980102 1.396373
11 B2-iter3 9815 99.985634 1.075024
16 B2-iter4 9978 99.970435 1.704882
14 F1-iter0 2078 223.153032 114.433364
6 F1-iter1 2076 223.349229 118.296470
0 F1-iter2 1942 223.923790 133.617047
12 F1-iter3 2059 223.295289 119.590061
19 F1-iter4 2052 224.764620 123.799025
8 F2-iter0 2044 95.110568 20.842375
10 F2-iter1 2086 95.406520 20.100100
17 F2-iter2 1989 96.076420 18.631193
9 F2-iter3 2024 95.201581 20.500142
3 F2-iter4 2001 93.942529 23.092024