In [1]:

    
from glob import glob
from os.path import split, splitext
from collections import defaultdict

import pandas as pd
import numpy as np
from scipy import stats
from skbio.draw.distributions import boxplots

data_set_ids = ['88-soils', 'whole-body', 'moving-pictures']
order = ['uc','ucr','ucrC','ucrss','ucrss_wfilter',
         'uc_fast','ucr_fast','ucrC_fast','ucrss_fast','ucrss_fast_wfilter']

method_descriptions = pd.io.parsers.read_csv('raw-tables/method-descriptions.tsv', sep='\t', index_col=0)

write_xls_files = False

Table 1: Method descriptions



In [2]:

    
if write_xls_files: method_descriptions.to_excel('tables/table1.xlsx', na_rep='NA', float_format="%1.3f")
method_descriptions









    Out[2]:






  
    
      
      title
      command
      max_accepts
      max_rejects
      stepwords
      wordlength
      prefilter_percent_id
      min_otu_size
      speed_mode
      processors
      reference_percent_id
      subsample_fraction
    
    
      abbreviation
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      uc
                                                 De novo
                pick_de_novo_otus.py
       20
       500
       20
       12
       NaN
      NaN
       slow
        1
       0.97
         NaN
    
    
      ucr
                                   Legacy open reference
                pick_de_novo_otus.py
       20
       500
       20
       12
       NaN
      NaN
       slow
       10
       0.97
         NaN
    
    
      ucrC
                                        Closed reference
       pick_closed_reference_otus.py
       20
       500
       20
       12
       NaN
      NaN
       slow
       10
       0.97
         NaN
    
    
      ucrss
                               Subsampled open reference
         pick_open_reference_otus.py
       20
       500
       20
       12
       0.0
        1
       slow
       10
       0.97
       0.001
    
    
      ucrss_wfilter
                     Subsampled open reference, filtered
         pick_open_reference_otus.py
       20
       500
       20
       12
       0.6
        1
       slow
       10
       0.97
       0.001
    
    
      uc_fast
                                  De novo, fast settings
                pick_de_novo_otus.py
        1
         8
        8
        8
       NaN
      NaN
       fast
        1
       0.97
         NaN
    
    
      ucr_fast
                    Legacy open reference, fast settings
                pick_de_novo_otus.py
        1
         8
        8
        8
       NaN
      NaN
       fast
       10
       0.97
         NaN
    
    
      ucrC_fast
                         Closed reference, fast settings
       pick_closed_reference_otus.py
        1
         8
        8
        8
       NaN
      NaN
       fast
       10
       0.97
         NaN
    
    
      ucrss_fast
                Subsampled open reference, fast settings
         pick_open_reference_otus.py
        1
         8
        8
        8
       0.0
        1
       fast
       10
       0.97
       0.001
    
    
      ucrss_fast_wfilter
       Subsampled open reference, filtered, fast sett...
         pick_open_reference_otus.py
        1
         8
        8
        8
       0.6
        1
       fast
       10
       0.97
       0.001
    
    
      ucr_fast_O29_r82
       Legacy open reference, fast settings, 82% refe...
                pick_de_novo_otus.py
        1
         8
        8
        8
       0.0
        1
       fast
       29
       0.82
       0.001
    
    
      ucr_fast_O29_r97
       Legacy open reference, fast settings, 29 proce...
                pick_de_novo_otus.py
        1
         8
        8
        8
       0.0
        1
       fast
       29
       0.97
       0.001
    
    
      ucrss_fast_O29_r82
       Subsampled open reference, fast settings, 82% ...
         pick_open_reference_otus.py
        1
         8
        8
        8
       0.0
        1
       fast
       29
       0.82
       0.001
    
    
      ucrss_fast_O29_r97
       Subsampled open reference, fast settings, 29 p...
         pick_open_reference_otus.py
        1
         8
        8
        8
       0.0
        1
       fast
       29
       0.97
       0.001
    
    
      ucrss_fast_O29_s1
       Subsampled open reference, fast settings, 29 p...
         pick_open_reference_otus.py
        1
         8
        8
        8
       0.0
        1
       fast
       29
       0.97
       0.100

Table 2: Alpha diversity correlations by method



In [3]:

    
adiv_fp_sets = [glob('%s-otus/*/adiv*txt' % id_) for id_ in data_set_ids]
adiv_results = {}
for data_set_id, adiv_fp_set in zip(data_set_ids, adiv_fp_sets):
    df = None
    for fp in adiv_fp_set:
        run_id = fp.split('/')[1]
        new_df = pd.DataFrame.from_csv(fp, sep='\t')
        new_df.columns = ['%s.%s' % (c.lower(), run_id) for c in new_df.columns]
        if df is None:
            df = new_df
        else:
            df = pd.merge(df, new_df,right_index=True,left_index=True)
    for metric in ['pd', 'observed_species']:
        id_ = (data_set_id,metric)
        # this loc-based indexing seems clunky - there must be a better way to 
        # do this in pandas (basically, select all columns where the name contains
        # a string)
        corr_table = df.loc[:,[c for c in df.columns if metric in c]].corr()
        # rename and re-order the rows and columns
        corr_table.index = [c.split('.')[1] for c in corr_table.index]
        corr_table.columns = [c.split('.')[1] for c in corr_table.columns]
        corr_table = corr_table.reindex_axis(order,axis=0).reindex_axis(order,axis=1)
        adiv_results[id_] = corr_table



In [4]:

    
if write_xls_files: adiv_results[('88-soils', 'pd')].to_excel('tables/table2a.xlsx', na_rep='NA', float_format="%1.3f")
adiv_results[('88-soils', 'pd')]









    Out[4]:






  
    
      
      uc
      ucr
      ucrC
      ucrss
      ucrss_wfilter
      uc_fast
      ucr_fast
      ucrC_fast
      ucrss_fast
      ucrss_fast_wfilter
    
  
  
    
      uc
       1.000000
       0.950810
       0.932749
       0.933510
       0.952565
       0.955606
       0.936157
       0.926662
       0.947673
       0.946606
    
    
      ucr
       0.950810
       1.000000
       0.902187
       0.930701
       0.930193
       0.945996
       0.940277
       0.903199
       0.952477
       0.943609
    
    
      ucrC
       0.932749
       0.902187
       1.000000
       0.893631
       0.908745
       0.905111
       0.914053
       0.977595
       0.902405
       0.910766
    
    
      ucrss
       0.933510
       0.930701
       0.893631
       1.000000
       0.929412
       0.944127
       0.935030
       0.893611
       0.947867
       0.948666
    
    
      ucrss_wfilter
       0.952565
       0.930193
       0.908745
       0.929412
       1.000000
       0.951712
       0.933274
       0.902546
       0.930598
       0.943046
    
    
      uc_fast
       0.955606
       0.945996
       0.905111
       0.944127
       0.951712
       1.000000
       0.952704
       0.898282
       0.956454
       0.960493
    
    
      ucr_fast
       0.936157
       0.940277
       0.914053
       0.935030
       0.933274
       0.952704
       1.000000
       0.914263
       0.950392
       0.952351
    
    
      ucrC_fast
       0.926662
       0.903199
       0.977595
       0.893611
       0.902546
       0.898282
       0.914263
       1.000000
       0.902216
       0.902507
    
    
      ucrss_fast
       0.947673
       0.952477
       0.902405
       0.947867
       0.930598
       0.956454
       0.950392
       0.902216
       1.000000
       0.962338
    
    
      ucrss_fast_wfilter
       0.946606
       0.943609
       0.910766
       0.948666
       0.943046
       0.960493
       0.952351
       0.902507
       0.962338
       1.000000



In [5]:

    
if write_xls_files: adiv_results[('moving-pictures', 'pd')].to_excel('tables/table2b.xlsx', na_rep='NA', float_format="%1.3f")
adiv_results[('moving-pictures', 'pd')]









    Out[5]:






  
    
      
      uc
      ucr
      ucrC
      ucrss
      ucrss_wfilter
      uc_fast
      ucr_fast
      ucrC_fast
      ucrss_fast
      ucrss_fast_wfilter
    
  
  
    
      uc
       1.000000
       0.995736
       0.992960
       0.995918
       0.995596
       0.994955
       0.995661
       0.992341
       0.995695
       0.995864
    
    
      ucr
       0.995736
       1.000000
       0.992644
       0.996527
       0.997235
       0.995112
       0.995891
       0.992371
       0.995859
       0.996714
    
    
      ucrC
       0.992960
       0.992644
       1.000000
       0.994053
       0.991472
       0.994132
       0.993919
       0.997675
       0.994638
       0.994202
    
    
      ucrss
       0.995918
       0.996527
       0.994053
       1.000000
       0.996130
       0.996035
       0.996789
       0.993736
       0.996859
       0.997034
    
    
      ucrss_wfilter
       0.995596
       0.997235
       0.991472
       0.996130
       1.000000
       0.994251
       0.995429
       0.991241
       0.995604
       0.996436
    
    
      uc_fast
       0.994955
       0.995112
       0.994132
       0.996035
       0.994251
       1.000000
       0.996530
       0.994057
       0.996510
       0.996299
    
    
      ucr_fast
       0.995661
       0.995891
       0.993919
       0.996789
       0.995429
       0.996530
       1.000000
       0.993591
       0.996812
       0.996927
    
    
      ucrC_fast
       0.992341
       0.992371
       0.997675
       0.993736
       0.991241
       0.994057
       0.993591
       1.000000
       0.994279
       0.993998
    
    
      ucrss_fast
       0.995695
       0.995859
       0.994638
       0.996859
       0.995604
       0.996510
       0.996812
       0.994279
       1.000000
       0.996822
    
    
      ucrss_fast_wfilter
       0.995864
       0.996714
       0.994202
       0.997034
       0.996436
       0.996299
       0.996927
       0.993998
       0.996822
       1.000000



In [6]:

    
if write_xls_files: adiv_results[('whole-body', 'pd')].to_excel('tables/table2c.xlsx', na_rep='NA', float_format="%1.3f")
adiv_results[('whole-body', 'pd')]









    Out[6]:






  
    
      
      uc
      ucr
      ucrC
      ucrss
      ucrss_wfilter
      uc_fast
      ucr_fast
      ucrC_fast
      ucrss_fast
      ucrss_fast_wfilter
    
  
  
    
      uc
       1.000000
       0.985137
       0.957040
       0.984981
       0.984775
       0.984090
       0.985665
       0.961456
       0.982975
       0.984321
    
    
      ucr
       0.985137
       1.000000
       0.955564
       0.989950
       0.988868
       0.987748
       0.987070
       0.959910
       0.987496
       0.985933
    
    
      ucrC
       0.957040
       0.955564
       1.000000
       0.961397
       0.958381
       0.958510
       0.960640
       0.990411
       0.952869
       0.961033
    
    
      ucrss
       0.984981
       0.989950
       0.961397
       1.000000
       0.990865
       0.987546
       0.989638
       0.964492
       0.989253
       0.987182
    
    
      ucrss_wfilter
       0.984775
       0.988868
       0.958381
       0.990865
       1.000000
       0.985185
       0.988750
       0.963320
       0.987431
       0.984912
    
    
      uc_fast
       0.984090
       0.987748
       0.958510
       0.987546
       0.985185
       1.000000
       0.986204
       0.960893
       0.986490
       0.985077
    
    
      ucr_fast
       0.985665
       0.987070
       0.960640
       0.989638
       0.988750
       0.986204
       1.000000
       0.964924
       0.987977
       0.988788
    
    
      ucrC_fast
       0.961456
       0.959910
       0.990411
       0.964492
       0.963320
       0.960893
       0.964924
       1.000000
       0.957054
       0.965236
    
    
      ucrss_fast
       0.982975
       0.987496
       0.952869
       0.989253
       0.987431
       0.986490
       0.987977
       0.957054
       1.000000
       0.986223
    
    
      ucrss_fast_wfilter
       0.984321
       0.985933
       0.961033
       0.987182
       0.984912
       0.985077
       0.988788
       0.965236
       0.986223
       1.000000



In [7]:

    
if write_xls_files: adiv_results[('88-soils', 'observed_species')].to_excel('tables/table2d.xlsx', na_rep='NA', float_format="%1.3f")
adiv_results[('88-soils', 'observed_species')]









    Out[7]:






  
    
      
      uc
      ucr
      ucrC
      ucrss
      ucrss_wfilter
      uc_fast
      ucr_fast
      ucrC_fast
      ucrss_fast
      ucrss_fast_wfilter
    
  
  
    
      uc
       1.000000
       0.947875
       0.879873
       0.909483
       0.924238
       0.935259
       0.933534
       0.876867
       0.924938
       0.912961
    
    
      ucr
       0.947875
       1.000000
       0.904662
       0.946052
       0.947025
       0.947314
       0.952570
       0.902627
       0.938155
       0.932401
    
    
      ucrC
       0.879873
       0.904662
       1.000000
       0.925964
       0.888274
       0.882334
       0.907653
       0.972994
       0.910432
       0.895664
    
    
      ucrss
       0.909483
       0.946052
       0.925964
       1.000000
       0.931844
       0.923371
       0.934916
       0.915025
       0.930894
       0.928529
    
    
      ucrss_wfilter
       0.924238
       0.947025
       0.888274
       0.931844
       1.000000
       0.943087
       0.946177
       0.884305
       0.932178
       0.926691
    
    
      uc_fast
       0.935259
       0.947314
       0.882334
       0.923371
       0.943087
       1.000000
       0.942244
       0.882958
       0.940951
       0.939832
    
    
      ucr_fast
       0.933534
       0.952570
       0.907653
       0.934916
       0.946177
       0.942244
       1.000000
       0.908362
       0.942923
       0.931863
    
    
      ucrC_fast
       0.876867
       0.902627
       0.972994
       0.915025
       0.884305
       0.882958
       0.908362
       1.000000
       0.904095
       0.905611
    
    
      ucrss_fast
       0.924938
       0.938155
       0.910432
       0.930894
       0.932178
       0.940951
       0.942923
       0.904095
       1.000000
       0.952921
    
    
      ucrss_fast_wfilter
       0.912961
       0.932401
       0.895664
       0.928529
       0.926691
       0.939832
       0.931863
       0.905611
       0.952921
       1.000000



In [8]:

    
if write_xls_files: adiv_results[('moving-pictures', 'observed_species')].to_excel('tables/table2e.xlsx', na_rep='NA', float_format="%1.3f")
adiv_results[('moving-pictures', 'observed_species')]









    Out[8]:






  
    
      
      uc
      ucr
      ucrC
      ucrss
      ucrss_wfilter
      uc_fast
      ucr_fast
      ucrC_fast
      ucrss_fast
      ucrss_fast_wfilter
    
  
  
    
      uc
       1.000000
       0.992461
       0.983973
       0.992078
       0.992313
       0.988647
       0.990092
       0.978429
       0.989429
       0.989610
    
    
      ucr
       0.992461
       1.000000
       0.994303
       0.998253
       0.998186
       0.991920
       0.996989
       0.990556
       0.996875
       0.997030
    
    
      ucrC
       0.983973
       0.994303
       1.000000
       0.994695
       0.994748
       0.983908
       0.993300
       0.996874
       0.993777
       0.993611
    
    
      ucrss
       0.992078
       0.998253
       0.994695
       1.000000
       0.998285
       0.991935
       0.997188
       0.991311
       0.997034
       0.997209
    
    
      ucrss_wfilter
       0.992313
       0.998186
       0.994748
       0.998285
       1.000000
       0.991651
       0.997157
       0.991281
       0.996954
       0.997097
    
    
      uc_fast
       0.988647
       0.991920
       0.983908
       0.991935
       0.991651
       1.000000
       0.992937
       0.980686
       0.991777
       0.991937
    
    
      ucr_fast
       0.990092
       0.996989
       0.993300
       0.997188
       0.997157
       0.992937
       1.000000
       0.992169
       0.997906
       0.998061
    
    
      ucrC_fast
       0.978429
       0.990556
       0.996874
       0.991311
       0.991281
       0.980686
       0.992169
       1.000000
       0.992683
       0.992397
    
    
      ucrss_fast
       0.989429
       0.996875
       0.993777
       0.997034
       0.996954
       0.991777
       0.997906
       0.992683
       1.000000
       0.997988
    
    
      ucrss_fast_wfilter
       0.989610
       0.997030
       0.993611
       0.997209
       0.997097
       0.991937
       0.998061
       0.992397
       0.997988
       1.000000



In [9]:

    
if write_xls_files: adiv_results[('whole-body', 'observed_species')].to_excel('tables/table2f.xlsx', na_rep='NA', float_format="%1.3f")
adiv_results[('whole-body', 'observed_species')]









    Out[9]:






  
    
      
      uc
      ucr
      ucrC
      ucrss
      ucrss_wfilter
      uc_fast
      ucr_fast
      ucrC_fast
      ucrss_fast
      ucrss_fast_wfilter
    
  
  
    
      uc
       1.000000
       0.986013
       0.970654
       0.986054
       0.986288
       0.992712
       0.987933
       0.971591
       0.988239
       0.987425
    
    
      ucr
       0.986013
       1.000000
       0.983916
       0.995433
       0.995315
       0.986973
       0.992700
       0.979658
       0.992898
       0.992813
    
    
      ucrC
       0.970654
       0.983916
       1.000000
       0.984706
       0.983561
       0.970210
       0.980584
       0.992099
       0.980124
       0.979129
    
    
      ucrss
       0.986054
       0.995433
       0.984706
       1.000000
       0.995365
       0.986947
       0.992549
       0.980568
       0.992644
       0.992455
    
    
      ucrss_wfilter
       0.986288
       0.995315
       0.983561
       0.995365
       1.000000
       0.986130
       0.992597
       0.979232
       0.991829
       0.992433
    
    
      uc_fast
       0.992712
       0.986973
       0.970210
       0.986947
       0.986130
       1.000000
       0.988851
       0.971697
       0.989586
       0.988207
    
    
      ucr_fast
       0.987933
       0.992700
       0.980584
       0.992549
       0.992597
       0.988851
       1.000000
       0.981125
       0.994334
       0.994478
    
    
      ucrC_fast
       0.971591
       0.979658
       0.992099
       0.980568
       0.979232
       0.971697
       0.981125
       1.000000
       0.981586
       0.979289
    
    
      ucrss_fast
       0.988239
       0.992898
       0.980124
       0.992644
       0.991829
       0.989586
       0.994334
       0.981586
       1.000000
       0.994629
    
    
      ucrss_fast_wfilter
       0.987425
       0.992813
       0.979129
       0.992455
       0.992433
       0.988207
       0.994478
       0.979289
       0.994629
       1.000000

Table 3: Beta diversity correlation by method



In [10]:

    
bdiv_results = {}
for metric in ['unweighted_unifrac', 'weighted_unifrac']:
    for data_set_id in data_set_ids:
        id_ = (data_set_id,metric)
        fp = '%s-otus/%s_mantel_results/mantel_results.txt' % id_
        df = pd.DataFrame.from_csv(fp, sep='\t', header=4, index_col=False)
        df = df.pivot(index='DM1', columns='DM2', values='Mantel r statistic')
        # re-name and re-order rows and columns
        df.index = [e.split('/')[1] for e in df.index]
        df.columns = [e.split('/')[1] for e in df.columns]
        df = df.reindex_axis(order,axis=0).reindex_axis(order,axis=1)
        bdiv_results[id_] = df



In [11]:

    
if write_xls_files: bdiv_results[('88-soils', 'unweighted_unifrac')].to_excel('tables/table3a.xlsx', na_rep='NA', float_format="%1.3f")
bdiv_results[('88-soils', 'unweighted_unifrac')]









    Out[11]:






  
    
      
      uc
      ucr
      ucrC
      ucrss
      ucrss_wfilter
      uc_fast
      ucr_fast
      ucrC_fast
      ucrss_fast
      ucrss_fast_wfilter
    
  
  
    
      uc
      NaN
       0.93487
       0.90823
       0.94364
       0.94208
       0.93926
       0.94472
       0.90865
       0.94259
       0.94149
    
    
      ucr
      NaN
           NaN
       0.91458
       0.93982
       0.94465
       0.93425
       0.94212
       0.91776
       0.94393
       0.94881
    
    
      ucrC
      NaN
           NaN
           NaN
       0.91656
       0.90959
       0.92586
       0.91308
       0.94977
       0.91709
       0.91966
    
    
      ucrss
      NaN
           NaN
           NaN
           NaN
       0.93954
       0.93791
       0.94464
       0.91370
       0.93830
       0.94230
    
    
      ucrss_wfilter
      NaN
           NaN
           NaN
           NaN
           NaN
       0.93439
       0.94291
       0.90714
       0.94156
       0.94136
    
    
      uc_fast
      NaN
           NaN
           NaN
           NaN
           NaN
           NaN
       0.93829
       0.91969
       0.93888
       0.94116
    
    
      ucr_fast
      NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
       0.90853
       0.94642
       0.94651
    
    
      ucrC_fast
      NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
       0.91736
       0.92433
    
    
      ucrss_fast
      NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
       0.94494
    
    
      ucrss_fast_wfilter
      NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN



In [12]:

    
if write_xls_files: bdiv_results[('moving-pictures', 'unweighted_unifrac')].to_excel('tables/table3b.xlsx', na_rep='NA', float_format="%1.3f")
bdiv_results[('moving-pictures', 'unweighted_unifrac')]









    Out[12]:






  
    
      
      uc
      ucr
      ucrC
      ucrss
      ucrss_wfilter
      uc_fast
      ucr_fast
      ucrC_fast
      ucrss_fast
      ucrss_fast_wfilter
    
  
  
    
      uc
      NaN
       0.99196
       0.97423
       0.98823
       0.98824
       0.99195
       0.99050
       0.97744
       0.99128
       0.99194
    
    
      ucr
      NaN
           NaN
       0.98180
       0.99162
       0.99146
       0.99107
       0.99236
       0.98375
       0.99256
       0.99300
    
    
      ucrC
      NaN
           NaN
           NaN
       0.98624
       0.98470
       0.97287
       0.98162
       0.99387
       0.98050
       0.98086
    
    
      ucrss
      NaN
           NaN
           NaN
           NaN
       0.98983
       0.98847
       0.99174
       0.98729
       0.99157
       0.99134
    
    
      ucrss_wfilter
      NaN
           NaN
           NaN
           NaN
           NaN
       0.98587
       0.98971
       0.98566
       0.98950
       0.99055
    
    
      uc_fast
      NaN
           NaN
           NaN
           NaN
           NaN
           NaN
       0.99087
       0.97569
       0.99175
       0.99148
    
    
      ucr_fast
      NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
       0.98330
       0.99293
       0.99220
    
    
      ucrC_fast
      NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
       0.98231
       0.98313
    
    
      ucrss_fast
      NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
       0.99275
    
    
      ucrss_fast_wfilter
      NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN



In [13]:

    
if write_xls_files: bdiv_results[('whole-body', 'unweighted_unifrac')].to_excel('tables/table3c.xlsx', na_rep='NA', float_format="%1.3f")
bdiv_results[('whole-body', 'unweighted_unifrac')]









    Out[13]:






  
    
      
      uc
      ucr
      ucrC
      ucrss
      ucrss_wfilter
      uc_fast
      ucr_fast
      ucrC_fast
      ucrss_fast
      ucrss_fast_wfilter
    
  
  
    
      uc
      NaN
       0.93502
       0.89081
       0.93757
       0.93633
       0.92982
       0.92633
       0.88941
       0.93279
       0.92547
    
    
      ucr
      NaN
           NaN
       0.89946
       0.94835
       0.95012
       0.93418
       0.93139
       0.89533
       0.94144
       0.92738
    
    
      ucrC
      NaN
           NaN
           NaN
       0.90765
       0.89852
       0.87809
       0.88532
       0.95207
       0.89723
       0.87790
    
    
      ucrss
      NaN
           NaN
           NaN
           NaN
       0.95252
       0.93798
       0.93623
       0.90499
       0.94548
       0.92768
    
    
      ucrss_wfilter
      NaN
           NaN
           NaN
           NaN
           NaN
       0.93661
       0.93996
       0.89442
       0.94125
       0.93176
    
    
      uc_fast
      NaN
           NaN
           NaN
           NaN
           NaN
           NaN
       0.94199
       0.87167
       0.93906
       0.93754
    
    
      ucr_fast
      NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
       0.88760
       0.93944
       0.94811
    
    
      ucrC_fast
      NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
       0.89139
       0.87890
    
    
      ucrss_fast
      NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
       0.93340
    
    
      ucrss_fast_wfilter
      NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN



In [14]:

    
if write_xls_files: bdiv_results[('88-soils', 'weighted_unifrac')].to_excel('tables/table3d.xlsx', na_rep='NA', float_format="%1.3f")
bdiv_results[('88-soils', 'weighted_unifrac')]









    Out[14]:






  
    
      
      uc
      ucr
      ucrC
      ucrss
      ucrss_wfilter
      uc_fast
      ucr_fast
      ucrC_fast
      ucrss_fast
      ucrss_fast_wfilter
    
  
  
    
      uc
      NaN
       0.89611
       0.93569
       0.95121
       0.90142
       0.92525
       0.93708
       0.92377
       0.95616
       0.90235
    
    
      ucr
      NaN
           NaN
       0.89623
       0.88943
       0.96587
       0.89103
       0.93933
       0.89456
       0.90146
       0.94684
    
    
      ucrC
      NaN
           NaN
           NaN
       0.91903
       0.91355
       0.90626
       0.92836
       0.98438
       0.93128
       0.89627
    
    
      ucrss
      NaN
           NaN
           NaN
           NaN
       0.89965
       0.91691
       0.94744
       0.90325
       0.94914
       0.89874
    
    
      ucrss_wfilter
      NaN
           NaN
           NaN
           NaN
           NaN
       0.88544
       0.93761
       0.91087
       0.89856
       0.93959
    
    
      uc_fast
      NaN
           NaN
           NaN
           NaN
           NaN
           NaN
       0.90934
       0.89783
       0.91872
       0.87395
    
    
      ucr_fast
      NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
       0.92017
       0.95189
       0.95986
    
    
      ucrC_fast
      NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
       0.91809
       0.89003
    
    
      ucrss_fast
      NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
       0.91784
    
    
      ucrss_fast_wfilter
      NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN



In [15]:

    
if write_xls_files: bdiv_results[('moving-pictures', 'weighted_unifrac')].to_excel('tables/table3e.xlsx', na_rep='NA', float_format="%1.3f")
bdiv_results[('moving-pictures', 'weighted_unifrac')]









    Out[15]:






  
    
      
      uc
      ucr
      ucrC
      ucrss
      ucrss_wfilter
      uc_fast
      ucr_fast
      ucrC_fast
      ucrss_fast
      ucrss_fast_wfilter
    
  
  
    
      uc
      NaN
       0.97127
       0.94939
       0.96991
       0.97333
       0.97169
       0.97675
       0.94938
       0.97358
       0.96645
    
    
      ucr
      NaN
           NaN
       0.92765
       0.95164
       0.95177
       0.95722
       0.95820
       0.92801
       0.95995
       0.95402
    
    
      ucrC
      NaN
           NaN
           NaN
       0.96042
       0.94021
       0.94837
       0.93388
       0.99940
       0.96532
       0.93212
    
    
      ucrss
      NaN
           NaN
           NaN
           NaN
       0.93761
       0.96519
       0.95474
       0.95964
       0.98007
       0.93228
    
    
      ucrss_wfilter
      NaN
           NaN
           NaN
           NaN
           NaN
       0.94615
       0.96638
       0.94095
       0.95057
       0.96651
    
    
      uc_fast
      NaN
           NaN
           NaN
           NaN
           NaN
           NaN
       0.96951
       0.94818
       0.97086
       0.94925
    
    
      ucr_fast
      NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
       0.93403
       0.96671
       0.96732
    
    
      ucrC_fast
      NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
       0.96474
       0.93239
    
    
      ucrss_fast
      NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
       0.95120
    
    
      ucrss_fast_wfilter
      NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN



In [16]:

    
if write_xls_files: bdiv_results[('whole-body', 'weighted_unifrac')].to_excel('tables/table3f.xlsx', na_rep='NA', float_format="%1.3f")
bdiv_results[('whole-body', 'weighted_unifrac')]









    Out[16]:






  
    
      
      uc
      ucr
      ucrC
      ucrss
      ucrss_wfilter
      uc_fast
      ucr_fast
      ucrC_fast
      ucrss_fast
      ucrss_fast_wfilter
    
  
  
    
      uc
      NaN
       0.94742
       0.89564
       0.93433
       0.94257
       0.95997
       0.93908
       0.89794
       0.90381
       0.93587
    
    
      ucr
      NaN
           NaN
       0.89958
       0.92423
       0.94983
       0.95082
       0.92024
       0.90377
       0.87088
       0.94386
    
    
      ucrC
      NaN
           NaN
           NaN
       0.88576
       0.92416
       0.90725
       0.91116
       0.99398
       0.83136
       0.93889
    
    
      ucrss
      NaN
           NaN
           NaN
           NaN
       0.94404
       0.92023
       0.91664
       0.88246
       0.91841
       0.91070
    
    
      ucrss_wfilter
      NaN
           NaN
           NaN
           NaN
           NaN
       0.93333
       0.91832
       0.92575
       0.89680
       0.93162
    
    
      uc_fast
      NaN
           NaN
           NaN
           NaN
           NaN
           NaN
       0.95490
       0.90857
       0.88950
       0.96583
    
    
      ucr_fast
      NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
       0.91048
       0.93554
       0.95136
    
    
      ucrC_fast
      NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
       0.82984
       0.94028
    
    
      ucrss_fast
      NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
       0.86558
    
    
      ucrss_fast_wfilter
      NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN
           NaN

Table 4: Taxa summary correlation by method



In [17]:

    
taxa_corr_results = {}
for data_set_id in data_set_ids:
    taxa_corr_results[data_set_id] = {}
    for level in range(2,8):
        df = pd.DataFrame.from_csv("%s-otus/taxa_correlations/level_%d.csv" % (data_set_id, level))
        taxa_corr_results[data_set_id][level] = df.copy()



In [18]:

    
if write_xls_files: taxa_corr_results['88-soils'][2].to_excel('tables/table4a.xlsx', na_rep='NA', float_format="%1.3f")
taxa_corr_results['88-soils'][2]









    Out[18]:






  
    
      
      uc
      ucr
      ucrC
      ucrss
      ucrss_wfilter
      uc_fast
      ucr_fast
      ucrC_fast
      ucrss_fast
      ucrss_fast_wfilter
    
  
  
    
      uc
      NaN
       0.999998
       0.983181
       0.999997
       0.999998
       0.999996
       0.999996
       0.981320
       0.999996
       0.999995
    
    
      ucr
      NaN
            NaN
       0.983160
       0.999999
       0.999999
       0.999996
       0.999997
       0.981302
       0.999997
       0.999996
    
    
      ucrC
      NaN
            NaN
            NaN
       0.983147
       0.983165
       0.983156
       0.983157
       0.999455
       0.983152
       0.983168
    
    
      ucrss
      NaN
            NaN
            NaN
            NaN
       0.999998
       0.999995
       0.999996
       0.981290
       0.999996
       0.999995
    
    
      ucrss_wfilter
      NaN
            NaN
            NaN
            NaN
            NaN
       0.999995
       0.999996
       0.981309
       0.999996
       0.999995
    
    
      uc_fast
      NaN
            NaN
            NaN
            NaN
            NaN
            NaN
       0.999998
       0.981299
       0.999999
       0.999998
    
    
      ucr_fast
      NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
       0.981299
       1.000000
       1.000000
    
    
      ucrC_fast
      NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
       0.981294
       0.981310
    
    
      ucrss_fast
      NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
       0.999999
    
    
      ucrss_fast_wfilter
      NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN



In [19]:

    
if write_xls_files: taxa_corr_results['88-soils'][7].to_excel('tables/table4b.xlsx', na_rep='NA', float_format="%1.3f")
taxa_corr_results['88-soils'][7]









    Out[19]:






  
    
      
      uc
      ucr
      ucrC
      ucrss
      ucrss_wfilter
      uc_fast
      ucr_fast
      ucrC_fast
      ucrss_fast
      ucrss_fast_wfilter
    
  
  
    
      uc
      NaN
       0.939448
       0.849721
       0.939429
       0.939433
       0.999694
       0.939752
       0.839735
       0.939631
       0.939830
    
    
      ucr
      NaN
            NaN
       0.820697
       0.999992
       0.999990
       0.939538
       0.998175
       0.923148
       0.998058
       0.998126
    
    
      ucrC
      NaN
            NaN
            NaN
       0.820677
       0.820729
       0.849934
       0.819635
       0.818193
       0.819658
       0.820185
    
    
      ucrss
      NaN
            NaN
            NaN
            NaN
       0.999982
       0.939520
       0.998170
       0.923148
       0.998053
       0.998120
    
    
      ucrss_wfilter
      NaN
            NaN
            NaN
            NaN
            NaN
       0.939526
       0.998167
       0.923154
       0.998050
       0.998120
    
    
      uc_fast
      NaN
            NaN
            NaN
            NaN
            NaN
            NaN
       0.939894
       0.840394
       0.939776
       0.939982
    
    
      ucr_fast
      NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
       0.920705
       0.999882
       0.999942
    
    
      ucrC_fast
      NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
       0.920747
       0.920906
    
    
      ucrss_fast
      NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
       0.999826
    
    
      ucrss_fast_wfilter
      NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN



In [20]:

    
if write_xls_files: taxa_corr_results['moving-pictures'][2].to_excel('tables/table4c.xlsx', na_rep='NA', float_format="%1.3f")
taxa_corr_results['moving-pictures'][2]









    Out[20]:






  
    
      
      uc
      ucr
      ucrC
      ucrss
      ucrss_wfilter
      uc_fast
      ucr_fast
      ucrC_fast
      ucrss_fast
      ucrss_fast_wfilter
    
  
  
    
      uc
      NaN
       0.999997
       0.997003
       0.999992
       0.999659
       0.999997
       0.999996
       0.996827
       0.999987
       0.998449
    
    
      ucr
      NaN
            NaN
       0.997004
       0.999998
       0.999665
       1.000000
       1.000000
       0.996831
       0.999993
       0.998460
    
    
      ucrC
      NaN
            NaN
            NaN
       0.997000
       0.997387
       0.996999
       0.997000
       0.999844
       0.997001
       0.997601
    
    
      ucrss
      NaN
            NaN
            NaN
            NaN
       0.999665
       0.999998
       0.999999
       0.996828
       0.999995
       0.998460
    
    
      ucrss_wfilter
      NaN
            NaN
            NaN
            NaN
            NaN
       0.999664
       0.999665
       0.997216
       0.999660
       0.999489
    
    
      uc_fast
      NaN
            NaN
            NaN
            NaN
            NaN
            NaN
       1.000000
       0.996824
       0.999993
       0.998460
    
    
      ucr_fast
      NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
       0.996826
       0.999994
       0.998461
    
    
      ucrC_fast
      NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
       0.996830
       0.997426
    
    
      ucrss_fast
      NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
       0.998457
    
    
      ucrss_fast_wfilter
      NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN



In [21]:

    
if write_xls_files: taxa_corr_results['moving-pictures'][7].to_excel('tables/table4d.xlsx', na_rep='NA', float_format="%1.3f")
taxa_corr_results['moving-pictures'][7]









    Out[21]:






  
    
      
      uc
      ucr
      ucrC
      ucrss
      ucrss_wfilter
      uc_fast
      ucr_fast
      ucrC_fast
      ucrss_fast
      ucrss_fast_wfilter
    
  
  
    
      uc
      NaN
       0.96386
       0.928748
       0.963730
       0.963212
       0.999477
       0.923112
       0.881559
       0.923047
       0.919501
    
    
      ucr
      NaN
           NaN
       0.963125
       0.999977
       0.999305
       0.966905
       0.954344
       0.922883
       0.954292
       0.950571
    
    
      ucrC
      NaN
           NaN
            NaN
       0.963242
       0.963341
       0.933715
       0.925086
       0.916942
       0.925425
       0.925011
    
    
      ucrss
      NaN
           NaN
            NaN
            NaN
       0.999292
       0.966773
       0.954287
       0.923035
       0.954257
       0.950519
    
    
      ucrss_wfilter
      NaN
           NaN
            NaN
            NaN
            NaN
       0.966236
       0.953421
       0.923123
       0.953378
       0.952243
    
    
      uc_fast
      NaN
           NaN
            NaN
            NaN
            NaN
            NaN
       0.927224
       0.886843
       0.927174
       0.923658
    
    
      ucr_fast
      NaN
           NaN
            NaN
            NaN
            NaN
            NaN
            NaN
       0.884533
       0.999968
       0.996988
    
    
      ucrC_fast
      NaN
           NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
       0.884863
       0.884356
    
    
      ucrss_fast
      NaN
           NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
       0.996978
    
    
      ucrss_fast_wfilter
      NaN
           NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN



In [22]:

    
if write_xls_files: taxa_corr_results['whole-body'][2].to_excel('tables/table4e.xlsx', na_rep='NA', float_format="%1.3f")
taxa_corr_results['whole-body'][2]









    Out[22]:






  
    
      
      uc
      ucr
      ucrC
      ucrss
      ucrss_wfilter
      uc_fast
      ucr_fast
      ucrC_fast
      ucrss_fast
      ucrss_fast_wfilter
    
  
  
    
      uc
      NaN
       0.999992
       0.998880
       0.999992
       0.999990
       0.999989
       0.999989
       0.997646
       0.999964
       0.999987
    
    
      ucr
      NaN
            NaN
       0.998881
       0.999999
       0.999999
       0.999991
       0.999995
       0.997640
       0.999969
       0.999994
    
    
      ucrC
      NaN
            NaN
            NaN
       0.998878
       0.998882
       0.998895
       0.998887
       0.999008
       0.998880
       0.998878
    
    
      ucrss
      NaN
            NaN
            NaN
            NaN
       0.999998
       0.999991
       0.999995
       0.997638
       0.999970
       0.999993
    
    
      ucrss_wfilter
      NaN
            NaN
            NaN
            NaN
            NaN
       0.999991
       0.999995
       0.997641
       0.999967
       0.999995
    
    
      uc_fast
      NaN
            NaN
            NaN
            NaN
            NaN
            NaN
       0.999997
       0.997654
       0.999972
       0.999993
    
    
      ucr_fast
      NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
       0.997648
       0.999976
       0.999996
    
    
      ucrC_fast
      NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
       0.997641
       0.997641
    
    
      ucrss_fast
      NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
       0.999968
    
    
      ucrss_fast_wfilter
      NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN



In [23]:

    
if write_xls_files: taxa_corr_results['whole-body'][7].to_excel('tables/table4f.xlsx', na_rep='NA', float_format="%1.3f")
taxa_corr_results['whole-body'][7]









    Out[23]:






  
    
      
      uc
      ucr
      ucrC
      ucrss
      ucrss_wfilter
      uc_fast
      ucr_fast
      ucrC_fast
      ucrss_fast
      ucrss_fast_wfilter
    
  
  
    
      uc
      NaN
       0.958787
       0.899610
       0.958781
       0.958771
       0.999723
       0.912856
       0.878695
       0.913000
       0.912872
    
    
      ucr
      NaN
            NaN
       0.917773
       0.999997
       0.999996
       0.956702
       0.966950
       0.871130
       0.966822
       0.966951
    
    
      ucrC
      NaN
            NaN
            NaN
       0.917759
       0.917776
       0.896388
       0.892513
       0.934549
       0.892270
       0.892576
    
    
      ucrss
      NaN
            NaN
            NaN
            NaN
       0.999993
       0.956695
       0.966942
       0.871108
       0.966815
       0.966944
    
    
      ucrss_wfilter
      NaN
            NaN
            NaN
            NaN
            NaN
       0.956685
       0.966952
       0.871175
       0.966821
       0.966957
    
    
      uc_fast
      NaN
            NaN
            NaN
            NaN
            NaN
            NaN
       0.911841
       0.875543
       0.911992
       0.911853
    
    
      ucr_fast
      NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
       0.854585
       0.999900
       0.999989
    
    
      ucrC_fast
      NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
       0.854332
       0.854682
    
    
      ucrss_fast
      NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
       0.999887
    
    
      ucrss_fast_wfilter
      NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN
            NaN

Table 5: Run time by method



In [24]:

    
from datetime import datetime
from calendar import month_abbr

month_lookup = {v: k for k,v in enumerate(month_abbr)}



In [25]:

    
log_fps = glob('*/*/log*txt')

results = []
for log_fp in log_fps:
    fp_fields = log_fp.split('/')
    data_set_id = fp_fields[0].strip('-otus')
    run_id = fp_fields[1]
    d = !grep "^Logging" $log_fp
    start_fields = d[0].split()
    start_hour, start_minute, start_second = map(int, start_fields[3].split(':'))
    start_day, start_month, start_year = map(int, [start_fields[5], month_lookup[start_fields[6]], start_fields[7]])
    end_fields = d[1].split()
    end_hour, end_minute, end_second = map(int, end_fields[3].split(':'))
    end_day, end_month, end_year = map(int, [end_fields[5], month_lookup[end_fields[6]], end_fields[7]])
    start_datetime = datetime(start_year, start_month, start_day, start_hour, start_minute, start_second)
    end_datetime = datetime(end_year, end_month, end_day, end_hour, end_minute, end_second)
    runtime = (end_datetime - start_datetime).seconds
    results.append((run_id, data_set_id, runtime))

df = pd.DataFrame.from_records(results, columns=['abbreviation','data set id', 'runtime (s)'])
df = df.merge(method_descriptions, left_on="abbreviation", right_index=True)
df = df[df['processors'] <= 10]
df = df.pivot('abbreviation', 'data set id', 'runtime (s)')
df = df.reindex_axis(order, axis=0)



In [26]:

    
if write_xls_files: df.to_excel('tables/table5.xlsx', na_rep='NA', float_format="%1.3f")
df









    Out[26]:






  
    
      data set id
      88-soil
      moving-picture
      whole-body
    
  
  
    
      uc
       1220
       27748
       1095
    
    
      ucr
       1358
       46576
       1082
    
    
      ucrC
        226
       28572
        388
    
    
      ucrss
       1493
       47207
       1212
    
    
      ucrss_wfilter
       1885
       76061
       2088
    
    
      uc_fast
        914
       23510
        489
    
    
      ucr_fast
       1052
       19371
        621
    
    
      ucrC_fast
         44
        2428
         68
    
    
      ucrss_fast
       1021
       23710
        707
    
    
      ucrss_fast_wfilter
       1525
       52811
       1661

Table 6: Subsampled OTU parameter variations



In [27]:

    
df = pd.DataFrame.from_records(results, columns=['abbreviation','data set id', 'runtime (s)'])
df = df.merge(method_descriptions, left_on="abbreviation", right_index=True)
df = df[df['processors'] == 29]
df = df.pivot('abbreviation', 'data set id', 'runtime (s)')



In [28]:

    
if write_xls_files: df.to_excel('tables/table6.xlsx', na_rep='NA', float_format="%1.3f")
df









    Out[28]:






  
    
      data set id
      moving-picture
    
    
      abbreviation
      
    
  
  
    
      ucr_fast_O29_r82
       21737
    
    
      ucr_fast_O29_r97
       16241
    
    
      ucrss_fast_O29_r82
       17812
    
    
      ucrss_fast_O29_r97
       16169
    
    
      ucrss_fast_O29_s1
       14911



In [29]:

    
df['moving-picture']['ucr_fast_O29_r82'] - df['moving-picture']['ucrss_fast_O29_r82']









    Out[29]:





3925



In [30]:

    
df['moving-picture']['ucr_fast_O29_r97'] - df['moving-picture']['ucrss_fast_O29_r97']









    Out[30]:





72

Table 7: Novel OTUs by biome



In [31]:

    
novel_by_biome = pd.io.parsers.read_csv('raw-tables/fraction-novel-by-env-biome.tsv', sep='\t', index_col=0)
if write_xls_files: novel_by_biome.to_excel('tables/table7.xlsx', na_rep='NA', float_format="%1.3f")
novel_by_biome









    Out[31]:






  
    
      
      Average de novo OTUs (10K sequences per sample)
      SD de novo OTUs (10K sequences per sample)
      Average Reference OTUs (10k sequences per sample)
      SD Reference OTUs (10k sequences per sample)
      % novel diversity (10k seqs per sample)
      % error novel diversity (10K seqs per sample)
      number of samples
    
    
      EnvironmentalBiome
      
      
      
      
      
      
      
    
  
  
    
      mangrove biome
       2169
       1159
        354
        73
       0.860
       0.460
          7
    
    
      tropical humid forests
       2398
        260
        397
        35
       0.858
       0.094
         26
    
    
      tundra biome
       1771
        403
        312
       117
       0.850
       0.201
        110
    
    
      deserts and xeric shrubland biome
       3917
        127
        707
        15
       0.847
       0.028
          7
    
    
      taiga
       2598
        102
        505
        35
       0.837
       0.035
          4
    
    
      marine biome
       2040
       1048
        484
       410
       0.808
       0.446
        890
    
    
      aquatic biome
        714
        299
        177
       199
       0.801
       0.403
        762
    
    
      freshwater biome
        768
        541
        194
       120
       0.798
       0.576
        375
    
    
      warm deserts and semideserts
       2386
        473
        607
       147
       0.797
       0.166
         97
    
    
      tropical and subtropical moist broadleaf forest biome
       3072
        125
        846
        18
       0.784
       0.032
          2
    
    
      temperate needle-leaf forests or woodlands
       2836
        159
        785
       132
       0.783
       0.057
         21
    
    
      polar biome
       1721
        886
        483
       218
       0.781
       0.414
        277
    
    
      tropical and subtropical coniferous forest biome
       1993
        256
        579
        94
       0.775
       0.106
          3
    
    
      mixed island systems
       1552
        618
        511
       203
       0.752
       0.315
        124
    
    
      marginal sea
       1795
        325
        611
       225
       0.746
       0.164
          7
    
    
      temperate coniferous forest biome
       2504
       1206
        885
       201
       0.739
       0.361
         19
    
    
      mediterranean forests, woodlands, and shrub biome
        695
        361
        275
       195
       0.717
       0.424
        371
    
    
      large river biome
       1844
        629
        743
       369
       0.713
       0.282
          5
    
    
      terrestrial biome
       2714
        222
       1138
       163
       0.705
       0.072
        627
    
    
      nest of bird
        821
        276
        355
       138
       0.698
       0.262
        313
    
    
      Temperate broadleaf and mixed forest biome
       1910
        491
        879
       235
       0.685
       0.195
         14
    
    
      temperate grasslands
       2745
        290
       1315
       164
       0.676
       0.082
        696
    
    
      animal-associated habitat
        758
        329
        376
       240
       0.668
       0.359
       1036
    
    
      mammalia-associated habitat
        973
        357
        583
       222
       0.625
       0.270
       1918
    
    
      Cold-winter (continental) deserts and semideserts
        847
        210
        551
       215
       0.606
       0.215
        102
    
    
      Temperate grasslands, savannas, and shrubland biome
       1688
        272
       1497
       275
       0.530
       0.121
         85
    
    
      human-associated habitat
        292
        242
        590
       366
       0.331
       0.498
       1597

Table 8: Differential OTU representation



In [32]:

    
gs_fp_sets = [glob('%s-otus/*/group_significance*txt' % id_) for id_ in data_set_ids]
gs_results = {}
n = 25

for data_set_id, gs_fp_set in zip(data_set_ids, gs_fp_sets):
    df = None
    for fp in gs_fp_set:
        run_id = fp.split('/')[1]
        id_ = (data_set_id, run_id)
        new_df = pd.DataFrame.from_csv(fp, sep='\t')
        gs_results[id_] = new_df[['taxonomy', 'Test-Statistic']][:n]



In [33]:

    
if write_xls_files: gs_results[('88-soils', 'ucrss')][:10].to_excel('tables/table8a.xlsx', na_rep='NA', float_format="%1.3f")
gs_results[('88-soils', 'ucrss')][:10]









    Out[33]:






  
    
      
      taxonomy
      Test-Statistic
    
    
      OTU
      
      
    
  
  
    
      113212
       k__Bacteria;p__Acidobacteria;c__DA052;o__Ellin...
       55.858724
    
    
      1123837
       k__Bacteria;p__Actinobacteria;c__Rubrobacteria...
       50.432793
    
    
      New.ReferenceOTU22
       k__Bacteria;p__Actinobacteria;c__Actinobacteri...
       49.171544
    
    
      252012
       k__Bacteria;p__Proteobacteria;c__Gammaproteoba...
       48.649686
    
    
      843189
       k__Bacteria;p__Acidobacteria;c__Solibacteres;o...
       47.005758
    
    
      1127423
       k__Bacteria;p__Acidobacteria;c__Acidobacteriia...
       43.869989
    
    
      1129210
       k__Bacteria;p__Acidobacteria;c__Acidobacteriia...
       43.804418
    
    
      831520
       k__Bacteria;p__Actinobacteria;c__Rubrobacteria...
       43.624935
    
    
      1139779
       k__Bacteria;p__Proteobacteria;c__Alphaproteoba...
       41.862548
    
    
      804187
       k__Bacteria;p__Acidobacteria;c__[Chloracidobac...
       41.151350



In [34]:

    
gs_results[('88-soils', 'ucrss')]['taxonomy']['New.ReferenceOTU22']









    Out[34]:





'k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__;g__;s__'



In [35]:

    
if write_xls_files: gs_results[('moving-pictures', 'ucrss')][:10].to_excel('tables/table8b.xlsx', na_rep='NA', float_format="%1.3f")
gs_results[('moving-pictures', 'ucrss')][:10]









    Out[35]:






  
    
      
      taxonomy
      Test-Statistic
    
    
      OTU
      
      
    
  
  
    
      368134
       k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacill...
       1599.696001
    
    
      3154070
       k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o_...
       1625.703076
    
    
      1000986
       k__Bacteria;p__Actinobacteria;c__Actinobacteri...
       1630.009468
    
    
      1992
       k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o_...
       1728.164420
    
    
      4304475
       k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o_...
       1545.444829
    
    
      191238
       k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo...
       1546.435699
    
    
      187665
       k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo...
       1474.529300
    
    
      4396297
       k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo...
       1585.014965
    
    
      3903651
       k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo...
       1670.188332
    
    
      3472078
       k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o_...
       1783.487973



In [36]:

    
if write_xls_files: gs_results[('whole-body', 'ucrss')][:10].to_excel('tables/table8c.xlsx', na_rep='NA', float_format="%1.3f")
gs_results[('whole-body', 'ucrss')][:10]









    Out[36]:






  
    
      
      taxonomy
      Test-Statistic
    
    
      OTU
      
      
    
  
  
    
      4326219
       k__Bacteria;p__Proteobacteria;c__Epsilonproteo...
       363.881085
    
    
      New.CleanUp.ReferenceOTU222
       k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o_...
       358.019614
    
    
      4325533
       k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o_...
       349.852497
    
    
      New.CleanUp.ReferenceOTU17550
       k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo...
       337.655669
    
    
      316732
       k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo...
       337.309334
    
    
      4346374
       k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o_...
       331.432928
    
    
      4458959
       k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo...
       329.771577
    
    
      3866487
       k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo...
       323.487958
    
    
      4391641
       k__Bacteria;p__Proteobacteria;c__Gammaproteoba...
       311.999639
    
    
      175751
       k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo...
       305.530588



In [37]:

    
gs_results[('whole-body', 'ucrss')]['taxonomy']['New.CleanUp.ReferenceOTU222']









    Out[37]:





'k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae;g__Prevotella;s__melaninogenica'



In [38]:

    
gs_results[('whole-body', 'ucrss')]['taxonomy']['New.CleanUp.ReferenceOTU17550']









    Out[38]:





'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Veillonellaceae;g__Veillonella;s__parvula'

Figure 2: De novo OTU picking run time versus number of sequences



In [39]:

    
from pylab import savefig

emp_time_dir = 'emp-analysis/uc_time/'
emp_time_fps = glob('%s/*txt' % emp_time_dir)
mems = defaultdict(list)
times = defaultdict(list)
total_seqs = 5594412
for fp in emp_time_fps:
    fn = splitext(split(fp)[1])[0]
    _, subsample_size, iteration = fn.split('_')
    mem, time = open(fp).read().strip().split()
    subsample_seqs = float(subsample_size) * total_seqs
    mems[subsample_seqs].append(float(mem) / 1000000)
    times[subsample_seqs].append(float(time) / 60 / 60 / 24)



In [40]:

    
time_data = times.items()
time_data.sort()
time_data = time_data[9:]
num_seqs, time_distributions = zip(*time_data)
ax = boxplots(time_distributions, [e/1000000 for e in num_seqs], ['%1.1e' % e for e in num_seqs])

slope, intercept, r_value, p_value, std_err = stats.linregress(num_seqs, [np.median(x) for x in time_distributions])
label = ["Runtime (days) = \n  (%1.2e x Sequence Count) + %1.2f" % (slope, intercept),
         "\nr-squared: %1.2f\np=%1.2e" % (r_value, p_value)]

ax.text(0.15,0.70,"\n".join(label))
if write_xls_files: savefig('tables/Figure2.pdf')



In [41]:

    
def time_in_days(num_sequences):
    return (0.02 * num_sequences + -21618.68) / 60 / 60 / 24



In [42]:

    
time_in_days(total_seqs)









    Out[42]:





1.044786574074074



In [43]:

    
time_in_days(661202560)









    Out[43]:





152.80593194444444

Miscellaneous notes

Confirming that the closed-reference step of ucr and ucrss have equal numbers of sequences failing to hit the reference.

$ wc -l *-otus/ucrss/step1_otus/*_failures.txt *-otus/ucr/uclust_ref_picked_otus/*_failures.txt
    79662 88-soils-otus/ucrss/step1_otus/study_103_split_library_seqs_failures.txt
    79662 88-soils-otus/ucr/uclust_ref_picked_otus/study_103_split_library_seqs_failures.txt
  2805981 moving-pictures-otus/ucrss/step1_otus/study_550_split_library_seqs_failures.txt
  2805981 moving-pictures-otus/ucr/uclust_ref_picked_otus/study_550_split_library_seqs_failures.txt
    58736 whole-body-otus/ucrss/step1_otus/study_449_split_library_seqs_failures.txt
    58736 whole-body-otus/ucr/uclust_ref_picked_otus/study_449_split_library_seqs_failures.txt

$ wc -l *-otus/ucrss_fast/step1_otus/*_failures.txt *-otus/ucr_fast/uclust_ref_picked_otus/*_failures.txt
    84565 88-soils-otus/ucrss_fast/step1_otus/study_103_split_library_seqs_failures.txt
    84565 88-soils-otus/ucr_fast/uclust_ref_picked_otus/study_103_split_library_seqs_failures.txt
  3432893 moving-pictures-otus/ucrss_fast/step1_otus/study_550_split_library_seqs_failures.txt
  3432893 moving-pictures-otus/ucr_fast/uclust_ref_picked_otus/study_550_split_library_seqs_failures.txt
    73254 whole-body-otus/ucrss_fast/step1_otus/study_449_split_library_seqs_failures.txt
    73254 whole-body-otus/ucr_fast/uclust_ref_picked_otus/study_449_split_library_seqs_failures.txt

Computing the number of sequences being clusstered in each data set.

count_seqs.py -i $MOVING_PICTURES_SEQS,$SOILS_SEQS,$WHOLE_BODY_SEQS

151387  : /home/caporaso/analysis/88-soils/study_103_split_library_seqs.fna (Sequence lengths (mean +/- std): 230.7818 +/- 11.4761)
792831  : /home/caporaso/analysis/whole-body/study_449_split_library_seqs.fna (Sequence lengths (mean +/- std): 228.5124 +/- 16.0318)
68666081  : /home/caporaso/analysis/moving-pictures/study_550_split_library_seqs.fna (Sequence lengths (mean +/- std): 123.2359 +/- 17.4283)



In [44]:

    
mem_data = mems.items()
mem_data.sort()
mem_data = mem_data[10:]
num_seqs, mem_distributions = zip(*mem_data)
ax = boxplots(mem_distributions, [e/1000000 for e in num_seqs], ['%1.1e' % e for e in num_seqs])

slope, intercept, r_value, p_value, std_err = stats.linregress(num_seqs, [np.median(x) for x in mem_distributions])
print "time = %1.2f * subsample_size + %1.2f" % (slope, intercept)
print "r-squared: %1.2f, p=%f" % (r_value, p_value)









    



time = 0.00 * subsample_size + -1.30
r-squared: 1.00, p=0.000000



In [44]:

	title	command	max_accepts	max_rejects	stepwords	wordlength	prefilter_percent_id	min_otu_size	speed_mode	processors	reference_percent_id	subsample_fraction
abbreviation
uc	De novo	pick_de_novo_otus.py	20	500	20	12	NaN	NaN	slow	1	0.97	NaN
ucr	Legacy open reference	pick_de_novo_otus.py	20	500	20	12	NaN	NaN	slow	10	0.97	NaN
ucrC	Closed reference	pick_closed_reference_otus.py	20	500	20	12	NaN	NaN	slow	10	0.97	NaN
ucrss	Subsampled open reference	pick_open_reference_otus.py	20	500	20	12	0.0	1	slow	10	0.97	0.001
ucrss_wfilter	Subsampled open reference, filtered	pick_open_reference_otus.py	20	500	20	12	0.6	1	slow	10	0.97	0.001
uc_fast	De novo, fast settings	pick_de_novo_otus.py	1	8	8	8	NaN	NaN	fast	1	0.97	NaN
ucr_fast	Legacy open reference, fast settings	pick_de_novo_otus.py	1	8	8	8	NaN	NaN	fast	10	0.97	NaN
ucrC_fast	Closed reference, fast settings	pick_closed_reference_otus.py	1	8	8	8	NaN	NaN	fast	10	0.97	NaN
ucrss_fast	Subsampled open reference, fast settings	pick_open_reference_otus.py	1	8	8	8	0.0	1	fast	10	0.97	0.001
ucrss_fast_wfilter	Subsampled open reference, filtered, fast sett...	pick_open_reference_otus.py	1	8	8	8	0.6	1	fast	10	0.97	0.001
ucr_fast_O29_r82	Legacy open reference, fast settings, 82% refe...	pick_de_novo_otus.py	1	8	8	8	0.0	1	fast	29	0.82	0.001
ucr_fast_O29_r97	Legacy open reference, fast settings, 29 proce...	pick_de_novo_otus.py	1	8	8	8	0.0	1	fast	29	0.97	0.001
ucrss_fast_O29_r82	Subsampled open reference, fast settings, 82% ...	pick_open_reference_otus.py	1	8	8	8	0.0	1	fast	29	0.82	0.001
ucrss_fast_O29_r97	Subsampled open reference, fast settings, 29 p...	pick_open_reference_otus.py	1	8	8	8	0.0	1	fast	29	0.97	0.001
ucrss_fast_O29_s1	Subsampled open reference, fast settings, 29 p...	pick_open_reference_otus.py	1	8	8	8	0.0	1	fast	29	0.97	0.100

	uc	ucr	ucrC	ucrss	ucrss_wfilter	uc_fast	ucr_fast	ucrC_fast	ucrss_fast	ucrss_fast_wfilter
uc	1.000000	0.950810	0.932749	0.933510	0.952565	0.955606	0.936157	0.926662	0.947673	0.946606
ucr	0.950810	1.000000	0.902187	0.930701	0.930193	0.945996	0.940277	0.903199	0.952477	0.943609
ucrC	0.932749	0.902187	1.000000	0.893631	0.908745	0.905111	0.914053	0.977595	0.902405	0.910766
ucrss	0.933510	0.930701	0.893631	1.000000	0.929412	0.944127	0.935030	0.893611	0.947867	0.948666
ucrss_wfilter	0.952565	0.930193	0.908745	0.929412	1.000000	0.951712	0.933274	0.902546	0.930598	0.943046
uc_fast	0.955606	0.945996	0.905111	0.944127	0.951712	1.000000	0.952704	0.898282	0.956454	0.960493
ucr_fast	0.936157	0.940277	0.914053	0.935030	0.933274	0.952704	1.000000	0.914263	0.950392	0.952351
ucrC_fast	0.926662	0.903199	0.977595	0.893611	0.902546	0.898282	0.914263	1.000000	0.902216	0.902507
ucrss_fast	0.947673	0.952477	0.902405	0.947867	0.930598	0.956454	0.950392	0.902216	1.000000	0.962338
ucrss_fast_wfilter	0.946606	0.943609	0.910766	0.948666	0.943046	0.960493	0.952351	0.902507	0.962338	1.000000

	uc	ucr	ucrC	ucrss	ucrss_wfilter	uc_fast	ucr_fast	ucrC_fast	ucrss_fast	ucrss_fast_wfilter
uc	1.000000	0.995736	0.992960	0.995918	0.995596	0.994955	0.995661	0.992341	0.995695	0.995864
ucr	0.995736	1.000000	0.992644	0.996527	0.997235	0.995112	0.995891	0.992371	0.995859	0.996714
ucrC	0.992960	0.992644	1.000000	0.994053	0.991472	0.994132	0.993919	0.997675	0.994638	0.994202
ucrss	0.995918	0.996527	0.994053	1.000000	0.996130	0.996035	0.996789	0.993736	0.996859	0.997034
ucrss_wfilter	0.995596	0.997235	0.991472	0.996130	1.000000	0.994251	0.995429	0.991241	0.995604	0.996436
uc_fast	0.994955	0.995112	0.994132	0.996035	0.994251	1.000000	0.996530	0.994057	0.996510	0.996299
ucr_fast	0.995661	0.995891	0.993919	0.996789	0.995429	0.996530	1.000000	0.993591	0.996812	0.996927
ucrC_fast	0.992341	0.992371	0.997675	0.993736	0.991241	0.994057	0.993591	1.000000	0.994279	0.993998
ucrss_fast	0.995695	0.995859	0.994638	0.996859	0.995604	0.996510	0.996812	0.994279	1.000000	0.996822
ucrss_fast_wfilter	0.995864	0.996714	0.994202	0.997034	0.996436	0.996299	0.996927	0.993998	0.996822	1.000000

	uc	ucr	ucrC	ucrss	ucrss_wfilter	uc_fast	ucr_fast	ucrC_fast	ucrss_fast	ucrss_fast_wfilter
uc	1.000000	0.985137	0.957040	0.984981	0.984775	0.984090	0.985665	0.961456	0.982975	0.984321
ucr	0.985137	1.000000	0.955564	0.989950	0.988868	0.987748	0.987070	0.959910	0.987496	0.985933
ucrC	0.957040	0.955564	1.000000	0.961397	0.958381	0.958510	0.960640	0.990411	0.952869	0.961033
ucrss	0.984981	0.989950	0.961397	1.000000	0.990865	0.987546	0.989638	0.964492	0.989253	0.987182
ucrss_wfilter	0.984775	0.988868	0.958381	0.990865	1.000000	0.985185	0.988750	0.963320	0.987431	0.984912
uc_fast	0.984090	0.987748	0.958510	0.987546	0.985185	1.000000	0.986204	0.960893	0.986490	0.985077
ucr_fast	0.985665	0.987070	0.960640	0.989638	0.988750	0.986204	1.000000	0.964924	0.987977	0.988788
ucrC_fast	0.961456	0.959910	0.990411	0.964492	0.963320	0.960893	0.964924	1.000000	0.957054	0.965236
ucrss_fast	0.982975	0.987496	0.952869	0.989253	0.987431	0.986490	0.987977	0.957054	1.000000	0.986223
ucrss_fast_wfilter	0.984321	0.985933	0.961033	0.987182	0.984912	0.985077	0.988788	0.965236	0.986223	1.000000

	uc	ucr	ucrC	ucrss	ucrss_wfilter	uc_fast	ucr_fast	ucrC_fast	ucrss_fast	ucrss_fast_wfilter
uc	1.000000	0.947875	0.879873	0.909483	0.924238	0.935259	0.933534	0.876867	0.924938	0.912961
ucr	0.947875	1.000000	0.904662	0.946052	0.947025	0.947314	0.952570	0.902627	0.938155	0.932401
ucrC	0.879873	0.904662	1.000000	0.925964	0.888274	0.882334	0.907653	0.972994	0.910432	0.895664
ucrss	0.909483	0.946052	0.925964	1.000000	0.931844	0.923371	0.934916	0.915025	0.930894	0.928529
ucrss_wfilter	0.924238	0.947025	0.888274	0.931844	1.000000	0.943087	0.946177	0.884305	0.932178	0.926691
uc_fast	0.935259	0.947314	0.882334	0.923371	0.943087	1.000000	0.942244	0.882958	0.940951	0.939832
ucr_fast	0.933534	0.952570	0.907653	0.934916	0.946177	0.942244	1.000000	0.908362	0.942923	0.931863
ucrC_fast	0.876867	0.902627	0.972994	0.915025	0.884305	0.882958	0.908362	1.000000	0.904095	0.905611
ucrss_fast	0.924938	0.938155	0.910432	0.930894	0.932178	0.940951	0.942923	0.904095	1.000000	0.952921
ucrss_fast_wfilter	0.912961	0.932401	0.895664	0.928529	0.926691	0.939832	0.931863	0.905611	0.952921	1.000000

	uc	ucr	ucrC	ucrss	ucrss_wfilter	uc_fast	ucr_fast	ucrC_fast	ucrss_fast	ucrss_fast_wfilter
uc	1.000000	0.992461	0.983973	0.992078	0.992313	0.988647	0.990092	0.978429	0.989429	0.989610
ucr	0.992461	1.000000	0.994303	0.998253	0.998186	0.991920	0.996989	0.990556	0.996875	0.997030
ucrC	0.983973	0.994303	1.000000	0.994695	0.994748	0.983908	0.993300	0.996874	0.993777	0.993611
ucrss	0.992078	0.998253	0.994695	1.000000	0.998285	0.991935	0.997188	0.991311	0.997034	0.997209
ucrss_wfilter	0.992313	0.998186	0.994748	0.998285	1.000000	0.991651	0.997157	0.991281	0.996954	0.997097
uc_fast	0.988647	0.991920	0.983908	0.991935	0.991651	1.000000	0.992937	0.980686	0.991777	0.991937
ucr_fast	0.990092	0.996989	0.993300	0.997188	0.997157	0.992937	1.000000	0.992169	0.997906	0.998061
ucrC_fast	0.978429	0.990556	0.996874	0.991311	0.991281	0.980686	0.992169	1.000000	0.992683	0.992397
ucrss_fast	0.989429	0.996875	0.993777	0.997034	0.996954	0.991777	0.997906	0.992683	1.000000	0.997988
ucrss_fast_wfilter	0.989610	0.997030	0.993611	0.997209	0.997097	0.991937	0.998061	0.992397	0.997988	1.000000

	uc	ucr	ucrC	ucrss	ucrss_wfilter	uc_fast	ucr_fast	ucrC_fast	ucrss_fast	ucrss_fast_wfilter
uc	1.000000	0.986013	0.970654	0.986054	0.986288	0.992712	0.987933	0.971591	0.988239	0.987425
ucr	0.986013	1.000000	0.983916	0.995433	0.995315	0.986973	0.992700	0.979658	0.992898	0.992813
ucrC	0.970654	0.983916	1.000000	0.984706	0.983561	0.970210	0.980584	0.992099	0.980124	0.979129
ucrss	0.986054	0.995433	0.984706	1.000000	0.995365	0.986947	0.992549	0.980568	0.992644	0.992455
ucrss_wfilter	0.986288	0.995315	0.983561	0.995365	1.000000	0.986130	0.992597	0.979232	0.991829	0.992433
uc_fast	0.992712	0.986973	0.970210	0.986947	0.986130	1.000000	0.988851	0.971697	0.989586	0.988207
ucr_fast	0.987933	0.992700	0.980584	0.992549	0.992597	0.988851	1.000000	0.981125	0.994334	0.994478
ucrC_fast	0.971591	0.979658	0.992099	0.980568	0.979232	0.971697	0.981125	1.000000	0.981586	0.979289
ucrss_fast	0.988239	0.992898	0.980124	0.992644	0.991829	0.989586	0.994334	0.981586	1.000000	0.994629
ucrss_fast_wfilter	0.987425	0.992813	0.979129	0.992455	0.992433	0.988207	0.994478	0.979289	0.994629	1.000000

	uc	ucr	ucrC	ucrss	ucrss_wfilter	uc_fast	ucr_fast	ucrC_fast	ucrss_fast	ucrss_fast_wfilter
uc	NaN	0.93487	0.90823	0.94364	0.94208	0.93926	0.94472	0.90865	0.94259	0.94149
ucr	NaN	NaN	0.91458	0.93982	0.94465	0.93425	0.94212	0.91776	0.94393	0.94881
ucrC	NaN	NaN	NaN	0.91656	0.90959	0.92586	0.91308	0.94977	0.91709	0.91966
ucrss	NaN	NaN	NaN	NaN	0.93954	0.93791	0.94464	0.91370	0.93830	0.94230
ucrss_wfilter	NaN	NaN	NaN	NaN	NaN	0.93439	0.94291	0.90714	0.94156	0.94136
uc_fast	NaN	NaN	NaN	NaN	NaN	NaN	0.93829	0.91969	0.93888	0.94116
ucr_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.90853	0.94642	0.94651
ucrC_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.91736	0.92433
ucrss_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.94494
ucrss_fast_wfilter	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	uc	ucr	ucrC	ucrss	ucrss_wfilter	uc_fast	ucr_fast	ucrC_fast	ucrss_fast	ucrss_fast_wfilter
uc	NaN	0.99196	0.97423	0.98823	0.98824	0.99195	0.99050	0.97744	0.99128	0.99194
ucr	NaN	NaN	0.98180	0.99162	0.99146	0.99107	0.99236	0.98375	0.99256	0.99300
ucrC	NaN	NaN	NaN	0.98624	0.98470	0.97287	0.98162	0.99387	0.98050	0.98086
ucrss	NaN	NaN	NaN	NaN	0.98983	0.98847	0.99174	0.98729	0.99157	0.99134
ucrss_wfilter	NaN	NaN	NaN	NaN	NaN	0.98587	0.98971	0.98566	0.98950	0.99055
uc_fast	NaN	NaN	NaN	NaN	NaN	NaN	0.99087	0.97569	0.99175	0.99148
ucr_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.98330	0.99293	0.99220
ucrC_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.98231	0.98313
ucrss_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.99275
ucrss_fast_wfilter	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	uc	ucr	ucrC	ucrss	ucrss_wfilter	uc_fast	ucr_fast	ucrC_fast	ucrss_fast	ucrss_fast_wfilter
uc	NaN	0.93502	0.89081	0.93757	0.93633	0.92982	0.92633	0.88941	0.93279	0.92547
ucr	NaN	NaN	0.89946	0.94835	0.95012	0.93418	0.93139	0.89533	0.94144	0.92738
ucrC	NaN	NaN	NaN	0.90765	0.89852	0.87809	0.88532	0.95207	0.89723	0.87790
ucrss	NaN	NaN	NaN	NaN	0.95252	0.93798	0.93623	0.90499	0.94548	0.92768
ucrss_wfilter	NaN	NaN	NaN	NaN	NaN	0.93661	0.93996	0.89442	0.94125	0.93176
uc_fast	NaN	NaN	NaN	NaN	NaN	NaN	0.94199	0.87167	0.93906	0.93754
ucr_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.88760	0.93944	0.94811
ucrC_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.89139	0.87890
ucrss_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.93340
ucrss_fast_wfilter	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	uc	ucr	ucrC	ucrss	ucrss_wfilter	uc_fast	ucr_fast	ucrC_fast	ucrss_fast	ucrss_fast_wfilter
uc	NaN	0.89611	0.93569	0.95121	0.90142	0.92525	0.93708	0.92377	0.95616	0.90235
ucr	NaN	NaN	0.89623	0.88943	0.96587	0.89103	0.93933	0.89456	0.90146	0.94684
ucrC	NaN	NaN	NaN	0.91903	0.91355	0.90626	0.92836	0.98438	0.93128	0.89627
ucrss	NaN	NaN	NaN	NaN	0.89965	0.91691	0.94744	0.90325	0.94914	0.89874
ucrss_wfilter	NaN	NaN	NaN	NaN	NaN	0.88544	0.93761	0.91087	0.89856	0.93959
uc_fast	NaN	NaN	NaN	NaN	NaN	NaN	0.90934	0.89783	0.91872	0.87395
ucr_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.92017	0.95189	0.95986
ucrC_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.91809	0.89003
ucrss_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.91784
ucrss_fast_wfilter	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	uc	ucr	ucrC	ucrss	ucrss_wfilter	uc_fast	ucr_fast	ucrC_fast	ucrss_fast	ucrss_fast_wfilter
uc	NaN	0.97127	0.94939	0.96991	0.97333	0.97169	0.97675	0.94938	0.97358	0.96645
ucr	NaN	NaN	0.92765	0.95164	0.95177	0.95722	0.95820	0.92801	0.95995	0.95402
ucrC	NaN	NaN	NaN	0.96042	0.94021	0.94837	0.93388	0.99940	0.96532	0.93212
ucrss	NaN	NaN	NaN	NaN	0.93761	0.96519	0.95474	0.95964	0.98007	0.93228
ucrss_wfilter	NaN	NaN	NaN	NaN	NaN	0.94615	0.96638	0.94095	0.95057	0.96651
uc_fast	NaN	NaN	NaN	NaN	NaN	NaN	0.96951	0.94818	0.97086	0.94925
ucr_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.93403	0.96671	0.96732
ucrC_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.96474	0.93239
ucrss_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.95120
ucrss_fast_wfilter	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	uc	ucr	ucrC	ucrss	ucrss_wfilter	uc_fast	ucr_fast	ucrC_fast	ucrss_fast	ucrss_fast_wfilter
uc	NaN	0.94742	0.89564	0.93433	0.94257	0.95997	0.93908	0.89794	0.90381	0.93587
ucr	NaN	NaN	0.89958	0.92423	0.94983	0.95082	0.92024	0.90377	0.87088	0.94386
ucrC	NaN	NaN	NaN	0.88576	0.92416	0.90725	0.91116	0.99398	0.83136	0.93889
ucrss	NaN	NaN	NaN	NaN	0.94404	0.92023	0.91664	0.88246	0.91841	0.91070
ucrss_wfilter	NaN	NaN	NaN	NaN	NaN	0.93333	0.91832	0.92575	0.89680	0.93162
uc_fast	NaN	NaN	NaN	NaN	NaN	NaN	0.95490	0.90857	0.88950	0.96583
ucr_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.91048	0.93554	0.95136
ucrC_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.82984	0.94028
ucrss_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.86558
ucrss_fast_wfilter	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	uc	ucr	ucrC	ucrss	ucrss_wfilter	uc_fast	ucr_fast	ucrC_fast	ucrss_fast	ucrss_fast_wfilter
uc	NaN	0.999998	0.983181	0.999997	0.999998	0.999996	0.999996	0.981320	0.999996	0.999995
ucr	NaN	NaN	0.983160	0.999999	0.999999	0.999996	0.999997	0.981302	0.999997	0.999996
ucrC	NaN	NaN	NaN	0.983147	0.983165	0.983156	0.983157	0.999455	0.983152	0.983168
ucrss	NaN	NaN	NaN	NaN	0.999998	0.999995	0.999996	0.981290	0.999996	0.999995
ucrss_wfilter	NaN	NaN	NaN	NaN	NaN	0.999995	0.999996	0.981309	0.999996	0.999995
uc_fast	NaN	NaN	NaN	NaN	NaN	NaN	0.999998	0.981299	0.999999	0.999998
ucr_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.981299	1.000000	1.000000
ucrC_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.981294	0.981310
ucrss_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.999999
ucrss_fast_wfilter	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	uc	ucr	ucrC	ucrss	ucrss_wfilter	uc_fast	ucr_fast	ucrC_fast	ucrss_fast	ucrss_fast_wfilter
uc	NaN	0.939448	0.849721	0.939429	0.939433	0.999694	0.939752	0.839735	0.939631	0.939830
ucr	NaN	NaN	0.820697	0.999992	0.999990	0.939538	0.998175	0.923148	0.998058	0.998126
ucrC	NaN	NaN	NaN	0.820677	0.820729	0.849934	0.819635	0.818193	0.819658	0.820185
ucrss	NaN	NaN	NaN	NaN	0.999982	0.939520	0.998170	0.923148	0.998053	0.998120
ucrss_wfilter	NaN	NaN	NaN	NaN	NaN	0.939526	0.998167	0.923154	0.998050	0.998120
uc_fast	NaN	NaN	NaN	NaN	NaN	NaN	0.939894	0.840394	0.939776	0.939982
ucr_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.920705	0.999882	0.999942
ucrC_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.920747	0.920906
ucrss_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.999826
ucrss_fast_wfilter	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	uc	ucr	ucrC	ucrss	ucrss_wfilter	uc_fast	ucr_fast	ucrC_fast	ucrss_fast	ucrss_fast_wfilter
uc	NaN	0.999997	0.997003	0.999992	0.999659	0.999997	0.999996	0.996827	0.999987	0.998449
ucr	NaN	NaN	0.997004	0.999998	0.999665	1.000000	1.000000	0.996831	0.999993	0.998460
ucrC	NaN	NaN	NaN	0.997000	0.997387	0.996999	0.997000	0.999844	0.997001	0.997601
ucrss	NaN	NaN	NaN	NaN	0.999665	0.999998	0.999999	0.996828	0.999995	0.998460
ucrss_wfilter	NaN	NaN	NaN	NaN	NaN	0.999664	0.999665	0.997216	0.999660	0.999489
uc_fast	NaN	NaN	NaN	NaN	NaN	NaN	1.000000	0.996824	0.999993	0.998460
ucr_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.996826	0.999994	0.998461
ucrC_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.996830	0.997426
ucrss_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.998457
ucrss_fast_wfilter	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	uc	ucr	ucrC	ucrss	ucrss_wfilter	uc_fast	ucr_fast	ucrC_fast	ucrss_fast	ucrss_fast_wfilter
uc	NaN	0.96386	0.928748	0.963730	0.963212	0.999477	0.923112	0.881559	0.923047	0.919501
ucr	NaN	NaN	0.963125	0.999977	0.999305	0.966905	0.954344	0.922883	0.954292	0.950571
ucrC	NaN	NaN	NaN	0.963242	0.963341	0.933715	0.925086	0.916942	0.925425	0.925011
ucrss	NaN	NaN	NaN	NaN	0.999292	0.966773	0.954287	0.923035	0.954257	0.950519
ucrss_wfilter	NaN	NaN	NaN	NaN	NaN	0.966236	0.953421	0.923123	0.953378	0.952243
uc_fast	NaN	NaN	NaN	NaN	NaN	NaN	0.927224	0.886843	0.927174	0.923658
ucr_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.884533	0.999968	0.996988
ucrC_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.884863	0.884356
ucrss_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.996978
ucrss_fast_wfilter	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	uc	ucr	ucrC	ucrss	ucrss_wfilter	uc_fast	ucr_fast	ucrC_fast	ucrss_fast	ucrss_fast_wfilter
uc	NaN	0.999992	0.998880	0.999992	0.999990	0.999989	0.999989	0.997646	0.999964	0.999987
ucr	NaN	NaN	0.998881	0.999999	0.999999	0.999991	0.999995	0.997640	0.999969	0.999994
ucrC	NaN	NaN	NaN	0.998878	0.998882	0.998895	0.998887	0.999008	0.998880	0.998878
ucrss	NaN	NaN	NaN	NaN	0.999998	0.999991	0.999995	0.997638	0.999970	0.999993
ucrss_wfilter	NaN	NaN	NaN	NaN	NaN	0.999991	0.999995	0.997641	0.999967	0.999995
uc_fast	NaN	NaN	NaN	NaN	NaN	NaN	0.999997	0.997654	0.999972	0.999993
ucr_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.997648	0.999976	0.999996
ucrC_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.997641	0.997641
ucrss_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.999968
ucrss_fast_wfilter	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	uc	ucr	ucrC	ucrss	ucrss_wfilter	uc_fast	ucr_fast	ucrC_fast	ucrss_fast	ucrss_fast_wfilter
uc	NaN	0.958787	0.899610	0.958781	0.958771	0.999723	0.912856	0.878695	0.913000	0.912872
ucr	NaN	NaN	0.917773	0.999997	0.999996	0.956702	0.966950	0.871130	0.966822	0.966951
ucrC	NaN	NaN	NaN	0.917759	0.917776	0.896388	0.892513	0.934549	0.892270	0.892576
ucrss	NaN	NaN	NaN	NaN	0.999993	0.956695	0.966942	0.871108	0.966815	0.966944
ucrss_wfilter	NaN	NaN	NaN	NaN	NaN	0.956685	0.966952	0.871175	0.966821	0.966957
uc_fast	NaN	NaN	NaN	NaN	NaN	NaN	0.911841	0.875543	0.911992	0.911853
ucr_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.854585	0.999900	0.999989
ucrC_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.854332	0.854682
ucrss_fast	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.999887
ucrss_fast_wfilter	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

data set id	88-soil	moving-picture	whole-body
uc	1220	27748	1095
ucr	1358	46576	1082
ucrC	226	28572	388
ucrss	1493	47207	1212
ucrss_wfilter	1885	76061	2088
uc_fast	914	23510	489
ucr_fast	1052	19371	621
ucrC_fast	44	2428	68
ucrss_fast	1021	23710	707
ucrss_fast_wfilter	1525	52811	1661

data set id	moving-picture
abbreviation
ucr_fast_O29_r82	21737
ucr_fast_O29_r97	16241
ucrss_fast_O29_r82	17812
ucrss_fast_O29_r97	16169
ucrss_fast_O29_s1	14911

	Average de novo OTUs (10K sequences per sample)	SD de novo OTUs (10K sequences per sample)	Average Reference OTUs (10k sequences per sample)	SD Reference OTUs (10k sequences per sample)	% novel diversity (10k seqs per sample)	% error novel diversity (10K seqs per sample)	number of samples
EnvironmentalBiome
mangrove biome	2169	1159	354	73	0.860	0.460	7
tropical humid forests	2398	260	397	35	0.858	0.094	26
tundra biome	1771	403	312	117	0.850	0.201	110
deserts and xeric shrubland biome	3917	127	707	15	0.847	0.028	7
taiga	2598	102	505	35	0.837	0.035	4
marine biome	2040	1048	484	410	0.808	0.446	890
aquatic biome	714	299	177	199	0.801	0.403	762
freshwater biome	768	541	194	120	0.798	0.576	375
warm deserts and semideserts	2386	473	607	147	0.797	0.166	97
tropical and subtropical moist broadleaf forest biome	3072	125	846	18	0.784	0.032	2
temperate needle-leaf forests or woodlands	2836	159	785	132	0.783	0.057	21
polar biome	1721	886	483	218	0.781	0.414	277
tropical and subtropical coniferous forest biome	1993	256	579	94	0.775	0.106	3
mixed island systems	1552	618	511	203	0.752	0.315	124
marginal sea	1795	325	611	225	0.746	0.164	7
temperate coniferous forest biome	2504	1206	885	201	0.739	0.361	19
mediterranean forests, woodlands, and shrub biome	695	361	275	195	0.717	0.424	371
large river biome	1844	629	743	369	0.713	0.282	5
terrestrial biome	2714	222	1138	163	0.705	0.072	627
nest of bird	821	276	355	138	0.698	0.262	313
Temperate broadleaf and mixed forest biome	1910	491	879	235	0.685	0.195	14
temperate grasslands	2745	290	1315	164	0.676	0.082	696
animal-associated habitat	758	329	376	240	0.668	0.359	1036
mammalia-associated habitat	973	357	583	222	0.625	0.270	1918
Cold-winter (continental) deserts and semideserts	847	210	551	215	0.606	0.215	102
Temperate grasslands, savannas, and shrubland biome	1688	272	1497	275	0.530	0.121	85
human-associated habitat	292	242	590	366	0.331	0.498	1597

	taxonomy	Test-Statistic
OTU
113212	k__Bacteria;p__Acidobacteria;c__DA052;o__Ellin...	55.858724
1123837	k__Bacteria;p__Actinobacteria;c__Rubrobacteria...	50.432793
New.ReferenceOTU22	k__Bacteria;p__Actinobacteria;c__Actinobacteri...	49.171544
252012	k__Bacteria;p__Proteobacteria;c__Gammaproteoba...	48.649686
843189	k__Bacteria;p__Acidobacteria;c__Solibacteres;o...	47.005758
1127423	k__Bacteria;p__Acidobacteria;c__Acidobacteriia...	43.869989
1129210	k__Bacteria;p__Acidobacteria;c__Acidobacteriia...	43.804418
831520	k__Bacteria;p__Actinobacteria;c__Rubrobacteria...	43.624935
1139779	k__Bacteria;p__Proteobacteria;c__Alphaproteoba...	41.862548
804187	k__Bacteria;p__Acidobacteria;c__[Chloracidobac...	41.151350

	taxonomy	Test-Statistic
OTU
368134	k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacill...	1599.696001
3154070	k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o_...	1625.703076
1000986	k__Bacteria;p__Actinobacteria;c__Actinobacteri...	1630.009468
1992	k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o_...	1728.164420
4304475	k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o_...	1545.444829
191238	k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo...	1546.435699
187665	k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo...	1474.529300
4396297	k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo...	1585.014965
3903651	k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo...	1670.188332
3472078	k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o_...	1783.487973

	taxonomy	Test-Statistic
OTU
4326219	k__Bacteria;p__Proteobacteria;c__Epsilonproteo...	363.881085
New.CleanUp.ReferenceOTU222	k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o_...	358.019614
4325533	k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o_...	349.852497
New.CleanUp.ReferenceOTU17550	k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo...	337.655669
316732	k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo...	337.309334
4346374	k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o_...	331.432928
4458959	k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo...	329.771577
3866487	k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo...	323.487958
4391641	k__Bacteria;p__Proteobacteria;c__Gammaproteoba...	311.999639
175751	k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo...	305.530588