In [1]:
from glob import glob
from os.path import split, splitext
from collections import defaultdict

import pandas as pd
import numpy as np
from scipy import stats
from skbio.draw.distributions import boxplots

# Identifiers of the analysed data sets; input paths below are built as
# "<id>-otus/...".
data_set_ids = [
    '88-soils',
    'whole-body',
    'moving-pictures',
]

# Row/column order applied to the method-by-method tables.
order = [
    'uc', 'ucr', 'ucrC', 'ucrss', 'ucrss_wfilter',
    'uc_fast', 'ucr_fast', 'ucrC_fast', 'ucrss_fast', 'ucrss_fast_wfilter',
]

# One row per method abbreviation (first column of the TSV is the index).
method_descriptions = pd.read_csv(
    'raw-tables/method-descriptions.tsv',
    sep='\t',
    index_col=0,
)

# Set to True to also write every displayed table/figure under tables/.
write_xls_files = False

Table 1: Method descriptions


In [2]:
# Table 1: export the method descriptions when requested, then display them.
if write_xls_files:
    method_descriptions.to_excel(
        'tables/table1.xlsx', na_rep='NA', float_format="%1.3f")
method_descriptions


Out[2]:
title command max_accepts max_rejects stepwords wordlength prefilter_percent_id min_otu_size speed_mode processors reference_percent_id subsample_fraction
abbreviation
uc De novo pick_de_novo_otus.py 20 500 20 12 NaN NaN slow 1 0.97 NaN
ucr Legacy open reference pick_de_novo_otus.py 20 500 20 12 NaN NaN slow 10 0.97 NaN
ucrC Closed reference pick_closed_reference_otus.py 20 500 20 12 NaN NaN slow 10 0.97 NaN
ucrss Subsampled open reference pick_open_reference_otus.py 20 500 20 12 0.0 1 slow 10 0.97 0.001
ucrss_wfilter Subsampled open reference, filtered pick_open_reference_otus.py 20 500 20 12 0.6 1 slow 10 0.97 0.001
uc_fast De novo, fast settings pick_de_novo_otus.py 1 8 8 8 NaN NaN fast 1 0.97 NaN
ucr_fast Legacy open reference, fast settings pick_de_novo_otus.py 1 8 8 8 NaN NaN fast 10 0.97 NaN
ucrC_fast Closed reference, fast settings pick_closed_reference_otus.py 1 8 8 8 NaN NaN fast 10 0.97 NaN
ucrss_fast Subsampled open reference, fast settings pick_open_reference_otus.py 1 8 8 8 0.0 1 fast 10 0.97 0.001
ucrss_fast_wfilter Subsampled open reference, filtered, fast sett... pick_open_reference_otus.py 1 8 8 8 0.6 1 fast 10 0.97 0.001
ucr_fast_O29_r82 Legacy open reference, fast settings, 82% refe... pick_de_novo_otus.py 1 8 8 8 0.0 1 fast 29 0.82 0.001
ucr_fast_O29_r97 Legacy open reference, fast settings, 29 proce... pick_de_novo_otus.py 1 8 8 8 0.0 1 fast 29 0.97 0.001
ucrss_fast_O29_r82 Subsampled open reference, fast settings, 82% ... pick_open_reference_otus.py 1 8 8 8 0.0 1 fast 29 0.82 0.001
ucrss_fast_O29_r97 Subsampled open reference, fast settings, 29 p... pick_open_reference_otus.py 1 8 8 8 0.0 1 fast 29 0.97 0.001
ucrss_fast_O29_s1 Subsampled open reference, fast settings, 29 p... pick_open_reference_otus.py 1 8 8 8 0.0 1 fast 29 0.97 0.100

Table 2: Alpha diversity correlations by method


In [3]:
# For every (data set, alpha diversity metric) pair, build a method-by-method
# correlation table from the per-run alpha diversity files.
adiv_fp_sets = [glob('%s-otus/*/adiv*txt' % id_) for id_ in data_set_ids]
adiv_results = {}
for data_set_id, adiv_fp_set in zip(data_set_ids, adiv_fp_sets):
    df = None
    for fp in adiv_fp_set:
        # paths look like <data set>-otus/<run id>/adiv*txt
        run_id = fp.split('/')[1]
        # read_csv(index_col=0, parse_dates=True) reproduces the defaults of
        # DataFrame.from_csv, which current pandas releases no longer provide
        new_df = pd.read_csv(fp, sep='\t', index_col=0, parse_dates=True)
        # suffix each column with its run id so names stay unique after merging
        new_df.columns = ['%s.%s' % (c.lower(), run_id) for c in new_df.columns]
        if df is None:
            df = new_df
        else:
            # join the runs on their row index
            df = pd.merge(df, new_df, right_index=True, left_index=True)
    for metric in ['pd', 'observed_species']:
        id_ = (data_set_id, metric)
        # filter(like=...) keeps the columns whose name contains the metric
        corr_table = df.filter(like=metric).corr()
        # rename rows and columns to the bare run id ...
        corr_table.index = [c.split('.')[1] for c in corr_table.index]
        corr_table.columns = [c.split('.')[1] for c in corr_table.columns]
        # ... and put both axes in the shared display order
        corr_table = corr_table.reindex(index=order, columns=order)
        adiv_results[id_] = corr_table

In [4]:
# Table 2a: PD correlations for 88-soils (exported only when requested).
if write_xls_files:
    adiv_results[('88-soils', 'pd')].to_excel(
        'tables/table2a.xlsx', na_rep='NA', float_format="%1.3f")
adiv_results[('88-soils', 'pd')]


Out[4]:
uc ucr ucrC ucrss ucrss_wfilter uc_fast ucr_fast ucrC_fast ucrss_fast ucrss_fast_wfilter
uc 1.000000 0.950810 0.932749 0.933510 0.952565 0.955606 0.936157 0.926662 0.947673 0.946606
ucr 0.950810 1.000000 0.902187 0.930701 0.930193 0.945996 0.940277 0.903199 0.952477 0.943609
ucrC 0.932749 0.902187 1.000000 0.893631 0.908745 0.905111 0.914053 0.977595 0.902405 0.910766
ucrss 0.933510 0.930701 0.893631 1.000000 0.929412 0.944127 0.935030 0.893611 0.947867 0.948666
ucrss_wfilter 0.952565 0.930193 0.908745 0.929412 1.000000 0.951712 0.933274 0.902546 0.930598 0.943046
uc_fast 0.955606 0.945996 0.905111 0.944127 0.951712 1.000000 0.952704 0.898282 0.956454 0.960493
ucr_fast 0.936157 0.940277 0.914053 0.935030 0.933274 0.952704 1.000000 0.914263 0.950392 0.952351
ucrC_fast 0.926662 0.903199 0.977595 0.893611 0.902546 0.898282 0.914263 1.000000 0.902216 0.902507
ucrss_fast 0.947673 0.952477 0.902405 0.947867 0.930598 0.956454 0.950392 0.902216 1.000000 0.962338
ucrss_fast_wfilter 0.946606 0.943609 0.910766 0.948666 0.943046 0.960493 0.952351 0.902507 0.962338 1.000000

In [5]:
# Table 2b: PD correlations for moving-pictures (exported only when requested).
if write_xls_files:
    adiv_results[('moving-pictures', 'pd')].to_excel(
        'tables/table2b.xlsx', na_rep='NA', float_format="%1.3f")
adiv_results[('moving-pictures', 'pd')]


Out[5]:
uc ucr ucrC ucrss ucrss_wfilter uc_fast ucr_fast ucrC_fast ucrss_fast ucrss_fast_wfilter
uc 1.000000 0.995736 0.992960 0.995918 0.995596 0.994955 0.995661 0.992341 0.995695 0.995864
ucr 0.995736 1.000000 0.992644 0.996527 0.997235 0.995112 0.995891 0.992371 0.995859 0.996714
ucrC 0.992960 0.992644 1.000000 0.994053 0.991472 0.994132 0.993919 0.997675 0.994638 0.994202
ucrss 0.995918 0.996527 0.994053 1.000000 0.996130 0.996035 0.996789 0.993736 0.996859 0.997034
ucrss_wfilter 0.995596 0.997235 0.991472 0.996130 1.000000 0.994251 0.995429 0.991241 0.995604 0.996436
uc_fast 0.994955 0.995112 0.994132 0.996035 0.994251 1.000000 0.996530 0.994057 0.996510 0.996299
ucr_fast 0.995661 0.995891 0.993919 0.996789 0.995429 0.996530 1.000000 0.993591 0.996812 0.996927
ucrC_fast 0.992341 0.992371 0.997675 0.993736 0.991241 0.994057 0.993591 1.000000 0.994279 0.993998
ucrss_fast 0.995695 0.995859 0.994638 0.996859 0.995604 0.996510 0.996812 0.994279 1.000000 0.996822
ucrss_fast_wfilter 0.995864 0.996714 0.994202 0.997034 0.996436 0.996299 0.996927 0.993998 0.996822 1.000000

In [6]:
# Table 2c: PD correlations for whole-body (exported only when requested).
if write_xls_files:
    adiv_results[('whole-body', 'pd')].to_excel(
        'tables/table2c.xlsx', na_rep='NA', float_format="%1.3f")
adiv_results[('whole-body', 'pd')]


Out[6]:
uc ucr ucrC ucrss ucrss_wfilter uc_fast ucr_fast ucrC_fast ucrss_fast ucrss_fast_wfilter
uc 1.000000 0.985137 0.957040 0.984981 0.984775 0.984090 0.985665 0.961456 0.982975 0.984321
ucr 0.985137 1.000000 0.955564 0.989950 0.988868 0.987748 0.987070 0.959910 0.987496 0.985933
ucrC 0.957040 0.955564 1.000000 0.961397 0.958381 0.958510 0.960640 0.990411 0.952869 0.961033
ucrss 0.984981 0.989950 0.961397 1.000000 0.990865 0.987546 0.989638 0.964492 0.989253 0.987182
ucrss_wfilter 0.984775 0.988868 0.958381 0.990865 1.000000 0.985185 0.988750 0.963320 0.987431 0.984912
uc_fast 0.984090 0.987748 0.958510 0.987546 0.985185 1.000000 0.986204 0.960893 0.986490 0.985077
ucr_fast 0.985665 0.987070 0.960640 0.989638 0.988750 0.986204 1.000000 0.964924 0.987977 0.988788
ucrC_fast 0.961456 0.959910 0.990411 0.964492 0.963320 0.960893 0.964924 1.000000 0.957054 0.965236
ucrss_fast 0.982975 0.987496 0.952869 0.989253 0.987431 0.986490 0.987977 0.957054 1.000000 0.986223
ucrss_fast_wfilter 0.984321 0.985933 0.961033 0.987182 0.984912 0.985077 0.988788 0.965236 0.986223 1.000000

In [7]:
# Table 2d: observed-species correlations for 88-soils (exported only when requested).
if write_xls_files:
    adiv_results[('88-soils', 'observed_species')].to_excel(
        'tables/table2d.xlsx', na_rep='NA', float_format="%1.3f")
adiv_results[('88-soils', 'observed_species')]


Out[7]:
uc ucr ucrC ucrss ucrss_wfilter uc_fast ucr_fast ucrC_fast ucrss_fast ucrss_fast_wfilter
uc 1.000000 0.947875 0.879873 0.909483 0.924238 0.935259 0.933534 0.876867 0.924938 0.912961
ucr 0.947875 1.000000 0.904662 0.946052 0.947025 0.947314 0.952570 0.902627 0.938155 0.932401
ucrC 0.879873 0.904662 1.000000 0.925964 0.888274 0.882334 0.907653 0.972994 0.910432 0.895664
ucrss 0.909483 0.946052 0.925964 1.000000 0.931844 0.923371 0.934916 0.915025 0.930894 0.928529
ucrss_wfilter 0.924238 0.947025 0.888274 0.931844 1.000000 0.943087 0.946177 0.884305 0.932178 0.926691
uc_fast 0.935259 0.947314 0.882334 0.923371 0.943087 1.000000 0.942244 0.882958 0.940951 0.939832
ucr_fast 0.933534 0.952570 0.907653 0.934916 0.946177 0.942244 1.000000 0.908362 0.942923 0.931863
ucrC_fast 0.876867 0.902627 0.972994 0.915025 0.884305 0.882958 0.908362 1.000000 0.904095 0.905611
ucrss_fast 0.924938 0.938155 0.910432 0.930894 0.932178 0.940951 0.942923 0.904095 1.000000 0.952921
ucrss_fast_wfilter 0.912961 0.932401 0.895664 0.928529 0.926691 0.939832 0.931863 0.905611 0.952921 1.000000

In [8]:
# Table 2e: observed-species correlations for moving-pictures (exported only when requested).
if write_xls_files:
    adiv_results[('moving-pictures', 'observed_species')].to_excel(
        'tables/table2e.xlsx', na_rep='NA', float_format="%1.3f")
adiv_results[('moving-pictures', 'observed_species')]


Out[8]:
uc ucr ucrC ucrss ucrss_wfilter uc_fast ucr_fast ucrC_fast ucrss_fast ucrss_fast_wfilter
uc 1.000000 0.992461 0.983973 0.992078 0.992313 0.988647 0.990092 0.978429 0.989429 0.989610
ucr 0.992461 1.000000 0.994303 0.998253 0.998186 0.991920 0.996989 0.990556 0.996875 0.997030
ucrC 0.983973 0.994303 1.000000 0.994695 0.994748 0.983908 0.993300 0.996874 0.993777 0.993611
ucrss 0.992078 0.998253 0.994695 1.000000 0.998285 0.991935 0.997188 0.991311 0.997034 0.997209
ucrss_wfilter 0.992313 0.998186 0.994748 0.998285 1.000000 0.991651 0.997157 0.991281 0.996954 0.997097
uc_fast 0.988647 0.991920 0.983908 0.991935 0.991651 1.000000 0.992937 0.980686 0.991777 0.991937
ucr_fast 0.990092 0.996989 0.993300 0.997188 0.997157 0.992937 1.000000 0.992169 0.997906 0.998061
ucrC_fast 0.978429 0.990556 0.996874 0.991311 0.991281 0.980686 0.992169 1.000000 0.992683 0.992397
ucrss_fast 0.989429 0.996875 0.993777 0.997034 0.996954 0.991777 0.997906 0.992683 1.000000 0.997988
ucrss_fast_wfilter 0.989610 0.997030 0.993611 0.997209 0.997097 0.991937 0.998061 0.992397 0.997988 1.000000

In [9]:
# Table 2f: observed-species correlations for whole-body (exported only when requested).
if write_xls_files:
    adiv_results[('whole-body', 'observed_species')].to_excel(
        'tables/table2f.xlsx', na_rep='NA', float_format="%1.3f")
adiv_results[('whole-body', 'observed_species')]


Out[9]:
uc ucr ucrC ucrss ucrss_wfilter uc_fast ucr_fast ucrC_fast ucrss_fast ucrss_fast_wfilter
uc 1.000000 0.986013 0.970654 0.986054 0.986288 0.992712 0.987933 0.971591 0.988239 0.987425
ucr 0.986013 1.000000 0.983916 0.995433 0.995315 0.986973 0.992700 0.979658 0.992898 0.992813
ucrC 0.970654 0.983916 1.000000 0.984706 0.983561 0.970210 0.980584 0.992099 0.980124 0.979129
ucrss 0.986054 0.995433 0.984706 1.000000 0.995365 0.986947 0.992549 0.980568 0.992644 0.992455
ucrss_wfilter 0.986288 0.995315 0.983561 0.995365 1.000000 0.986130 0.992597 0.979232 0.991829 0.992433
uc_fast 0.992712 0.986973 0.970210 0.986947 0.986130 1.000000 0.988851 0.971697 0.989586 0.988207
ucr_fast 0.987933 0.992700 0.980584 0.992549 0.992597 0.988851 1.000000 0.981125 0.994334 0.994478
ucrC_fast 0.971591 0.979658 0.992099 0.980568 0.979232 0.971697 0.981125 1.000000 0.981586 0.979289
ucrss_fast 0.988239 0.992898 0.980124 0.992644 0.991829 0.989586 0.994334 0.981586 1.000000 0.994629
ucrss_fast_wfilter 0.987425 0.992813 0.979129 0.992455 0.992433 0.988207 0.994478 0.979289 0.994629 1.000000

Table 3: Beta diversity correlation by method


In [10]:
# For every (data set, beta diversity metric) pair, reshape the long-format
# Mantel results into a method-by-method table of Mantel r statistics.
bdiv_results = {}
for metric in ['unweighted_unifrac', 'weighted_unifrac']:
    for data_set_id in data_set_ids:
        id_ = (data_set_id, metric)
        fp = '%s-otus/%s_mantel_results/mantel_results.txt' % id_
        # Same arguments the original passed to DataFrame.from_csv (plus its
        # implicit parse_dates=True); from_csv itself is no longer available
        # in current pandas. header=4 takes the column names from row 4
        # (0-based) of the file.
        df = pd.read_csv(fp, sep='\t', header=4, index_col=False, parse_dates=True)
        df = df.pivot(index='DM1', columns='DM2', values='Mantel r statistic')
        # DM1/DM2 entries are paths; keep only their second component
        df.index = [e.split('/')[1] for e in df.index]
        df.columns = [e.split('/')[1] for e in df.columns]
        # put both axes in the shared display order
        df = df.reindex(index=order, columns=order)
        bdiv_results[id_] = df

In [11]:
# Table 3a: unweighted UniFrac Mantel r for 88-soils (exported only when requested).
if write_xls_files:
    bdiv_results[('88-soils', 'unweighted_unifrac')].to_excel(
        'tables/table3a.xlsx', na_rep='NA', float_format="%1.3f")
bdiv_results[('88-soils', 'unweighted_unifrac')]


Out[11]:
uc ucr ucrC ucrss ucrss_wfilter uc_fast ucr_fast ucrC_fast ucrss_fast ucrss_fast_wfilter
uc NaN 0.93487 0.90823 0.94364 0.94208 0.93926 0.94472 0.90865 0.94259 0.94149
ucr NaN NaN 0.91458 0.93982 0.94465 0.93425 0.94212 0.91776 0.94393 0.94881
ucrC NaN NaN NaN 0.91656 0.90959 0.92586 0.91308 0.94977 0.91709 0.91966
ucrss NaN NaN NaN NaN 0.93954 0.93791 0.94464 0.91370 0.93830 0.94230
ucrss_wfilter NaN NaN NaN NaN NaN 0.93439 0.94291 0.90714 0.94156 0.94136
uc_fast NaN NaN NaN NaN NaN NaN 0.93829 0.91969 0.93888 0.94116
ucr_fast NaN NaN NaN NaN NaN NaN NaN 0.90853 0.94642 0.94651
ucrC_fast NaN NaN NaN NaN NaN NaN NaN NaN 0.91736 0.92433
ucrss_fast NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.94494
ucrss_fast_wfilter NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

In [12]:
# Table 3b: unweighted UniFrac Mantel r for moving-pictures (exported only when requested).
if write_xls_files:
    bdiv_results[('moving-pictures', 'unweighted_unifrac')].to_excel(
        'tables/table3b.xlsx', na_rep='NA', float_format="%1.3f")
bdiv_results[('moving-pictures', 'unweighted_unifrac')]


Out[12]:
uc ucr ucrC ucrss ucrss_wfilter uc_fast ucr_fast ucrC_fast ucrss_fast ucrss_fast_wfilter
uc NaN 0.99196 0.97423 0.98823 0.98824 0.99195 0.99050 0.97744 0.99128 0.99194
ucr NaN NaN 0.98180 0.99162 0.99146 0.99107 0.99236 0.98375 0.99256 0.99300
ucrC NaN NaN NaN 0.98624 0.98470 0.97287 0.98162 0.99387 0.98050 0.98086
ucrss NaN NaN NaN NaN 0.98983 0.98847 0.99174 0.98729 0.99157 0.99134
ucrss_wfilter NaN NaN NaN NaN NaN 0.98587 0.98971 0.98566 0.98950 0.99055
uc_fast NaN NaN NaN NaN NaN NaN 0.99087 0.97569 0.99175 0.99148
ucr_fast NaN NaN NaN NaN NaN NaN NaN 0.98330 0.99293 0.99220
ucrC_fast NaN NaN NaN NaN NaN NaN NaN NaN 0.98231 0.98313
ucrss_fast NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.99275
ucrss_fast_wfilter NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

In [13]:
# Table 3c: unweighted UniFrac Mantel r for whole-body (exported only when requested).
if write_xls_files:
    bdiv_results[('whole-body', 'unweighted_unifrac')].to_excel(
        'tables/table3c.xlsx', na_rep='NA', float_format="%1.3f")
bdiv_results[('whole-body', 'unweighted_unifrac')]


Out[13]:
uc ucr ucrC ucrss ucrss_wfilter uc_fast ucr_fast ucrC_fast ucrss_fast ucrss_fast_wfilter
uc NaN 0.93502 0.89081 0.93757 0.93633 0.92982 0.92633 0.88941 0.93279 0.92547
ucr NaN NaN 0.89946 0.94835 0.95012 0.93418 0.93139 0.89533 0.94144 0.92738
ucrC NaN NaN NaN 0.90765 0.89852 0.87809 0.88532 0.95207 0.89723 0.87790
ucrss NaN NaN NaN NaN 0.95252 0.93798 0.93623 0.90499 0.94548 0.92768
ucrss_wfilter NaN NaN NaN NaN NaN 0.93661 0.93996 0.89442 0.94125 0.93176
uc_fast NaN NaN NaN NaN NaN NaN 0.94199 0.87167 0.93906 0.93754
ucr_fast NaN NaN NaN NaN NaN NaN NaN 0.88760 0.93944 0.94811
ucrC_fast NaN NaN NaN NaN NaN NaN NaN NaN 0.89139 0.87890
ucrss_fast NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.93340
ucrss_fast_wfilter NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

In [14]:
# Table 3d: weighted UniFrac Mantel r for 88-soils (exported only when requested).
if write_xls_files:
    bdiv_results[('88-soils', 'weighted_unifrac')].to_excel(
        'tables/table3d.xlsx', na_rep='NA', float_format="%1.3f")
bdiv_results[('88-soils', 'weighted_unifrac')]


Out[14]:
uc ucr ucrC ucrss ucrss_wfilter uc_fast ucr_fast ucrC_fast ucrss_fast ucrss_fast_wfilter
uc NaN 0.89611 0.93569 0.95121 0.90142 0.92525 0.93708 0.92377 0.95616 0.90235
ucr NaN NaN 0.89623 0.88943 0.96587 0.89103 0.93933 0.89456 0.90146 0.94684
ucrC NaN NaN NaN 0.91903 0.91355 0.90626 0.92836 0.98438 0.93128 0.89627
ucrss NaN NaN NaN NaN 0.89965 0.91691 0.94744 0.90325 0.94914 0.89874
ucrss_wfilter NaN NaN NaN NaN NaN 0.88544 0.93761 0.91087 0.89856 0.93959
uc_fast NaN NaN NaN NaN NaN NaN 0.90934 0.89783 0.91872 0.87395
ucr_fast NaN NaN NaN NaN NaN NaN NaN 0.92017 0.95189 0.95986
ucrC_fast NaN NaN NaN NaN NaN NaN NaN NaN 0.91809 0.89003
ucrss_fast NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.91784
ucrss_fast_wfilter NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

In [15]:
# Table 3e: weighted UniFrac Mantel r for moving-pictures (exported only when requested).
if write_xls_files:
    bdiv_results[('moving-pictures', 'weighted_unifrac')].to_excel(
        'tables/table3e.xlsx', na_rep='NA', float_format="%1.3f")
bdiv_results[('moving-pictures', 'weighted_unifrac')]


Out[15]:
uc ucr ucrC ucrss ucrss_wfilter uc_fast ucr_fast ucrC_fast ucrss_fast ucrss_fast_wfilter
uc NaN 0.97127 0.94939 0.96991 0.97333 0.97169 0.97675 0.94938 0.97358 0.96645
ucr NaN NaN 0.92765 0.95164 0.95177 0.95722 0.95820 0.92801 0.95995 0.95402
ucrC NaN NaN NaN 0.96042 0.94021 0.94837 0.93388 0.99940 0.96532 0.93212
ucrss NaN NaN NaN NaN 0.93761 0.96519 0.95474 0.95964 0.98007 0.93228
ucrss_wfilter NaN NaN NaN NaN NaN 0.94615 0.96638 0.94095 0.95057 0.96651
uc_fast NaN NaN NaN NaN NaN NaN 0.96951 0.94818 0.97086 0.94925
ucr_fast NaN NaN NaN NaN NaN NaN NaN 0.93403 0.96671 0.96732
ucrC_fast NaN NaN NaN NaN NaN NaN NaN NaN 0.96474 0.93239
ucrss_fast NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.95120
ucrss_fast_wfilter NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

In [16]:
# Table 3f: weighted UniFrac Mantel r for whole-body (exported only when requested).
if write_xls_files:
    bdiv_results[('whole-body', 'weighted_unifrac')].to_excel(
        'tables/table3f.xlsx', na_rep='NA', float_format="%1.3f")
bdiv_results[('whole-body', 'weighted_unifrac')]


Out[16]:
uc ucr ucrC ucrss ucrss_wfilter uc_fast ucr_fast ucrC_fast ucrss_fast ucrss_fast_wfilter
uc NaN 0.94742 0.89564 0.93433 0.94257 0.95997 0.93908 0.89794 0.90381 0.93587
ucr NaN NaN 0.89958 0.92423 0.94983 0.95082 0.92024 0.90377 0.87088 0.94386
ucrC NaN NaN NaN 0.88576 0.92416 0.90725 0.91116 0.99398 0.83136 0.93889
ucrss NaN NaN NaN NaN 0.94404 0.92023 0.91664 0.88246 0.91841 0.91070
ucrss_wfilter NaN NaN NaN NaN NaN 0.93333 0.91832 0.92575 0.89680 0.93162
uc_fast NaN NaN NaN NaN NaN NaN 0.95490 0.90857 0.88950 0.96583
ucr_fast NaN NaN NaN NaN NaN NaN NaN 0.91048 0.93554 0.95136
ucrC_fast NaN NaN NaN NaN NaN NaN NaN NaN 0.82984 0.94028
ucrss_fast NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.86558
ucrss_fast_wfilter NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

Table 4: Taxa summary correlation by method


In [17]:
# Load the precomputed taxa-summary correlation tables, keyed first by data
# set id and then by level (2 through 7).
taxa_corr_results = {}
for data_set_id in data_set_ids:
    taxa_corr_results[data_set_id] = {}
    for level in range(2, 8):
        fp = "%s-otus/taxa_correlations/level_%d.csv" % (data_set_id, level)
        # read_csv(index_col=0, parse_dates=True) reproduces the defaults of
        # DataFrame.from_csv, which current pandas releases no longer provide.
        # Each iteration reads a fresh frame, so no defensive copy is needed.
        df = pd.read_csv(fp, index_col=0, parse_dates=True)
        taxa_corr_results[data_set_id][level] = df

In [18]:
# Table 4a: level-2 taxa correlations for 88-soils (exported only when requested).
if write_xls_files:
    taxa_corr_results['88-soils'][2].to_excel(
        'tables/table4a.xlsx', na_rep='NA', float_format="%1.3f")
taxa_corr_results['88-soils'][2]


Out[18]:
uc ucr ucrC ucrss ucrss_wfilter uc_fast ucr_fast ucrC_fast ucrss_fast ucrss_fast_wfilter
uc NaN 0.999998 0.983181 0.999997 0.999998 0.999996 0.999996 0.981320 0.999996 0.999995
ucr NaN NaN 0.983160 0.999999 0.999999 0.999996 0.999997 0.981302 0.999997 0.999996
ucrC NaN NaN NaN 0.983147 0.983165 0.983156 0.983157 0.999455 0.983152 0.983168
ucrss NaN NaN NaN NaN 0.999998 0.999995 0.999996 0.981290 0.999996 0.999995
ucrss_wfilter NaN NaN NaN NaN NaN 0.999995 0.999996 0.981309 0.999996 0.999995
uc_fast NaN NaN NaN NaN NaN NaN 0.999998 0.981299 0.999999 0.999998
ucr_fast NaN NaN NaN NaN NaN NaN NaN 0.981299 1.000000 1.000000
ucrC_fast NaN NaN NaN NaN NaN NaN NaN NaN 0.981294 0.981310
ucrss_fast NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.999999
ucrss_fast_wfilter NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

In [19]:
# Table 4b: level-7 taxa correlations for 88-soils (exported only when requested).
if write_xls_files:
    taxa_corr_results['88-soils'][7].to_excel(
        'tables/table4b.xlsx', na_rep='NA', float_format="%1.3f")
taxa_corr_results['88-soils'][7]


Out[19]:
uc ucr ucrC ucrss ucrss_wfilter uc_fast ucr_fast ucrC_fast ucrss_fast ucrss_fast_wfilter
uc NaN 0.939448 0.849721 0.939429 0.939433 0.999694 0.939752 0.839735 0.939631 0.939830
ucr NaN NaN 0.820697 0.999992 0.999990 0.939538 0.998175 0.923148 0.998058 0.998126
ucrC NaN NaN NaN 0.820677 0.820729 0.849934 0.819635 0.818193 0.819658 0.820185
ucrss NaN NaN NaN NaN 0.999982 0.939520 0.998170 0.923148 0.998053 0.998120
ucrss_wfilter NaN NaN NaN NaN NaN 0.939526 0.998167 0.923154 0.998050 0.998120
uc_fast NaN NaN NaN NaN NaN NaN 0.939894 0.840394 0.939776 0.939982
ucr_fast NaN NaN NaN NaN NaN NaN NaN 0.920705 0.999882 0.999942
ucrC_fast NaN NaN NaN NaN NaN NaN NaN NaN 0.920747 0.920906
ucrss_fast NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.999826
ucrss_fast_wfilter NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

In [20]:
# Table 4c: level-2 taxa correlations for moving-pictures (exported only when requested).
if write_xls_files:
    taxa_corr_results['moving-pictures'][2].to_excel(
        'tables/table4c.xlsx', na_rep='NA', float_format="%1.3f")
taxa_corr_results['moving-pictures'][2]


Out[20]:
uc ucr ucrC ucrss ucrss_wfilter uc_fast ucr_fast ucrC_fast ucrss_fast ucrss_fast_wfilter
uc NaN 0.999997 0.997003 0.999992 0.999659 0.999997 0.999996 0.996827 0.999987 0.998449
ucr NaN NaN 0.997004 0.999998 0.999665 1.000000 1.000000 0.996831 0.999993 0.998460
ucrC NaN NaN NaN 0.997000 0.997387 0.996999 0.997000 0.999844 0.997001 0.997601
ucrss NaN NaN NaN NaN 0.999665 0.999998 0.999999 0.996828 0.999995 0.998460
ucrss_wfilter NaN NaN NaN NaN NaN 0.999664 0.999665 0.997216 0.999660 0.999489
uc_fast NaN NaN NaN NaN NaN NaN 1.000000 0.996824 0.999993 0.998460
ucr_fast NaN NaN NaN NaN NaN NaN NaN 0.996826 0.999994 0.998461
ucrC_fast NaN NaN NaN NaN NaN NaN NaN NaN 0.996830 0.997426
ucrss_fast NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.998457
ucrss_fast_wfilter NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

In [21]:
# Table 4d: level-7 taxa correlations for moving-pictures (exported only when requested).
if write_xls_files:
    taxa_corr_results['moving-pictures'][7].to_excel(
        'tables/table4d.xlsx', na_rep='NA', float_format="%1.3f")
taxa_corr_results['moving-pictures'][7]


Out[21]:
uc ucr ucrC ucrss ucrss_wfilter uc_fast ucr_fast ucrC_fast ucrss_fast ucrss_fast_wfilter
uc NaN 0.96386 0.928748 0.963730 0.963212 0.999477 0.923112 0.881559 0.923047 0.919501
ucr NaN NaN 0.963125 0.999977 0.999305 0.966905 0.954344 0.922883 0.954292 0.950571
ucrC NaN NaN NaN 0.963242 0.963341 0.933715 0.925086 0.916942 0.925425 0.925011
ucrss NaN NaN NaN NaN 0.999292 0.966773 0.954287 0.923035 0.954257 0.950519
ucrss_wfilter NaN NaN NaN NaN NaN 0.966236 0.953421 0.923123 0.953378 0.952243
uc_fast NaN NaN NaN NaN NaN NaN 0.927224 0.886843 0.927174 0.923658
ucr_fast NaN NaN NaN NaN NaN NaN NaN 0.884533 0.999968 0.996988
ucrC_fast NaN NaN NaN NaN NaN NaN NaN NaN 0.884863 0.884356
ucrss_fast NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.996978
ucrss_fast_wfilter NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

In [22]:
# Table 4e: level-2 taxa correlations for whole-body (exported only when requested).
if write_xls_files:
    taxa_corr_results['whole-body'][2].to_excel(
        'tables/table4e.xlsx', na_rep='NA', float_format="%1.3f")
taxa_corr_results['whole-body'][2]


Out[22]:
uc ucr ucrC ucrss ucrss_wfilter uc_fast ucr_fast ucrC_fast ucrss_fast ucrss_fast_wfilter
uc NaN 0.999992 0.998880 0.999992 0.999990 0.999989 0.999989 0.997646 0.999964 0.999987
ucr NaN NaN 0.998881 0.999999 0.999999 0.999991 0.999995 0.997640 0.999969 0.999994
ucrC NaN NaN NaN 0.998878 0.998882 0.998895 0.998887 0.999008 0.998880 0.998878
ucrss NaN NaN NaN NaN 0.999998 0.999991 0.999995 0.997638 0.999970 0.999993
ucrss_wfilter NaN NaN NaN NaN NaN 0.999991 0.999995 0.997641 0.999967 0.999995
uc_fast NaN NaN NaN NaN NaN NaN 0.999997 0.997654 0.999972 0.999993
ucr_fast NaN NaN NaN NaN NaN NaN NaN 0.997648 0.999976 0.999996
ucrC_fast NaN NaN NaN NaN NaN NaN NaN NaN 0.997641 0.997641
ucrss_fast NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.999968
ucrss_fast_wfilter NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

In [23]:
# Table 4f: level-7 taxa correlations for whole-body (exported only when requested).
if write_xls_files:
    taxa_corr_results['whole-body'][7].to_excel(
        'tables/table4f.xlsx', na_rep='NA', float_format="%1.3f")
taxa_corr_results['whole-body'][7]


Out[23]:
uc ucr ucrC ucrss ucrss_wfilter uc_fast ucr_fast ucrC_fast ucrss_fast ucrss_fast_wfilter
uc NaN 0.958787 0.899610 0.958781 0.958771 0.999723 0.912856 0.878695 0.913000 0.912872
ucr NaN NaN 0.917773 0.999997 0.999996 0.956702 0.966950 0.871130 0.966822 0.966951
ucrC NaN NaN NaN 0.917759 0.917776 0.896388 0.892513 0.934549 0.892270 0.892576
ucrss NaN NaN NaN NaN 0.999993 0.956695 0.966942 0.871108 0.966815 0.966944
ucrss_wfilter NaN NaN NaN NaN NaN 0.956685 0.966952 0.871175 0.966821 0.966957
uc_fast NaN NaN NaN NaN NaN NaN 0.911841 0.875543 0.911992 0.911853
ucr_fast NaN NaN NaN NaN NaN NaN NaN 0.854585 0.999900 0.999989
ucrC_fast NaN NaN NaN NaN NaN NaN NaN NaN 0.854332 0.854682
ucrss_fast NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.999887
ucrss_fast_wfilter NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

Table 5: Run time by method


In [24]:
from datetime import datetime
from calendar import month_abbr

# Month abbreviation -> month number. month_abbr[0] is the empty string,
# so the real months map to 1..12.
month_lookup = dict((abbr, number) for number, abbr in enumerate(month_abbr))

In [25]:
def _parse_log_timestamp(line):
    """Return the datetime encoded in one "Logging ..." line of a run log.

    The line is split on whitespace; field 3 is read as HH:MM:SS and fields
    5, 6 and 7 as day, month abbreviation and year. The month abbreviation
    is translated to a number through ``month_lookup``.
    """
    fields = line.split()
    hour, minute, second = map(int, fields[3].split(':'))
    day, month, year = int(fields[5]), month_lookup[fields[6]], int(fields[7])
    return datetime(year, month, day, hour, minute, second)


# Collect (run id, data set id, runtime in seconds) for every run log found.
log_fps = glob('*/*/log*txt')

results = []
for log_fp in log_fps:
    fp_fields = log_fp.split('/')
    # NOTE(review): str.strip('-otus') strips any of the characters
    # '-', 'o', 't', 'u', 's' from both ends rather than removing the
    # "-otus" suffix, so '88-soils-otus' becomes '88-soil'. Left unchanged
    # because later cells index the result by the truncated labels
    # (e.g. 'moving-picture').
    data_set_id = fp_fields[0].strip('-otus')
    run_id = fp_fields[1]
    # Pure-Python equivalent of `grep "^Logging"`: works for paths containing
    # spaces and does not depend on IPython's shell escape.
    with open(log_fp) as log_f:
        logging_lines = [line for line in log_f if line.startswith('Logging')]
    # the first matching line is treated as the start, the second as the end
    start_datetime = _parse_log_timestamp(logging_lines[0])
    end_datetime = _parse_log_timestamp(logging_lines[1])
    # total_seconds() includes whole days; timedelta.seconds alone would
    # silently drop them for runs lasting 24 hours or more
    runtime = int((end_datetime - start_datetime).total_seconds())
    results.append((run_id, data_set_id, runtime))

df = pd.DataFrame.from_records(results, columns=['abbreviation','data set id', 'runtime (s)'])
df = df.merge(method_descriptions, left_on="abbreviation", right_index=True)
# keep only the runs that used at most 10 processors
df = df[df['processors'] <= 10]
df = df.pivot(index='abbreviation', columns='data set id', values='runtime (s)')
df = df.reindex(index=order)

In [26]:
# Table 5: run time by method (exported only when requested).
if write_xls_files:
    df.to_excel(
        'tables/table5.xlsx', na_rep='NA', float_format="%1.3f")
df


Out[26]:
data set id 88-soil moving-picture whole-body
uc 1220 27748 1095
ucr 1358 46576 1082
ucrC 226 28572 388
ucrss 1493 47207 1212
ucrss_wfilter 1885 76061 2088
uc_fast 914 23510 489
ucr_fast 1052 19371 621
ucrC_fast 44 2428 68
ucrss_fast 1021 23710 707
ucrss_fast_wfilter 1525 52811 1661

Table 6: Subsampled OTU parameter variations


In [27]:
# Rebuild the run-time frame from the collected records, this time keeping
# only the runs that used 29 processors.
df = pd.DataFrame.from_records(
    results,
    columns=['abbreviation', 'data set id', 'runtime (s)'],
).merge(method_descriptions, left_on="abbreviation", right_index=True)
df = df[df['processors'] == 29].pivot(
    index='abbreviation',
    columns='data set id',
    values='runtime (s)',
)

In [28]:
# Table 6: run times of the 29-processor parameter variations
# (exported only when requested).
if write_xls_files:
    df.to_excel(
        'tables/table6.xlsx', na_rep='NA', float_format="%1.3f")
df


Out[28]:
data set id moving-picture
abbreviation
ucr_fast_O29_r82 21737
ucr_fast_O29_r97 16241
ucrss_fast_O29_r82 17812
ucrss_fast_O29_r97 16169
ucrss_fast_O29_s1 14911

In [29]:
# Run-time difference (seconds) between ucr and ucrss at 82% reference identity.
df.loc['ucr_fast_O29_r82', 'moving-picture'] - df.loc['ucrss_fast_O29_r82', 'moving-picture']


Out[29]:
3925

In [30]:
# Run-time difference (seconds) between ucr and ucrss at 97% reference identity.
df.loc['ucr_fast_O29_r97', 'moving-picture'] - df.loc['ucrss_fast_O29_r97', 'moving-picture']


Out[30]:
72

Table 7: Novel OTUs by biome


In [31]:
# Table 7: load the per-biome summary, export it when requested, then display it.
novel_by_biome = pd.read_csv(
    'raw-tables/fraction-novel-by-env-biome.tsv',
    sep='\t',
    index_col=0,
)
if write_xls_files:
    novel_by_biome.to_excel(
        'tables/table7.xlsx', na_rep='NA', float_format="%1.3f")
novel_by_biome


Out[31]:
Average de novo OTUs (10K sequences per sample) SD de novo OTUs (10K sequences per sample) Average Reference OTUs (10k sequences per sample) SD Reference OTUs (10k sequences per sample) % novel diversity (10k seqs per sample) % error novel diversity (10K seqs per sample) number of samples
EnvironmentalBiome
mangrove biome 2169 1159 354 73 0.860 0.460 7
tropical humid forests 2398 260 397 35 0.858 0.094 26
tundra biome 1771 403 312 117 0.850 0.201 110
deserts and xeric shrubland biome 3917 127 707 15 0.847 0.028 7
taiga 2598 102 505 35 0.837 0.035 4
marine biome 2040 1048 484 410 0.808 0.446 890
aquatic biome 714 299 177 199 0.801 0.403 762
freshwater biome 768 541 194 120 0.798 0.576 375
warm deserts and semideserts 2386 473 607 147 0.797 0.166 97
tropical and subtropical moist broadleaf forest biome 3072 125 846 18 0.784 0.032 2
temperate needle-leaf forests or woodlands 2836 159 785 132 0.783 0.057 21
polar biome 1721 886 483 218 0.781 0.414 277
tropical and subtropical coniferous forest biome 1993 256 579 94 0.775 0.106 3
mixed island systems 1552 618 511 203 0.752 0.315 124
marginal sea 1795 325 611 225 0.746 0.164 7
temperate coniferous forest biome 2504 1206 885 201 0.739 0.361 19
mediterranean forests, woodlands, and shrub biome 695 361 275 195 0.717 0.424 371
large river biome 1844 629 743 369 0.713 0.282 5
terrestrial biome 2714 222 1138 163 0.705 0.072 627
nest of bird 821 276 355 138 0.698 0.262 313
Temperate broadleaf and mixed forest biome 1910 491 879 235 0.685 0.195 14
temperate grasslands 2745 290 1315 164 0.676 0.082 696
animal-associated habitat 758 329 376 240 0.668 0.359 1036
mammalia-associated habitat 973 357 583 222 0.625 0.270 1918
Cold-winter (continental) deserts and semideserts 847 210 551 215 0.606 0.215 102
Temperate grasslands, savannas, and shrubland biome 1688 272 1497 275 0.530 0.121 85
human-associated habitat 292 242 590 366 0.331 0.498 1597

Table 8: Differential OTU representation


In [32]:
# For every (data set, run) pair keep the taxonomy and test statistic of the
# first n rows of its group significance file.
gs_fp_sets = [glob('%s-otus/*/group_significance*txt' % id_) for id_ in data_set_ids]
gs_results = {}
n = 25

for data_set_id, gs_fp_set in zip(data_set_ids, gs_fp_sets):
    # (the original reset `df = None` here; it was never read in this cell and
    # only clobbered the frame displayed by earlier cells)
    for fp in gs_fp_set:
        # paths look like <data set>-otus/<run id>/group_significance*txt
        run_id = fp.split('/')[1]
        id_ = (data_set_id, run_id)
        # read_csv(index_col=0, parse_dates=True) reproduces the defaults of
        # DataFrame.from_csv, which current pandas releases no longer provide
        new_df = pd.read_csv(fp, sep='\t', index_col=0, parse_dates=True)
        gs_results[id_] = new_df[['taxonomy', 'Test-Statistic']].iloc[:n]

In [33]:
# Table 8a: first ten rows for 88-soils / ucrss (exported only when requested).
if write_xls_files:
    gs_results[('88-soils', 'ucrss')].iloc[:10].to_excel(
        'tables/table8a.xlsx', na_rep='NA', float_format="%1.3f")
gs_results[('88-soils', 'ucrss')].iloc[:10]


Out[33]:
taxonomy Test-Statistic
OTU
113212 k__Bacteria;p__Acidobacteria;c__DA052;o__Ellin... 55.858724
1123837 k__Bacteria;p__Actinobacteria;c__Rubrobacteria... 50.432793
New.ReferenceOTU22 k__Bacteria;p__Actinobacteria;c__Actinobacteri... 49.171544
252012 k__Bacteria;p__Proteobacteria;c__Gammaproteoba... 48.649686
843189 k__Bacteria;p__Acidobacteria;c__Solibacteres;o... 47.005758
1127423 k__Bacteria;p__Acidobacteria;c__Acidobacteriia... 43.869989
1129210 k__Bacteria;p__Acidobacteria;c__Acidobacteriia... 43.804418
831520 k__Bacteria;p__Actinobacteria;c__Rubrobacteria... 43.624935
1139779 k__Bacteria;p__Proteobacteria;c__Alphaproteoba... 41.862548
804187 k__Bacteria;p__Acidobacteria;c__[Chloracidobac... 41.151350

In [34]:
# Full (untruncated) taxonomy string of the new reference OTU listed above.
gs_results[('88-soils', 'ucrss')].loc['New.ReferenceOTU22', 'taxonomy']


Out[34]:
'k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__;g__;s__'

In [35]:
# Table 8b: first ten rows for moving-pictures / ucrss (exported only when requested).
if write_xls_files:
    gs_results[('moving-pictures', 'ucrss')].iloc[:10].to_excel(
        'tables/table8b.xlsx', na_rep='NA', float_format="%1.3f")
gs_results[('moving-pictures', 'ucrss')].iloc[:10]


Out[35]:
taxonomy Test-Statistic
OTU
368134 k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacill... 1599.696001
3154070 k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o_... 1625.703076
1000986 k__Bacteria;p__Actinobacteria;c__Actinobacteri... 1630.009468
1992 k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o_... 1728.164420
4304475 k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o_... 1545.444829
191238 k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo... 1546.435699
187665 k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo... 1474.529300
4396297 k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo... 1585.014965
3903651 k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo... 1670.188332
3472078 k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o_... 1783.487973

In [36]:
# Table 8c: first ten rows for whole-body / ucrss (exported only when requested).
if write_xls_files:
    gs_results[('whole-body', 'ucrss')].iloc[:10].to_excel(
        'tables/table8c.xlsx', na_rep='NA', float_format="%1.3f")
gs_results[('whole-body', 'ucrss')].iloc[:10]


Out[36]:
taxonomy Test-Statistic
OTU
4326219 k__Bacteria;p__Proteobacteria;c__Epsilonproteo... 363.881085
New.CleanUp.ReferenceOTU222 k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o_... 358.019614
4325533 k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o_... 349.852497
New.CleanUp.ReferenceOTU17550 k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo... 337.655669
316732 k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo... 337.309334
4346374 k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o_... 331.432928
4458959 k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo... 329.771577
3866487 k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo... 323.487958
4391641 k__Bacteria;p__Proteobacteria;c__Gammaproteoba... 311.999639
175751 k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo... 305.530588

In [37]:
# Full (untruncated) taxonomy string of New.CleanUp.ReferenceOTU222.
gs_results[('whole-body', 'ucrss')].loc['New.CleanUp.ReferenceOTU222', 'taxonomy']


Out[37]:
'k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae;g__Prevotella;s__melaninogenica'

In [38]:
# Full (untruncated) taxonomy string of New.CleanUp.ReferenceOTU17550.
gs_results[('whole-body', 'ucrss')].loc['New.CleanUp.ReferenceOTU17550', 'taxonomy']


Out[38]:
'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Veillonellaceae;g__Veillonella;s__parvula'

Figure 2: De novo OTU picking run time versus number of sequences


In [39]:
from pylab import savefig

# Collect memory and run-time measurements from the result files under
# emp_time_dir. Each file name (minus extension) splits on '_' into exactly
# three parts: a prefix, the subsample size, and the iteration. Each file's
# contents are two whitespace-separated numbers: memory, then run time.
emp_time_dir = 'emp-analysis/uc_time/'
emp_time_fps = glob('%s/*txt' % emp_time_dir)
# Both dicts map number-of-sequences -> list of measurements (one per file).
mems = defaultdict(list)
times = defaultdict(list)
total_seqs = 5594412
for fp in emp_time_fps:
    fn = splitext(split(fp)[1])[0]
    _, subsample_size, iteration = fn.split('_')
    # Use a context manager so each file handle is closed as soon as it has
    # been read, instead of leaking one open handle per result file.
    with open(fp) as f:
        mem, time = f.read().strip().split()
    # subsample_size is presumably a fraction of total_seqs -- TODO confirm.
    subsample_seqs = float(subsample_size) * total_seqs
    # NOTE(review): memory is scaled by 1e6; the source unit is not shown here.
    mems[subsample_seqs].append(float(mem) / 1000000)
    # Run time is converted to days (assumes the files record seconds).
    times[subsample_seqs].append(float(time) / 60 / 60 / 24)

In [40]:
# Figure 2: box plots of run time (days) for each subsample size, annotated
# with a linear fit of the median run time against the number of sequences.
time_data = sorted(times.items())
# Drop the nine smallest subsample sizes before plotting and fitting.
# NOTE(review): the memory cell drops ten -- confirm the difference is intended.
time_data = time_data[9:]
num_seqs, time_distributions = zip(*time_data)
ax = boxplots(time_distributions, [e/1000000 for e in num_seqs], ['%1.1e' % e for e in num_seqs])

slope, intercept, r_value, p_value, std_err = stats.linregress(num_seqs, [np.median(x) for x in time_distributions])
# linregress returns the correlation coefficient r, so it is squared here to
# match the "r-squared" label.
label = ["Runtime (days) = \n  (%1.2e x Sequence Count) + %1.2f" % (slope, intercept),
         "\nr-squared: %1.2f\np=%1.2e" % (r_value ** 2, p_value)]

ax.text(0.15,0.70,"\n".join(label))
if write_xls_files: savefig('tables/Figure2.pdf')



In [41]:
def time_in_days(num_sequences, slope=0.02, intercept=-21618.68):
    """Estimate run time in days from a linear model of sequence count.

    The linear model ``slope * num_sequences + intercept`` is evaluated and
    the result is divided by 60, 60 and 24, i.e. the model output is treated
    as seconds and converted to days.

    NOTE(review): the default coefficients are hard-coded and presumably come
    from an earlier regression of run time (in seconds) against sequence
    count -- confirm their origin.

    Parameters
    ----------
    num_sequences : int or float
        Number of sequences to estimate the run time for.
    slope : float, optional
        Model slope (per sequence). Defaults to 0.02.
    intercept : float, optional
        Model intercept. Defaults to -21618.68.

    Returns
    -------
    float
        Estimated run time in days.
    """
    seconds = slope * num_sequences + intercept
    return seconds / 60 / 60 / 24

In [42]:
# Estimated run time, in days, for total_seqs sequences.
time_in_days(total_seqs)


Out[42]:
1.044786574074074

In [43]:
# Extrapolate the run-time estimate to 661,202,560 sequences.
# NOTE(review): the origin of this sequence count is not stated here -- confirm.
time_in_days(661202560)


Out[43]:
152.80593194444444

Miscellaneous notes

Confirming that the closed-reference steps of ucr and ucrss have equal numbers of sequences failing to hit the reference.

$ wc -l *-otus/ucrss/step1_otus/*_failures.txt *-otus/ucr/uclust_ref_picked_otus/*_failures.txt
    79662 88-soils-otus/ucrss/step1_otus/study_103_split_library_seqs_failures.txt
    79662 88-soils-otus/ucr/uclust_ref_picked_otus/study_103_split_library_seqs_failures.txt
  2805981 moving-pictures-otus/ucrss/step1_otus/study_550_split_library_seqs_failures.txt
  2805981 moving-pictures-otus/ucr/uclust_ref_picked_otus/study_550_split_library_seqs_failures.txt
    58736 whole-body-otus/ucrss/step1_otus/study_449_split_library_seqs_failures.txt
    58736 whole-body-otus/ucr/uclust_ref_picked_otus/study_449_split_library_seqs_failures.txt

$ wc -l *-otus/ucrss_fast/step1_otus/*_failures.txt *-otus/ucr_fast/uclust_ref_picked_otus/*_failures.txt
    84565 88-soils-otus/ucrss_fast/step1_otus/study_103_split_library_seqs_failures.txt
    84565 88-soils-otus/ucr_fast/uclust_ref_picked_otus/study_103_split_library_seqs_failures.txt
  3432893 moving-pictures-otus/ucrss_fast/step1_otus/study_550_split_library_seqs_failures.txt
  3432893 moving-pictures-otus/ucr_fast/uclust_ref_picked_otus/study_550_split_library_seqs_failures.txt
    73254 whole-body-otus/ucrss_fast/step1_otus/study_449_split_library_seqs_failures.txt
    73254 whole-body-otus/ucr_fast/uclust_ref_picked_otus/study_449_split_library_seqs_failures.txt

Computing the number of sequences being clustered in each data set.

count_seqs.py -i $MOVING_PICTURES_SEQS,$SOILS_SEQS,$WHOLE_BODY_SEQS

151387  : /home/caporaso/analysis/88-soils/study_103_split_library_seqs.fna (Sequence lengths (mean +/- std): 230.7818 +/- 11.4761)
792831  : /home/caporaso/analysis/whole-body/study_449_split_library_seqs.fna (Sequence lengths (mean +/- std): 228.5124 +/- 16.0318)
68666081  : /home/caporaso/analysis/moving-pictures/study_550_split_library_seqs.fna (Sequence lengths (mean +/- std): 123.2359 +/- 17.4283)

In [44]:
# Box plots of memory use for each subsample size, plus a linear fit of the
# median memory use against the number of sequences.
mem_data = sorted(mems.items())
# Drop the ten smallest subsample sizes before plotting and fitting.
# NOTE(review): the run-time cell drops nine -- confirm the difference is intended.
mem_data = mem_data[10:]
num_seqs, mem_distributions = zip(*mem_data)
ax = boxplots(mem_distributions, [e/1000000 for e in num_seqs], ['%1.1e' % e for e in num_seqs])

slope, intercept, r_value, p_value, std_err = stats.linregress(num_seqs, [np.median(x) for x in mem_distributions])
# This is the memory fit, so it is labelled as such (it was previously
# labelled "time"). The slope is shown in scientific notation because it is
# far smaller than 0.01 and would otherwise print as 0.00.
print("memory = %1.2e * subsample_size + %1.2f" % (slope, intercept))
# linregress returns the correlation coefficient r, so it is squared here to
# match the "r-squared" label.
print("r-squared: %1.2f, p=%f" % (r_value ** 2, p_value))


time = 0.00 * subsample_size + -1.30
r-squared: 1.00, p=0.000000

In [44]: