In [1]:
from glob import glob
from os.path import split, splitext
from collections import defaultdict
import pandas as pd
import numpy as np
from scipy import stats
from skbio.draw.distributions import boxplots
# The three benchmark data sets and the canonical display order of the ten
# OTU-picking method abbreviations used for all tables below.
data_set_ids = ['88-soils', 'whole-body', 'moving-pictures']
order = ['uc','ucr','ucrC','ucrss','ucrss_wfilter',
'uc_fast','ucr_fast','ucrC_fast','ucrss_fast','ucrss_fast_wfilter']
# Table 1: per-method descriptions, indexed by method abbreviation.
method_descriptions = pd.io.parsers.read_csv('raw-tables/method-descriptions.tsv', sep='\t', index_col=0)
# Global switch: set True to also write every table to an .xlsx file under tables/.
write_xls_files = False
In [2]:
if write_xls_files: method_descriptions.to_excel('tables/table1.xlsx', na_rep='NA', float_format="%1.3f")
method_descriptions
Out[2]:
In [3]:
# Collect the alpha-diversity tables for every run of each data set, merge them
# on sample id, and compute cross-method correlation tables for each metric.
adiv_fp_sets = [glob('%s-otus/*/adiv*txt' % id_) for id_ in data_set_ids]
adiv_results = {}
for data_set_id, adiv_fp_set in zip(data_set_ids, adiv_fp_sets):
    df = None
    for fp in adiv_fp_set:
        # the run id (e.g. 'ucrss_fast') is the directory name under <data set>-otus/
        run_id = fp.split('/')[1]
        new_df = pd.DataFrame.from_csv(fp, sep='\t')
        # tag each metric column with its run id so the merged frame is unambiguous
        new_df.columns = ['%s.%s' % (c.lower(), run_id) for c in new_df.columns]
        if df is None:
            df = new_df
        else:
            df = pd.merge(df, new_df, right_index=True, left_index=True)
    for metric in ['pd', 'observed_species']:
        id_ = (data_set_id, metric)
        # select all columns whose name contains the metric name --
        # DataFrame.filter(like=...) replaces the previous clunky loc-based
        # list comprehension the original comment complained about
        corr_table = df.filter(like=metric).corr()
        # rename (drop the metric prefix) and re-order the rows and columns;
        # a single reindex call replaces the chained, deprecated reindex_axis
        corr_table.index = [c.split('.')[1] for c in corr_table.index]
        corr_table.columns = [c.split('.')[1] for c in corr_table.columns]
        corr_table = corr_table.reindex(index=order, columns=order)
        adiv_results[id_] = corr_table
In [4]:
# Tables 2a-2c: cross-method correlation tables for the 'pd' alpha-diversity
# metric, one table per data set.
if write_xls_files: adiv_results[('88-soils', 'pd')].to_excel('tables/table2a.xlsx', na_rep='NA', float_format="%1.3f")
adiv_results[('88-soils', 'pd')]
Out[4]:
In [5]:
if write_xls_files: adiv_results[('moving-pictures', 'pd')].to_excel('tables/table2b.xlsx', na_rep='NA', float_format="%1.3f")
adiv_results[('moving-pictures', 'pd')]
Out[5]:
In [6]:
if write_xls_files: adiv_results[('whole-body', 'pd')].to_excel('tables/table2c.xlsx', na_rep='NA', float_format="%1.3f")
adiv_results[('whole-body', 'pd')]
Out[6]:
In [7]:
# Tables 2d-2f: the same correlations for the 'observed_species' metric.
if write_xls_files: adiv_results[('88-soils', 'observed_species')].to_excel('tables/table2d.xlsx', na_rep='NA', float_format="%1.3f")
adiv_results[('88-soils', 'observed_species')]
Out[7]:
In [8]:
if write_xls_files: adiv_results[('moving-pictures', 'observed_species')].to_excel('tables/table2e.xlsx', na_rep='NA', float_format="%1.3f")
adiv_results[('moving-pictures', 'observed_species')]
Out[8]:
In [9]:
if write_xls_files: adiv_results[('whole-body', 'observed_species')].to_excel('tables/table2f.xlsx', na_rep='NA', float_format="%1.3f")
adiv_results[('whole-body', 'observed_species')]
Out[9]:
In [10]:
# Collect Mantel test results comparing beta-diversity distance matrices
# across methods: one table of Mantel r values per (data set, metric).
bdiv_results = {}
for metric in ['unweighted_unifrac', 'weighted_unifrac']:
    for data_set_id in data_set_ids:
        id_ = (data_set_id, metric)
        fp = '%s-otus/%s_mantel_results/mantel_results.txt' % id_
        # the first rows of the mantel output are header text, hence header=4
        df = pd.DataFrame.from_csv(fp, sep='\t', header=4, index_col=False)
        # long -> wide: rows/columns are the two compared distance matrices
        df = df.pivot(index='DM1', columns='DM2', values='Mantel r statistic')
        # re-name (keep only the run-id path component) and re-order rows and
        # columns; a single reindex call replaces the chained, deprecated
        # reindex_axis calls
        df.index = [e.split('/')[1] for e in df.index]
        df.columns = [e.split('/')[1] for e in df.columns]
        df = df.reindex(index=order, columns=order)
        bdiv_results[id_] = df
In [11]:
# Tables 3a-3c: Mantel r tables for unweighted UniFrac, one per data set.
if write_xls_files: bdiv_results[('88-soils', 'unweighted_unifrac')].to_excel('tables/table3a.xlsx', na_rep='NA', float_format="%1.3f")
bdiv_results[('88-soils', 'unweighted_unifrac')]
Out[11]:
In [12]:
if write_xls_files: bdiv_results[('moving-pictures', 'unweighted_unifrac')].to_excel('tables/table3b.xlsx', na_rep='NA', float_format="%1.3f")
bdiv_results[('moving-pictures', 'unweighted_unifrac')]
Out[12]:
In [13]:
if write_xls_files: bdiv_results[('whole-body', 'unweighted_unifrac')].to_excel('tables/table3c.xlsx', na_rep='NA', float_format="%1.3f")
bdiv_results[('whole-body', 'unweighted_unifrac')]
Out[13]:
In [14]:
# Tables 3d-3f: the same for weighted UniFrac.
if write_xls_files: bdiv_results[('88-soils', 'weighted_unifrac')].to_excel('tables/table3d.xlsx', na_rep='NA', float_format="%1.3f")
bdiv_results[('88-soils', 'weighted_unifrac')]
Out[14]:
In [15]:
if write_xls_files: bdiv_results[('moving-pictures', 'weighted_unifrac')].to_excel('tables/table3e.xlsx', na_rep='NA', float_format="%1.3f")
bdiv_results[('moving-pictures', 'weighted_unifrac')]
Out[15]:
In [16]:
if write_xls_files: bdiv_results[('whole-body', 'weighted_unifrac')].to_excel('tables/table3f.xlsx', na_rep='NA', float_format="%1.3f")
bdiv_results[('whole-body', 'weighted_unifrac')]
Out[16]:
In [17]:
# Load the precomputed per-level taxonomy correlation tables (levels 2-7)
# for each data set.
taxa_corr_results = {}
for data_set_id in data_set_ids:
    taxa_corr_results[data_set_id] = {}
    for level in range(2, 8):
        fp = "%s-otus/taxa_correlations/level_%d.csv" % (data_set_id, level)
        # the original called .copy() on the result, which is redundant:
        # from_csv already returns a fresh DataFrame on every iteration
        taxa_corr_results[data_set_id][level] = pd.DataFrame.from_csv(fp)
In [18]:
# Tables 4a/4b: taxonomy correlations for 88-soils at levels 2 and 7
# (the extremes of the levels loaded above).
if write_xls_files: taxa_corr_results['88-soils'][2].to_excel('tables/table4a.xlsx', na_rep='NA', float_format="%1.3f")
taxa_corr_results['88-soils'][2]
Out[18]:
In [19]:
if write_xls_files: taxa_corr_results['88-soils'][7].to_excel('tables/table4b.xlsx', na_rep='NA', float_format="%1.3f")
taxa_corr_results['88-soils'][7]
Out[19]:
In [20]:
# Tables 4c/4d: the same two levels for moving-pictures.
if write_xls_files: taxa_corr_results['moving-pictures'][2].to_excel('tables/table4c.xlsx', na_rep='NA', float_format="%1.3f")
taxa_corr_results['moving-pictures'][2]
Out[20]:
In [21]:
if write_xls_files: taxa_corr_results['moving-pictures'][7].to_excel('tables/table4d.xlsx', na_rep='NA', float_format="%1.3f")
taxa_corr_results['moving-pictures'][7]
Out[21]:
In [22]:
# Tables 4e/4f: the same two levels for whole-body.
if write_xls_files: taxa_corr_results['whole-body'][2].to_excel('tables/table4e.xlsx', na_rep='NA', float_format="%1.3f")
taxa_corr_results['whole-body'][2]
Out[22]:
In [23]:
if write_xls_files: taxa_corr_results['whole-body'][7].to_excel('tables/table4f.xlsx', na_rep='NA', float_format="%1.3f")
taxa_corr_results['whole-body'][7]
Out[23]:
In [24]:
# Map three-letter month abbreviations ('Jan' .. 'Dec') to month numbers 1-12.
# calendar.month_abbr[0] is the empty string, so enumerate's numbering lines
# up with calendar month numbers without any offset.
from datetime import datetime
from calendar import month_abbr
month_lookup = dict((abbr, number) for number, abbr in enumerate(month_abbr))
In [25]:
# Parse every workflow log file for its start/end "Logging ..." timestamps and
# build a per-method, per-data-set runtime table (seconds).
log_fps = glob('*/*/log*txt')
results = []
for log_fp in log_fps:
fp_fields = log_fp.split('/')
# NOTE(review): str.strip('-otus') strips the CHARACTER SET {-,o,t,u,s} from
# both ends, not the literal suffix '-otus': '88-soils-otus' -> '88-soil' and
# 'moving-pictures-otus' -> 'moving-picture'. Later cells index df with these
# truncated names (e.g. df['moving-picture']), so changing this to a proper
# suffix removal would break them; left as-is deliberately.
data_set_id = fp_fields[0].strip('-otus')
run_id = fp_fields[1]
# IPython line magic: grab the two "Logging started/stopped" lines from the log
d = !grep "^Logging" $log_fp
start_fields = d[0].split()
start_hour, start_minute, start_second = map(int, start_fields[3].split(':'))
start_day, start_month, start_year = map(int, [start_fields[5], month_lookup[start_fields[6]], start_fields[7]])
end_fields = d[1].split()
end_hour, end_minute, end_second = map(int, end_fields[3].split(':'))
end_day, end_month, end_year = map(int, [end_fields[5], month_lookup[end_fields[6]], end_fields[7]])
start_datetime = datetime(start_year, start_month, start_day, start_hour, start_minute, start_second)
end_datetime = datetime(end_year, end_month, end_day, end_hour, end_minute, end_second)
# NOTE(review): timedelta.seconds drops whole days -- a run spanning more
# than 24 h would be under-reported; total_seconds() would be safer. Confirm
# no run exceeded a day before relying on these numbers.
runtime = (end_datetime - start_datetime).seconds
results.append((run_id, data_set_id, runtime))
df = pd.DataFrame.from_records(results, columns=['abbreviation','data set id', 'runtime (s)'])
# attach processor counts etc. from the method-descriptions table
df = df.merge(method_descriptions, left_on="abbreviation", right_index=True)
# Table 5 covers only the runs that used at most 10 processors
df = df[df['processors'] <= 10]
df = df.pivot('abbreviation', 'data set id', 'runtime (s)')
df = df.reindex_axis(order, axis=0)
In [26]:
# Table 5: runtimes for the runs using <= 10 processors.
if write_xls_files: df.to_excel('tables/table5.xlsx', na_rep='NA', float_format="%1.3f")
df
Out[26]:
In [27]:
# Rebuild the runtime table from the same parsed results, this time keeping
# only the 29-processor runs (Table 6).
df = pd.DataFrame.from_records(results, columns=['abbreviation','data set id', 'runtime (s)'])
df = df.merge(method_descriptions, left_on="abbreviation", right_index=True)
df = df[df['processors'] == 29]
df = df.pivot('abbreviation', 'data set id', 'runtime (s)')
In [28]:
if write_xls_files: df.to_excel('tables/table6.xlsx', na_rep='NA', float_format="%1.3f")
df
Out[28]:
In [29]:
# Runtime difference between the ucr_fast and ucrss_fast 29-processor runs.
# NOTE(review): 'moving-picture' (no trailing 's') is intentional: the
# log-parsing cell derives data set ids via strip('-otus'), which strips the
# final 's' from 'moving-pictures-otus', so the pivoted column carries the
# truncated name.
df['moving-picture']['ucr_fast_O29_r82'] - df['moving-picture']['ucrss_fast_O29_r82']
Out[29]:
In [30]:
df['moving-picture']['ucr_fast_O29_r97'] - df['moving-picture']['ucrss_fast_O29_r97']
Out[30]:
In [31]:
# Table 7: precomputed fraction of novel sequences per environmental biome.
novel_by_biome = pd.read_csv('raw-tables/fraction-novel-by-env-biome.tsv',
                             sep='\t', index_col=0)
if write_xls_files:
    novel_by_biome.to_excel('tables/table7.xlsx', na_rep='NA', float_format="%1.3f")
novel_by_biome
Out[31]:
In [32]:
# Collect the group_significance results for every run of each data set,
# keeping the taxonomy and test statistic of the first n OTUs per table.
gs_fp_sets = [glob('%s-otus/*/group_significance*txt' % id_) for id_ in data_set_ids]
gs_results = {}
n = 25  # number of top rows retained per (data set, run) table
for data_set_id, gs_fp_set in zip(data_set_ids, gs_fp_sets):
    # NOTE: the original initialized an unused `df = None` here, a leftover
    # from the adiv cell's merge pattern; no merging happens in this loop.
    for fp in gs_fp_set:
        # the run id (e.g. 'ucrss') is the directory name under <data set>-otus/
        run_id = fp.split('/')[1]
        id_ = (data_set_id, run_id)
        new_df = pd.DataFrame.from_csv(fp, sep='\t')
        # NOTE(review): assumes the input files are already sorted by
        # significance, so the first n rows are the top n -- confirm upstream
        gs_results[id_] = new_df[['taxonomy', 'Test-Statistic']][:n]
In [33]:
# Tables 8a-8c: the 10 most significant OTUs from the ucrss run of each data set.
if write_xls_files: gs_results[('88-soils', 'ucrss')][:10].to_excel('tables/table8a.xlsx', na_rep='NA', float_format="%1.3f")
gs_results[('88-soils', 'ucrss')][:10]
Out[33]:
In [34]:
# Taxonomy string for one of the 'New.ReferenceOTU' entries in the table above.
gs_results[('88-soils', 'ucrss')]['taxonomy']['New.ReferenceOTU22']
Out[34]:
In [35]:
if write_xls_files: gs_results[('moving-pictures', 'ucrss')][:10].to_excel('tables/table8b.xlsx', na_rep='NA', float_format="%1.3f")
gs_results[('moving-pictures', 'ucrss')][:10]
Out[35]:
In [36]:
if write_xls_files: gs_results[('whole-body', 'ucrss')][:10].to_excel('tables/table8c.xlsx', na_rep='NA', float_format="%1.3f")
gs_results[('whole-body', 'ucrss')][:10]
Out[36]:
In [37]:
# Taxonomy strings for two specific whole-body OTUs of interest.
gs_results[('whole-body', 'ucrss')]['taxonomy']['New.CleanUp.ReferenceOTU222']
Out[37]:
In [38]:
gs_results[('whole-body', 'ucrss')]['taxonomy']['New.CleanUp.ReferenceOTU17550']
Out[38]:
In [39]:
from pylab import savefig
# Parse the EMP benchmark result files. Each file is named
# <prefix>_<subsample fraction>_<iteration>.txt and contains two whitespace-
# separated values: memory and wall time.
emp_time_dir = 'emp-analysis/uc_time/'
emp_time_fps = glob('%s/*txt' % emp_time_dir)
mems = defaultdict(list)
times = defaultdict(list)
total_seqs = 5594412  # total sequence count of the full input
for fp in emp_time_fps:
    fn = splitext(split(fp)[1])[0]
    _, subsample_size, iteration = fn.split('_')
    # BUG FIX: the original used open(fp).read(), leaking the file handle;
    # the with-block closes it deterministically
    with open(fp) as f:
        mem, time = f.read().strip().split()
    # the filename encodes the subsample as a fraction of total_seqs
    subsample_seqs = float(subsample_size) * total_seqs
    # NOTE(review): mem is divided by 1e6 -- presumably KB -> GB; confirm units
    mems[subsample_seqs].append(float(mem) / 1000000)
    # seconds -> days (the downstream figure is labeled "Runtime (days)")
    times[subsample_seqs].append(float(time) / 60 / 60 / 24)
In [40]:
# Figure 2: runtime (days) versus sequence count, with a linear fit to the
# per-subsample medians.
time_data = sorted(times.items())
# NOTE(review): drops the 9 smallest subsample sizes from the plot/fit --
# presumably too noisy; confirm the cutoff
time_data = time_data[9:]
num_seqs, time_distributions = zip(*time_data)
ax = boxplots(time_distributions, [e/1000000 for e in num_seqs], ['%1.1e' % e for e in num_seqs])
slope, intercept, r_value, p_value, std_err = stats.linregress(num_seqs, [np.median(x) for x in time_distributions])
# BUG FIX: the annotation claims "r-squared" but previously reported the raw
# r value from linregress; square it so the label is truthful
label = ["Runtime (days) = \n (%1.2e x Sequence Count) + %1.2f" % (slope, intercept),
         "\nr-squared: %1.2f\np=%1.2e" % (r_value ** 2, p_value)]
ax.text(0.15, 0.70, "\n".join(label))
if write_xls_files: savefig('tables/Figure2.pdf')
In [41]:
def time_in_days(num_sequences, slope=0.02, intercept=-21618.68):
    """Predict runtime in days for a given sequence count.

    The default slope/intercept are the (rounded) coefficients, in seconds
    per sequence, of the linear fit computed in the figure cell above;
    callers may pass the exact fitted values instead. The linear prediction
    is computed in seconds and converted to days.
    """
    return (slope * num_sequences + intercept) / 60 / 60 / 24
In [42]:
# Predicted runtime for the sequence count defined above (total_seqs).
time_in_days(total_seqs)
Out[42]:
In [43]:
# Predicted runtime for a much larger hypothetical input (~661M sequences);
# NOTE(review): the provenance of this constant is not shown here -- confirm.
time_in_days(661202560)
Out[43]:
Confirming that the closed-reference steps of ucr
and ucrss
have equal numbers of sequences failing to hit the reference.
$ wc -l *-otus/ucrss/step1_otus/*_failures.txt *-otus/ucr/uclust_ref_picked_otus/*_failures.txt
79662 88-soils-otus/ucrss/step1_otus/study_103_split_library_seqs_failures.txt
79662 88-soils-otus/ucr/uclust_ref_picked_otus/study_103_split_library_seqs_failures.txt
2805981 moving-pictures-otus/ucrss/step1_otus/study_550_split_library_seqs_failures.txt
2805981 moving-pictures-otus/ucr/uclust_ref_picked_otus/study_550_split_library_seqs_failures.txt
58736 whole-body-otus/ucrss/step1_otus/study_449_split_library_seqs_failures.txt
58736 whole-body-otus/ucr/uclust_ref_picked_otus/study_449_split_library_seqs_failures.txt
$ wc -l *-otus/ucrss_fast/step1_otus/*_failures.txt *-otus/ucr_fast/uclust_ref_picked_otus/*_failures.txt
84565 88-soils-otus/ucrss_fast/step1_otus/study_103_split_library_seqs_failures.txt
84565 88-soils-otus/ucr_fast/uclust_ref_picked_otus/study_103_split_library_seqs_failures.txt
3432893 moving-pictures-otus/ucrss_fast/step1_otus/study_550_split_library_seqs_failures.txt
3432893 moving-pictures-otus/ucr_fast/uclust_ref_picked_otus/study_550_split_library_seqs_failures.txt
73254 whole-body-otus/ucrss_fast/step1_otus/study_449_split_library_seqs_failures.txt
73254 whole-body-otus/ucr_fast/uclust_ref_picked_otus/study_449_split_library_seqs_failures.txt
Computing the number of sequences being clustered in each data set.
count_seqs.py -i $MOVING_PICTURES_SEQS,$SOILS_SEQS,$WHOLE_BODY_SEQS
151387 : /home/caporaso/analysis/88-soils/study_103_split_library_seqs.fna (Sequence lengths (mean +/- std): 230.7818 +/- 11.4761)
792831 : /home/caporaso/analysis/whole-body/study_449_split_library_seqs.fna (Sequence lengths (mean +/- std): 228.5124 +/- 16.0318)
68666081 : /home/caporaso/analysis/moving-pictures/study_550_split_library_seqs.fna (Sequence lengths (mean +/- std): 123.2359 +/- 17.4283)
In [44]:
mem_data = mems.items()
mem_data.sort()
mem_data = mem_data[10:]
num_seqs, mem_distributions = zip(*mem_data)
ax = boxplots(mem_distributions, [e/1000000 for e in num_seqs], ['%1.1e' % e for e in num_seqs])
slope, intercept, r_value, p_value, std_err = stats.linregress(num_seqs, [np.median(x) for x in mem_distributions])
print "time = %1.2f * subsample_size + %1.2f" % (slope, intercept)
print "r-squared: %1.2f, p=%f" % (r_value, p_value)
In [44]: