Goal

  • Trying varying levels of bandwidth and DBL scaling with pre-fractionation abundances ('DBL-comm')

Init


In [115]:
import os
import glob
import re
import nestly

In [116]:
%load_ext rpy2.ipython
%load_ext pushnote


The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython
The pushnote extension is already loaded. To reload it, use:
  %reload_ext pushnote

In [117]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)
library(phyloseq)

BD min/max


In [118]:
%%R
## min G+C cutoff
min_GC = 13.5
## max G+C cutoff
max_GC = 80
## max G+C shift
max_13C_shift_in_BD = 0.036


min_BD = min_GC/100.0 * 0.098 + 1.66    
max_BD = max_GC/100.0 * 0.098 + 1.66    

max_BD = max_BD + max_13C_shift_in_BD

cat('Min BD:', min_BD, '\n')
cat('Max BD:', max_BD, '\n')


Min BD: 1.67323 
Max BD: 1.7744 

Nestly

  • assuming fragments already simulated

In [126]:
workDir = '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/'
buildDir = os.path.join(workDir, 'rep4_DBL-comm_bw')
R_dir = '/home/nick/notebook/SIPSim/lib/R/'

fragFile = '/home/nick/notebook/SIPSim/dev/bac_genome1147/validation/ampFrags_kde_parsed.pkl'
commFile = '/home/nick/notebook/SIPSim/dev/fullCyc/fullCyc_12C-Con_trm_comm.txt'

# emperical data for validation
emp_shan_file = '/home/nick/notebook/SIPSim/dev/fullCyc_trim/SIP-core_unk_shan.txt'
emp_BDspan_file = '/home/nick/notebook/SIPSim/dev/fullCyc_trim/SIP-core_unk_trm_BD-span.txt'
emp_corr_file = '/home/nick/notebook/SIPSim/dev/fullCyc_trim/SIP-core_unk_trm_corr.txt'

nreps = 4
nprocs = 10

In [127]:
# building tree structure
nest = nestly.Nest()

# varying params
nest.add('DBL_scaling', [0, 0.2, 0.4, 0.8])
nest.add('bandwidth', [0.06, 0.6])
nest.add('rep', [x + 1 for x in xrange(nreps)])


## set params
nest.add('abs', ['1e9'], create_dir=False)
nest.add('percIncorp', [0], create_dir=False)
nest.add('percTaxa', [0], create_dir=False)
nest.add('np', [nprocs], create_dir=False)
nest.add('subsample_dist', ['lognormal'], create_dir=False)
nest.add('subsample_mean', [9.432], create_dir=False)
nest.add('subsample_scale', [0.5], create_dir=False)
nest.add('subsample_min', [10000], create_dir=False)
nest.add('subsample_max', [30000], create_dir=False)

### input/output files
nest.add('buildDir', [buildDir], create_dir=False)
nest.add('R_dir', [R_dir], create_dir=False)
nest.add('fragFile', [fragFile], create_dir=False)
nest.add('commFile', [commFile], create_dir=False)


# building directory tree
nest.build(buildDir)

# bash file to run
bashFile = os.path.join(buildDir, 'SIPSimRun.sh')

In [128]:
%%writefile $bashFile
#!/bin/bash

export PATH={R_dir}:$PATH

echo '#-- SIPSim pipeline --#'

echo '# shuffling taxa in comm file'
comm_shuffle_taxa.r {commFile} > comm.txt

    
echo '# adding diffusion'    
SIPSim diffusion \
    {fragFile} \
    --bw {bandwidth} \
    --np {np} \
    > ampFrags_KDE_dif.pkl    

echo '# adding DBL contamination; abundance-weighted smearing'
SIPSim DBL \
    ampFrags_KDE_dif.pkl \
    --comm comm.txt \
    --commx {DBL_scaling} \
    --bw {bandwidth} \
    --np {np} \
    > ampFrags_KDE_dif_DBL.pkl
    
echo '# making incorp file'
SIPSim incorpConfigExample \
  --percTaxa {percTaxa} \
  --percIncorpUnif {percIncorp} \
  > {percTaxa}_{percIncorp}.config

echo '# adding isotope incorporation to BD distribution'
SIPSim isotope_incorp \
    ampFrags_KDE_dif_DBL.pkl \
    {percTaxa}_{percIncorp}.config \
    --comm comm.txt \
    --bw {bandwidth} \
    --np {np} \
    > ampFrags_KDE_dif_DBL_inc.pkl

echo '# simulating gradient fractions'
SIPSim gradient_fractions \
    comm.txt \
    > fracs.txt 

echo '# simulating an OTU table'
SIPSim OTU_table \
    ampFrags_KDE_dif_DBL_inc.pkl \
    comm.txt \
    fracs.txt \
    --abs {abs} \
    --np {np} \
    > OTU_abs{abs}.txt
    
#-- w/ PCR simulation --#
echo '# simulating PCR'
SIPSim OTU_PCR \
    OTU_abs{abs}.txt \
    > OTU_abs{abs}_PCR.txt    
    
echo '# subsampling from the OTU table (simulating sequencing of the DNA pool)'
SIPSim OTU_subsample \
    --dist {subsample_dist} \
    --dist_params mean:{subsample_mean},sigma:{subsample_scale} \
    --min_size {subsample_min} \
    --max_size {subsample_max} \
    OTU_abs{abs}_PCR.txt \
    > OTU_abs{abs}_PCR_sub.txt
        
echo '# making a wide-formatted table'
SIPSim OTU_wideLong -w \
    OTU_abs{abs}_PCR_sub.txt \
    > OTU_abs{abs}_PCR_sub_w.txt
    
echo '# making metadata (phyloseq: sample_data)'
SIPSim OTU_sampleData \
    OTU_abs{abs}_PCR_sub.txt \
    > OTU_abs{abs}_PCR_sub_meta.txt
    

#-- w/out PCR simulation --#    
echo '# subsampling from the OTU table (simulating sequencing of the DNA pool)'
SIPSim OTU_subsample \
    --dist {subsample_dist} \
    --dist_params mean:{subsample_mean},sigma:{subsample_scale} \
    --min_size {subsample_min} \
    --max_size {subsample_max} \
    OTU_abs{abs}.txt \
    > OTU_abs{abs}_sub.txt
        
echo '# making a wide-formatted table'
SIPSim OTU_wideLong -w \
    OTU_abs{abs}_sub.txt \
    > OTU_abs{abs}_sub_w.txt
    
echo '# making metadata (phyloseq: sample_data)'
SIPSim OTU_sampleData \
    OTU_abs{abs}_sub.txt \
    > OTU_abs{abs}_sub_meta.txt    
    
    
    
#-- making summary tables --#
# PCR
shannon_calc.r OTU_abs1e9_PCR_sub.txt > OTU_abs1e9_PCR_sub_shan.txt
BD_span_calc.r OTU_abs1e9_PCR_sub.txt comm.txt > OTU_abs1e9_PCR_sub_BD-span.txt
correlogram_make.r OTU_abs1e9_PCR_sub.txt > OTU_abs1e9_PCR_sub_corr.txt    
# no PCR
shannon_calc.r OTU_abs1e9_sub.txt > OTU_abs1e9_sub_shan.txt
BD_span_calc.r OTU_abs1e9_sub.txt comm.txt > OTU_abs1e9_sub_BD-span.txt
correlogram_make.r OTU_abs1e9_sub.txt > OTU_abs1e9_sub_corr.txt


Writing /home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep4_DBL-comm_bw/SIPSimRun.sh

In [ ]:
!chmod 777 $bashFile
!cd $workDir; \
    nestrun  --template-file $bashFile -d rep4_DBL-comm_bw --log-file log.txt -j 1


2016-03-09 14:15:16,043 * INFO * Template: ./SIPSimRun.sh
2016-03-09 14:15:16,045 * INFO * [6925] Started ./SIPSimRun.sh in rep4_DBL-comm_bw/0.4/0.06/3

In [ ]:
%pushnote rep4_DBL-comm_bw complete

Comparing to emperical data

  • correlation/regression analyses of metrics on community composition

In [145]:
%%R

# function for loading dataset files
load.data.files = function(sim.files, emp.file){
    # loading
    ## simulations
    df = list()
    for(x in sim.files){
        # simulation
        tmp = read.delim(x, sep='\t')
        xx = strsplit(x, '/')[[1]]
        tmp$DBL_scale = xx[10] %>% as.numeric
        tmp$bw = xx[11] %>% as.numeric
        tmp$SIM_rep = xx[12] %>% as.numeric  
        tmp$dataset = 'Simulation'       
        df[[x]] = tmp 
        
        # emperical (matched for each simulation)
        if(xx[12] %>% as.numeric == 1){
            tmp = read.delim(emp.file, sep='\t')
            tmp$DBL_scale = xx[10] %>% as.numeric
            tmp$bw = xx[11] %>% as.numeric
            tmp$SIM_rep = 1
            tmp$dataset = 'Emperical'        
            xy = paste0(x, '_EMP')
            df[[xy]] = tmp
        }
    }
    df = do.call(rbind, df) %>% as.data.frame 
    rownames(df) = 1:nrow(df)

    # return
    return(df)
    }

Shannon index


In [146]:
sim_shan_files = !find $buildDir -name "OTU_abs1e9_PCR_sub_shan.txt"
print len(sim_shan_files)
print emp_shan_file


32
/home/nick/notebook/SIPSim/dev/fullCyc_trim/SIP-core_unk_shan.txt

In [147]:
%%R -i sim_shan_files -i emp_shan_file

df.shan = load.data.files(sim_shan_files, emp_shan_file) 
df.shan %>% tail(n=3)


     library      sample                            OTU Buoyant_density
1928       1 1.765-1.768 Acaryochloris_marina_MBIC11017           1.766
1929       1 1.768-1.771 Acaryochloris_marina_MBIC11017           1.769
1930       1 1.771-1.775 Acaryochloris_marina_MBIC11017           1.773
      shannon DBL_scale  bw SIM_rep    dataset
1928 4.982089       0.8 0.6       4 Simulation
1929 5.027796       0.8 0.6       4 Simulation
1930 5.019206       0.8 0.6       4 Simulation

In [148]:
%%R -w 800 -h 600
# summarizing
df.shan.s = df.shan %>%
    group_by(dataset, bw, DBL_scale, BD_bin = ntile(Buoyant_density, 24)) %>%
    summarize(mean_shannon = mean(shannon), 
              sd_shannon = sd(shannon), 
              mean_BD = mean(Buoyant_density))

ggplot(df.shan.s, aes(mean_BD, mean_shannon, color=dataset,
                      ymin=mean_shannon-sd_shannon, ymax=mean_shannon+sd_shannon)) +
    geom_pointrange() +
    facet_grid(DBL_scale ~ bw) +
    labs(x='Buoyant density (binned; 24 bins)', y='Shannon index') +
    theme_bw() +
    theme(
        text = element_text(size=16)
    )



In [172]:
%%R -w 550 -h 600
# pairwise correlations for each dataset
df.shan.bin = df.shan %>%
    group_by(BD_bin = ntile(Buoyant_density, 24))

#calc.spearman = function(x){
#    cor(x[,'shannon.x'], x['shannon.y'], method='spearman')[1,1]
#}

calc.pearson = function(x){
    cor(x[,'shannon.x'], x['shannon.y'], method='pearson')[1,1]
}

df.shan.corr = inner_join(df.shan.bin, df.shan.bin, c('BD_bin' = 'BD_bin',
                                                      'bw' = 'bw',
                                                      'DBL_scale' = 'DBL_scale')) %>%
    group_by(bw, DBL_scale, dataset.x, dataset.y) %>%
    nest() %>%
    mutate(model = purrr::map(data, calc.pearson)) %>%
    unnest(pearson = model %>% purrr::map(function(x) x)) %>%
    ungroup() %>%
    select(-data, -model) %>%
    mutate(pearson_txt = round(pearson, 2))

        
# plotting
ggplot(df.shan.corr, aes(dataset.x, dataset.y, fill=pearson)) +
    geom_tile() +
    geom_text(aes(label=pearson_txt), color='white', size=6) +
    scale_fill_gradient(low='black', high='red') +
    labs(title='Shannon index') +
    facet_grid(DBL_scale ~ bw) +        
    theme(
        text = element_text(size=16)
    )


BD spans


In [150]:
sim_BDspan_files = !find $buildDir -name "OTU_abs1e9_PCR_sub_BD-span.txt"
print len(sim_BDspan_files)
print emp_BDspan_file


32
/home/nick/notebook/SIPSim/dev/fullCyc_trim/SIP-core_unk_trm_BD-span.txt

In [151]:
%%R
df.BDspan = load.data.files(sim_BDspan_files, emp_BDspan_file) 
df.BDspan %>% head


                                   OTU library mean_preFrac_abund min_BD max_BD
1       Acaryochloris_marina_MBIC11017       1       0.0002621200  1.659  1.781
2       Acetobacterium_woodii_DSM_1030       1       0.0012047869  1.659  1.781
3 Acetobacter_pasteurianus_IFO_3283-03       1       0.0002955524  1.659  1.781
4    Acetohalobium_arabaticum_DSM_5501       1       0.0003462713  1.659  1.781
5         Acholeplasma_laidlawii_PG-8A       1       0.0009053839  1.659  1.781
6             Acholeplasma_palmae_J233       1       0.0003748588  1.659  1.781
  BD_range BD_range_perc DBL_scale   bw SIM_rep    dataset
1    0.122           100       0.4 0.06       3 Simulation
2    0.122           100       0.4 0.06       3 Simulation
3    0.122           100       0.4 0.06       3 Simulation
4    0.122           100       0.4 0.06       3 Simulation
5    0.122           100       0.4 0.06       3 Simulation
6    0.122           100       0.4 0.06       3 Simulation

In [152]:
%%R -w 700 -h 600

# plotting
ggplot(df.BDspan, aes(mean_preFrac_abund, BD_range_perc, fill=dataset)) +
    geom_hex(alpha=0.5) +
    scale_x_log10() +
    facet_grid(DBL_scale ~ bw) +
    labs(x='Pre-fractionation abundance', y='BD span') +
    theme_bw() +
    theme(
        text = element_text(size=16)
    )



In [153]:
%%R -i sim_BDspan_files -i emp_BDspan_file

# binning by pre-fractionation abundances
n.tile = 20
df.BDspan = df.BDspan %>%
    group_by(dataset, library, DBL_scale, bw, preFrac_abund_bin = ntile(mean_preFrac_abund, n.tile)) %>%
    summarize(mean_preFrac_abund = mean(mean_preFrac_abund),
              var_BD_range = var(BD_range),
              sd_BD_range = sd(BD_range))

df.BDspan %>% tail(n=3)


Source: local data frame [3 x 8]
Groups: dataset, library, DBL_scale, bw [1]

     dataset library DBL_scale    bw preFrac_abund_bin mean_preFrac_abund
       (chr)   (int)     (dbl) (dbl)             (int)              (dbl)
1 Simulation       1       0.8   0.6                18        0.001634448
2 Simulation       1       0.8   0.6                19        0.002465021
3 Simulation       1       0.8   0.6                20        0.009600700
Variables not shown: var_BD_range (dbl), sd_BD_range (dbl)

In [154]:
%%R -w 550 -h 600
calc.spearman = function(x){
    cor(x[,'var_BD_range.x'], x['var_BD_range.y'], method='spearman')[1,1]
}

df.BDspan.corr = inner_join(df.BDspan, df.BDspan, c('preFrac_abund_bin' = 'preFrac_abund_bin',
                                                    'DBL_scale' = 'DBL_scale',
                                                    'bw' = 'bw')) %>%
    group_by(DBL_scale, bw, dataset.x, dataset.y) %>%
    nest() %>%
    mutate(model = purrr::map(data, calc.spearman)) %>%
    unnest(spearman = model %>% purrr::map(function(x) x)) %>%
    ungroup() %>%
    select(-data, -model)  %>%
    mutate(spearman_txt = round(spearman, 2))


# plotting
ggplot(df.BDspan.corr, aes(dataset.x, dataset.y, fill=spearman)) +
    geom_tile() +
    geom_text(aes(label=spearman_txt), color='white', size=6) +
    scale_fill_gradient(low='black', high='red') +
    labs(title='BD span') +
    facet_grid(DBL_scale ~ bw) +
    theme(
        text = element_text(size=16)
    )


correlograms (jaccard ~ BD)


In [155]:
sim_corr_files = !find $buildDir -name "OTU_abs1e9_PCR_sub_corr.txt"
print len(sim_corr_files)
print emp_corr_file


32
/home/nick/notebook/SIPSim/dev/fullCyc_trim/SIP-core_unk_trm_corr.txt

In [156]:
%%R -i sim_corr_files -i emp_corr_file

df.corr = load.data.files(sim_corr_files, emp_corr_file) 

# binning
df.corr = df.corr %>%
    filter(!is.na(Mantel.corr)) %>%
    group_by(DBL_scale, bw, dataset, library, class.index.bin = ntile(class.index, 12)) 

df.corr %>% tail(n=3) %>% as.data.frame


  library class.index n.dist Mantel.corr    Pr Pr.corr DBL_scale  bw SIM_rep
1       1  0.03860417     28  -0.1716968 0.001   0.010       0.8 0.6       4
2       1  0.04256250     28  -0.1246232 0.006   0.030       0.8 0.6       4
3       1  0.04652083     28  -0.1256943 0.002   0.012       0.8 0.6       4
     dataset class.index.bin
1 Simulation              10
2 Simulation              11
3 Simulation              12

In [157]:
%%R -w 700 -h 600
# plotting
df.corr.s = df.corr %>%
    group_by(DBL_scale, bw, dataset, class.index.bin) %>%
    summarize(mean_Mantel.corr = mean(Mantel.corr),
              sd_Mantel.corr = sd(Mantel.corr), 
              mean_class.index = mean(class.index))

ggplot(df.corr.s, aes(mean_class.index, mean_Mantel.corr, color=dataset,
                     ymin=mean_Mantel.corr-sd_Mantel.corr,
                     ymax=mean_Mantel.corr+sd_Mantel.corr)) +
    geom_pointrange() +
    labs(x='Class index (binned; 12 bins)', y='Mantel correlation coef.') +
    facet_grid(DBL_scale ~ bw) + 
    theme_bw() +
    theme(
        text = element_text(size=16)
    )



In [173]:
%%R -w 550 -h 600
# pairwise correlations for each dataset
df.shan.bin = df.shan %>%
    group_by(BD_bin = ntile(Buoyant_density, 24))

calc.pearson = function(x){
    cor(x[,'Mantel.corr.x'], x['Mantel.corr.y'], method='pearson')[1,1]
}

df.corr.lm = inner_join(df.corr, df.corr, c('class.index.bin' = 'class.index.bin',
                                                      'bw' = 'bw',
                                                      'DBL_scale' = 'DBL_scale')) %>%
    group_by(bw, DBL_scale, dataset.x, dataset.y) %>%
    nest() %>%
    mutate(model = purrr::map(data, calc.pearson)) %>%
    unnest(pearson = model %>% purrr::map(function(x) x)) %>%
    ungroup() %>%
    select(-data, -model) %>%
    mutate(pearson_txt = round(pearson, 2))

        
# plotting
ggplot(df.corr.lm, aes(dataset.x, dataset.y, fill=pearson)) +
    geom_tile() +
    geom_text(aes(label=pearson_txt), color='white', size=6) +
    scale_fill_gradient(low='black', high='red') +
    labs(title='Beta diversity correlogram') +
    facet_grid(DBL_scale ~ bw) +        
    theme(
        text = element_text(size=16)
    )



In [171]:
%%R -w 550 -h 600

# df.corr.lm = inner_join(df.corr, df.corr, c('class.index.bin' = 'class.index.bin',
#                                             'DBL_scale' = 'DBL_scale',
#                                             'bw' = 'bw')) %>%
#     group_by(DBL_scale, bw, dataset.x, dataset.y) %>%
#     nest() %>%
#     mutate(model = purrr::map(data, ~lm(Mantel.corr.y ~ Mantel.corr.x, data=.))) %>%
#     unnest(model %>% purrr::map(broom::glance)) %>%
#     ungroup() %>%
#     select(-data, -model) %>%
#     mutate(r.squared_txt = round(r.squared, 2))
        
        
# # table
# #df.corr.lm %>% dplyr::select(dataset.x, dataset.y, r.squared) %>% print
        
# # plotting
# ggplot(df.corr.lm, aes(dataset.x, dataset.y, fill=r.squared)) +
#     geom_tile() +
#     geom_text(aes(label=r.squared_txt), color='white', size=6) +
#     scale_fill_gradient(low='black', high='red') +
#     labs(title='Beta diversity correlogram') +
#     facet_grid(DBL_scale ~ bw) +
#     theme(
#         text = element_text(size=16)
#     )


NULL

Notes:

  • Simulation parameters that best fit the emperical data:
    • DBL_scale = 0.2
    • bw = 0.6
    • PCR simulation included

~OLD~



Loading non-PCR subsampled OTU tables


In [327]:
OTU_files = !find $buildDir -name "OTU_abs1e9_sub.txt"
OTU_files


Out[327]:
['/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.4/0.06/3/OTU_abs1e9_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.4/0.06/2/OTU_abs1e9_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.4/0.06/1/OTU_abs1e9_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.4/0.6/3/OTU_abs1e9_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.4/0.6/2/OTU_abs1e9_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.4/0.6/1/OTU_abs1e9_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0/0.06/3/OTU_abs1e9_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0/0.06/2/OTU_abs1e9_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0/0.06/1/OTU_abs1e9_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0/0.6/3/OTU_abs1e9_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0/0.6/2/OTU_abs1e9_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0/0.6/1/OTU_abs1e9_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.2/0.06/3/OTU_abs1e9_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.2/0.06/2/OTU_abs1e9_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.2/0.06/1/OTU_abs1e9_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.2/0.6/3/OTU_abs1e9_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.2/0.6/2/OTU_abs1e9_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.2/0.6/1/OTU_abs1e9_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.8/0.06/3/OTU_abs1e9_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.8/0.06/2/OTU_abs1e9_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.8/0.06/1/OTU_abs1e9_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.8/0.6/3/OTU_abs1e9_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.8/0.6/2/OTU_abs1e9_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.8/0.6/1/OTU_abs1e9_sub.txt']

In [332]:
%%R -i OTU_files
# loading files

df.SIM = list()
for (x in OTU_files){
    df = read.delim(x, sep='\t')
    xx = strsplit(x, '/')[[1]]
    df$DBL_scale = xx[10] %>% as.numeric
    df$bw = xx[11] %>% as.numeric
    df$SIM_rep = xx[12] %>% as.numeric
    df.SIM[[x]] = df
    }
df.SIM = do.call('rbind', df.SIM)
df.SIM$file = gsub('\\.[0-9]+$', '', rownames(df.SIM))
rownames(df.SIM) = 1:nrow(df.SIM)
df.SIM %>% head(n=3)


  library    fraction                          taxon BD_min BD_mid BD_max count
1       1  -inf-1.660 Acaryochloris_marina_MBIC11017   -Inf  1.659  1.659     1
2       1 1.660-1.662 Acaryochloris_marina_MBIC11017  1.660  1.661  1.662     1
3       1 1.662-1.666 Acaryochloris_marina_MBIC11017  1.662  1.664  1.666     0
     rel_abund DBL_scale   bw SIM_rep
1 6.459531e-05       0.4 0.06       3
2 9.435743e-05       0.4 0.06       3
3 0.000000e+00       0.4 0.06       3
                                                                                                            file
1 /home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.4/0.06/3/OTU_abs1e9_sub.txt
2 /home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.4/0.06/3/OTU_abs1e9_sub.txt
3 /home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.4/0.06/3/OTU_abs1e9_sub.txt

In [ ]:


In [ ]:

BD range where an OTU is detected

  • Do the simulated OTU BD distributions span the same BD range of the emperical data?

In [334]:
comm_files = !find $buildDir -name "comm.txt"
comm_files


Out[334]:
['/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.4/0.06/3/comm.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.4/0.06/2/comm.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.4/0.06/1/comm.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.4/0.6/3/comm.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.4/0.6/2/comm.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.4/0.6/1/comm.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0/0.06/3/comm.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0/0.06/2/comm.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0/0.06/1/comm.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0/0.6/3/comm.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0/0.6/2/comm.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0/0.6/1/comm.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.2/0.06/3/comm.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.2/0.06/2/comm.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.2/0.06/1/comm.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.2/0.6/3/comm.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.2/0.6/2/comm.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.2/0.6/1/comm.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.8/0.06/3/comm.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.8/0.06/2/comm.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.8/0.06/1/comm.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.8/0.6/3/comm.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.8/0.6/2/comm.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.8/0.6/1/comm.txt']

In [335]:
%%R -i comm_files

df.SIM.comm = list()
for (x in comm_files){
    df = read.delim(x, sep='\t')
    xx = strsplit(x, '/')[[1]]
    df$DBL_scale = xx[10] %>% as.numeric
    df$bw = xx[11] %>% as.numeric
    df$SIM_rep = xx[12] %>% as.numeric
    df.SIM.comm[[x]] = df
    }

df.SIM.comm = do.call(rbind, df.SIM.comm)
rownames(df.SIM.comm) = 1:nrow(df.SIM.comm)
df.SIM.comm = df.SIM.comm %>%
    rename('bulk_abund' = rel_abund_perc) %>%
    mutate(bulk_abund = bulk_abund / 100)
df.SIM.comm %>% head(n=3)


  library                        taxon_name bulk_abund rank DBL_scale   bw
1       1 Parvibaculum_lavamentivorans_DS-1 0.05728702    1       0.4 0.06
2       1     Chlamydophila_pneumoniae_J138 0.04295700    2       0.4 0.06
3       1   Vibrio_campbellii_ATCC_BAA-1116 0.02965853    3       0.4 0.06
  SIM_rep
1       3
2       3
3       3

In [344]:
%%R
## joining SIP & comm (pre-fractionation)
df.SIM.j = inner_join(df.SIM, df.SIM.comm, c('library' = 'library', 
                                             'taxon' = 'taxon_name',
                                             'DBL_scale' = 'DBL_scale',
                                             'bw' = 'bw',
                                             'SIM_rep' = 'SIM_rep')) %>%
    filter(BD_mid >= min_BD, 
           BD_mid <= max_BD)
    
df.SIM.j %>% head(n=3)


  library    fraction                          taxon BD_min BD_mid BD_max count
1       1 1.671-1.677 Acaryochloris_marina_MBIC11017  1.671  1.674  1.677     5
2       1 1.677-1.681 Acaryochloris_marina_MBIC11017  1.677  1.679  1.681     3
3       1 1.681-1.683 Acaryochloris_marina_MBIC11017  1.681  1.682  1.683     1
     rel_abund DBL_scale   bw SIM_rep
1 3.042843e-04       0.4 0.06       3
2 2.246686e-04       0.4 0.06       3
3 6.079397e-05       0.4 0.06       3
                                                                                                            file
1 /home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.4/0.06/3/OTU_abs1e9_sub.txt
2 /home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.4/0.06/3/OTU_abs1e9_sub.txt
3 /home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3_DBL-comm_bw/0.4/0.06/3/OTU_abs1e9_sub.txt
    bulk_abund rank
1 0.0002462213  771
2 0.0002462213  771
3 0.0002462213  771

In [345]:
%%R
# calculating BD range
df.SIM.j.f = df.SIM.j %>%
    filter(count > 0) %>%
    group_by(bw, DBL_scale, SIM_rep) %>%
    mutate(max_BD_range = max(BD_mid) - min(BD_mid)) %>%
    ungroup() %>%
    group_by(bw, DBL_scale, SIM_rep, taxon) %>%
    summarize(mean_bulk_abund = mean(bulk_abund),
              min_BD = min(BD_mid),
              max_BD = max(BD_mid),
              BD_range = max_BD - min_BD,
              BD_range_perc = BD_range / first(max_BD_range) * 100) %>%
    ungroup() %>%
    mutate(SIM_rep = SIM_rep %>% as.character)
    
df.SIM.j.f %>% head(n=3) %>% as.data.frame


    bw DBL_scale SIM_rep                                taxon mean_bulk_abund
1 0.06         0       1       Acaryochloris_marina_MBIC11017    0.0003366196
2 0.06         0       1       Acetobacterium_woodii_DSM_1030    0.0009053839
3 0.06         0       1 Acetobacter_pasteurianus_IFO_3283-03    0.0002462213
  min_BD max_BD BD_range BD_range_perc
1  1.676  1.773    0.097           100
2  1.676  1.773    0.097           100
3  1.676  1.773    0.097           100

In [353]:
%%R -h 750 -w 800

## plotting
ggplot(df.SIM.j.f, aes(mean_bulk_abund, BD_range_perc, color=SIM_rep)) +
    geom_point(alpha=0.5, shape='O') +
    scale_x_log10(limits=c(0.0001, 0.1)) +
    scale_y_continuous() +
    facet_grid(DBL_scale ~ bw) +
    labs(x='Pre-fractionation abundance', y='% of total BD range') +
    #geom_vline(xintercept=0.001, linetype='dashed', alpha=0.5) +
    theme_bw() +
    theme(
        text = element_text(size=16),
        panel.grid = element_blank(),
        legend.position = 'none'
        )



In [462]:
%%R -h 750 -w 800

## plotting
ggplot(df.SIM.j.f, aes(mean_bulk_abund, BD_range_perc)) +
    geom_hex() +
    scale_x_log10(limits=c(0.0001, 0.1)) +
    scale_y_continuous(limits=c(0,105)) +
    scale_fill_gradient(low='grey95', high='black') +
    facet_grid(DBL_scale ~ bw) +
    labs(x='Pre-fractionation abundance', y='% of total BD range') +
    #geom_vline(xintercept=0.001, linetype='dashed', alpha=0.5) +
    theme_bw() +
    theme(
        text = element_text(size=16),
        panel.grid = element_blank(),
        legend.position = 'none'
        )


Assessing diversity

Asigning zeros


In [355]:
%%R
# giving value to missing abundances
min.pos.val = df.SIM.j %>%
    filter(rel_abund > 0) %>%
    group_by() %>%
    mutate(min_abund = min(rel_abund)) %>%
    ungroup() %>%
    filter(rel_abund == min_abund)

min.pos.val = min.pos.val[1,'rel_abund'] %>% as.numeric
imp.val = min.pos.val / 10


# convert numbers
df.SIM.j[df.SIM.j$rel_abund == 0, 'abundance'] = imp.val

# another closure operation
df.SIM.j = df.SIM.j %>%
    group_by(bw, DBL_scale, SIM_rep, fraction) %>%
    mutate(rel_abund = rel_abund / sum(rel_abund))


# status
cat('Below detection level abundances converted to: ', imp.val, '\n')


Below detection level abundances converted to:  3.338341e-06 

Plotting Shannon diversity for each


In [356]:
%%R
shannon_index_long = function(df, abundance_col, ...){
    # calculating shannon diversity index from a 'long' formated table
    ## community_col = name of column defining communities
    ## abundance_col = name of column defining taxon abundances
    df = df %>% as.data.frame
    cmd = paste0(abundance_col, '/sum(', abundance_col, ')')
    df.s = df %>%
        group_by_(...) %>%
        mutate_(REL_abundance = cmd) %>%
        mutate(pi__ln_pi = REL_abundance * log(REL_abundance),
               shannon = -sum(pi__ln_pi, na.rm=TRUE)) %>%
        ungroup() %>% 
        dplyr::select(-REL_abundance, -pi__ln_pi) %>%
        distinct_(...) 
    return(df.s)
}

In [358]:
%%R -w 800 -h 700
# calculating shannon
df.SIM.shan = shannon_index_long(df.SIM.j, 'count', 'library', 'fraction', 'bw', 'DBL_scale') %>%
    filter(BD_mid >= min_BD, 
           BD_mid <= max_BD) 

df.SIM.shan.s = df.SIM.shan %>%
    group_by(bw, DBL_scale, BD_bin = ntile(BD_mid, 24)) %>%
    summarize(mean_BD = mean(BD_mid),
              mean_shannon = mean(shannon),
              sd_shannon = sd(shannon))

# plotting
p = ggplot(df.SIM.shan.s, aes(mean_BD, mean_shannon, 
                             ymin=mean_shannon-sd_shannon,
                             ymax=mean_shannon+sd_shannon)) +
    geom_pointrange() +
    labs(x='Buoyant density', y='Shannon index') +
    facet_grid(DBL_scale ~ bw) +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        legend.position = 'none'
    )
p


Plotting variance


In [367]:
%%R -w 850 -h 700
df.SIM.j.var = df.SIM.j %>%
    group_by(bw, DBL_scale, SIM_rep, fraction) %>%
    mutate(variance = var(rel_abund)) %>%
    ungroup() %>%
    distinct(bw, DBL_scale, SIM_rep, fraction) %>%
    select(bw, DBL_scale, SIM_rep, fraction, variance, BD_mid) %>%
    mutate(SIM_rep = SIM_rep %>% as.character)

ggplot(df.SIM.j.var, aes(BD_mid, variance, color=SIM_rep)) +
    geom_point() +
    geom_line() +
    facet_wrap(DBL_scale ~ bw, scales='free_y', ncol=2) +    
    theme_bw() +
    theme(
        text = element_text(size=16)
    )



In [368]:
%%R -w 800 -h 600

ggplot(df.SIM.j.var %>% filter(BD_mid >= 1.75), aes(BD_mid, variance, color=SIM_rep)) +
    geom_point() +
    geom_line() +
    facet_grid(DBL_scale ~ bw) +    
    theme_bw() +
    theme(
        text = element_text(size=16)
    )


Notes

  • increased DBL_scaling is increasing the variance at the heavy tail

Correlograms


In [451]:
%%R

BD.diffs = function(df){
    BDs = df$BD_mid %>% unique
    df.BD = expand.grid(BDs, BDs)
    df.BD$diff = df.BD %>% apply(1, diff) %>% abs %>% as.vector    
    df.BD = df.BD %>%
        spread(Var1, diff) 
    rownames(df.BD) = df.BD$Var2
    df.BD$