Goal

Accuracy as a function of isotope incorporation & community similarity among replicate microcosms

Note: just adding more levels to the original microBetaDiv simulation

Variable parameters:

  • Shared % of taxa
    • 80, 85, 90, 95, 100
  • Permuted % of rank abundances
    • 50, 40, 30
  • Atom % isotope incorporation
    • 100
  • % taxa that incorporate
    • 10
  • N-reps (stocastic: taxon abundances & which incorporate)
    • 10

Setting paths


In [1]:
import os

# paths
workDir = '/home/nick/notebook/SIPSim/dev/bac_genome1147/'
buildDir = os.path.join(workDir, 'microBetaDiv_morePerm')
R_dir = '/home/nick/notebook/SIPSim/lib/R/'

fragFile = '/home/nick/notebook/SIPSim/dev/bac_genome1147/validation/ampFrags_kde.pkl'
genome_index = '/var/seq_data/ncbi_db/genome/Jan2016/bac_complete_spec-rep1_rn/genome_index.txt'

Init


In [2]:
import glob
import itertools
import nestly

In [3]:
%load_ext rpy2.ipython
%load_ext pushnote

In [4]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)


/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Need help getting started? Try the cookbook for R:
http://www.cookbook-r.com/Graphs/

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: 
Attaching package: ‘dplyr’


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following objects are masked from ‘package:stats’:

    filter, lag


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


  res = super(Function, self).__call__(*new_args, **new_kwargs)

In [5]:
if not os.path.isdir(buildDir):
    os.makedirs(buildDir)
%cd $buildDir


/home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm

BD min/max


In [6]:
## min G+C cutoff
min_GC = 13.5
## max G+C cutoff
max_GC = 80
## max G+C shift
max_13C_shift_in_BD = 0.036


min_BD = min_GC/100.0 * 0.098 + 1.66    
max_BD = max_GC/100.0 * 0.098 + 1.66    

max_BD = max_BD + max_13C_shift_in_BD

print 'Min BD: {}'.format(min_BD)
print 'Max BD: {}'.format(max_BD)


Min BD: 1.67323
Max BD: 1.7744

Nestly

  • assuming fragments already simulated

In [7]:
# making an experimental design file for qSIP
x = range(1,7)
y = ['control', 'treatment']

expDesignFile = os.path.join(buildDir, 'qSIP_exp_design.txt')
with open(expDesignFile, 'wb') as outFH:
    for i,z in itertools.izip(x,itertools.cycle(y)):
        line = '\t'.join([str(i),z])
        outFH.write(line + '\n')

!head $expDesignFile


1	control
2	treatment
3	control
4	treatment
5	control
6	treatment

Nestly params


In [10]:
# building tree structure
nest = nestly.Nest()

# varying params: test
#nest.add('shared_perc', [90, 95])
#nest.add('perm_perc', [5, 10])
#nest.add('rep', range(1,4))

# varying params
nest.add('shared_perc', [80, 85, 90, 95, 100])
nest.add('perm_perc', [30, 40, 50])
nest.add('rep', range(1,11))

## set params
nest.add('percIncorp', [100], create_dir=False)
nest.add('percTaxa', [10], create_dir=False)

nest.add('abs', ['1e9'], create_dir=False)
nest.add('np', [8], create_dir=False)
nest.add('Monte_rep', [100000], create_dir=False)
nest.add('subsample_dist', ['lognormal'], create_dir=False)
nest.add('subsample_mean', [9.432], create_dir=False)
nest.add('subsample_scale', [0.5], create_dir=False)
nest.add('subsample_min', [10000], create_dir=False)
nest.add('subsample_max', [30000], create_dir=False)
nest.add('min_BD', [min_BD], create_dir=False)
nest.add('max_BD', [max_BD], create_dir=False)
nest.add('DBL_scaling', [0.5], create_dir=False)
nest.add('bandwidth', [0.8], create_dir=False)
nest.add('heavy_BD_min', [1.71], create_dir=False)
nest.add('heavy_BD_max', [1.75], create_dir=False)
nest.add('topTaxaToPlot', [100], create_dir=False)
nest.add('padj', [0.1], create_dir=False)
nest.add('log2', [0.25], create_dir=False)

### input/output files
nest.add('buildDir', [buildDir], create_dir=False)
nest.add('R_dir', [R_dir], create_dir=False)
nest.add('genome_index', [genome_index], create_dir=False)
nest.add('fragFile', [fragFile], create_dir=False)
nest.add('exp_design', [expDesignFile], create_dir=False)


# building directory tree
nest.build(buildDir)

# bash file to run
bashFile = os.path.join(buildDir, 'SIPSimRun.sh')

Experimental design


In [11]:
bashFileTmp = os.path.splitext(bashFile)[0] + '_expDesign.sh'
bashFileTmp


Out[11]:
'/home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/SIPSimRun_expDesign.sh'

In [12]:
%%writefile $bashFileTmp
#!/bin/bash

echo '#-- Experimental design --#'

echo '# Making an isotope incorporation config file'
echo '## 3 replicate gradients for control & treatment'
SIPSim incorpConfigExample \
  --percIncorpUnif {percIncorp} \
  --n_reps 3 \
  > incorp.config

echo '# Selecting incorporator taxa'
echo '## This is to make the gradient replicates consistent (qSIP finds mean among replicates)'
SIPSim KDE_selectTaxa \
    -p {percTaxa} \
    {fragFile} \
    > incorporators.txt

echo '# Creating a community file (3 replicate control, 3 replicate treatment)'
SIPSim communities \
    --richness 950 \
    --config incorp.config \
    --shared_perc {shared_perc} \
    --perm_perc {perm_perc} \
    {genome_index} \
    > comm.txt    

echo '# simulating gradient fractions'
SIPSim gradient_fractions \
    --BD_min {min_BD} \
    --BD_max {max_BD} \
    comm.txt \
    > fracs.txt


Writing /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/SIPSimRun_expDesign.sh

In [13]:
!chmod 777 $bashFileTmp
!cd $workDir; \
    nestrun --template-file $bashFileTmp -d $buildDir --log-file exp_design.log -j 10


2016-05-02 13:29:24,453 * INFO * Template: ./SIPSimRun_expDesign.sh
2016-05-02 13:29:24,455 * INFO * [138568] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/30/7
2016-05-02 13:29:24,457 * INFO * [138569] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/30/9
2016-05-02 13:29:24,458 * INFO * [138571] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/30/8
2016-05-02 13:29:24,463 * INFO * [138573] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/30/3
2016-05-02 13:29:24,465 * INFO * [138575] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/30/10
2016-05-02 13:29:24,471 * INFO * [138577] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/30/2
2016-05-02 13:29:24,475 * INFO * [138579] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/30/1
2016-05-02 13:29:24,479 * INFO * [138581] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/30/4
2016-05-02 13:29:24,487 * INFO * [138583] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/30/5
2016-05-02 13:29:24,495 * INFO * [138585] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/30/6
2016-05-02 13:29:33,709 * INFO * [138571] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/30/8 Finished with 0
2016-05-02 13:29:33,711 * INFO * [139720] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/50/7
2016-05-02 13:29:33,797 * INFO * [138573] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/30/3 Finished with 0
2016-05-02 13:29:33,799 * INFO * [139722] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/50/9
2016-05-02 13:29:33,816 * INFO * [138583] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/30/5 Finished with 0
2016-05-02 13:29:33,819 * INFO * [139724] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/50/8
2016-05-02 13:29:33,921 * INFO * [138577] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/30/2 Finished with 0
2016-05-02 13:29:33,923 * INFO * [139726] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/50/3
2016-05-02 13:29:34,015 * INFO * [138581] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/30/4 Finished with 0
2016-05-02 13:29:34,019 * INFO * [139730] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/50/10
2016-05-02 13:29:34,054 * INFO * [138575] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/30/10 Finished with 0
2016-05-02 13:29:34,056 * INFO * [139732] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/50/2
2016-05-02 13:29:34,098 * INFO * [138585] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/30/6 Finished with 0
2016-05-02 13:29:34,104 * INFO * [139734] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/50/1
2016-05-02 13:29:34,105 * INFO * [138569] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/30/9 Finished with 0
2016-05-02 13:29:34,107 * INFO * [139736] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/50/4
2016-05-02 13:29:34,180 * INFO * [138579] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/30/1 Finished with 0
2016-05-02 13:29:34,182 * INFO * [139749] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/50/5
2016-05-02 13:29:34,353 * INFO * [138568] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/30/7 Finished with 0
2016-05-02 13:29:34,359 * INFO * [139773] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/50/6
2016-05-02 13:29:41,209 * INFO * [139722] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/50/9 Finished with 0
2016-05-02 13:29:41,211 * INFO * [140746] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/40/7
2016-05-02 13:29:41,554 * INFO * [139730] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/50/10 Finished with 0
2016-05-02 13:29:41,556 * INFO * [140809] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/40/9
2016-05-02 13:29:41,742 * INFO * [139773] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/50/6 Finished with 0
2016-05-02 13:29:41,744 * INFO * [140838] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/40/8
2016-05-02 13:29:41,889 * INFO * [139726] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/50/3 Finished with 0
2016-05-02 13:29:41,891 * INFO * [140866] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/40/3
2016-05-02 13:29:41,997 * INFO * [139724] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/50/8 Finished with 0
2016-05-02 13:29:41,999 * INFO * [140884] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/40/10
2016-05-02 13:29:42,134 * INFO * [139720] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/50/7 Finished with 0
2016-05-02 13:29:42,136 * INFO * [140899] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/40/2
2016-05-02 13:29:42,171 * INFO * [139732] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/50/2 Finished with 0
2016-05-02 13:29:42,173 * INFO * [140901] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/40/1
2016-05-02 13:29:42,236 * INFO * [139736] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/50/4 Finished with 0
2016-05-02 13:29:42,238 * INFO * [140916] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/40/4
2016-05-02 13:29:42,247 * INFO * [139734] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/50/1 Finished with 0
2016-05-02 13:29:42,249 * INFO * [140918] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/40/5
2016-05-02 13:29:42,642 * INFO * [139749] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/50/5 Finished with 0
2016-05-02 13:29:42,644 * INFO * [140994] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/40/6
2016-05-02 13:29:48,840 * INFO * [140809] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/40/9 Finished with 0
2016-05-02 13:29:48,842 * INFO * [141887] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/95/30/7
2016-05-02 13:29:48,991 * INFO * [140746] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/40/7 Finished with 0
2016-05-02 13:29:48,993 * INFO * [141946] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/95/30/9
2016-05-02 13:29:49,287 * INFO * [140916] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/40/4 Finished with 0
2016-05-02 13:29:49,289 * INFO * [141981] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/95/30/8
2016-05-02 13:29:49,341 * INFO * [140918] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/40/5 Finished with 0
2016-05-02 13:29:49,343 * INFO * [141986] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/95/30/3
2016-05-02 13:29:49,403 * INFO * [140866] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/40/3 Finished with 0
2016-05-02 13:29:49,405 * INFO * [142011] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/95/30/10
2016-05-02 13:29:49,538 * INFO * [140899] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/40/2 Finished with 0
2016-05-02 13:29:49,540 * INFO * [142028] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/95/30/2
2016-05-02 13:29:49,787 * INFO * [140901] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/40/1 Finished with 0
2016-05-02 13:29:49,789 * INFO * [142068] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/95/30/1
2016-05-02 13:29:49,803 * INFO * [140838] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/40/8 Finished with 0
2016-05-02 13:29:49,805 * INFO * [142072] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/95/30/4
2016-05-02 13:29:49,909 * INFO * [140884] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/40/10 Finished with 0
2016-05-02 13:29:49,911 * INFO * [142086] Started ./SIPSimRun_expDesign.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/95/30/5
**OUTPUT MUTED**

In [14]:
%pushnote exp_design complete: $buildDir

SIPSim pipeline


In [15]:
bashFileTmp = os.path.splitext(bashFile)[0] + '_SIPSim-pipeline.sh'
bashFileTmp


Out[15]:
'/home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/SIPSimRun_SIPSim-pipeline.sh'

In [ ]:
%%writefile $bashFileTmp
#!/bin/bash

echo '#-- SIPSim pipeline --#'    
    
echo '# Adding diffusion'    
SIPSim diffusion \
    -n {Monte_rep} \
    --bw {bandwidth} \
    --np {np} \
    {fragFile} \
    > ampFrags_KDE_dif.pkl    

echo '# Adding DBL contamination; abundance-weighted smearing'
SIPSim DBL \
    -n {Monte_rep} \
    --comm comm.txt \
    --commx {DBL_scaling} \
    --np {np} \
    ampFrags_KDE_dif.pkl \
    > ampFrags_KDE_dif_DBL.pkl 

echo '# Adding isotope incorporation to BD distribution'
SIPSim isotope_incorp \
    -n {Monte_rep} \
    --comm comm.txt \
    --taxa incorporators.txt \
    --np {np} \
    ampFrags_KDE_dif_DBL.pkl \
    incorp.config \
    > ampFrags_KDE_dif_DBL_inc.pkl

echo '# Simulating an OTU table'
SIPSim OTU_table \
    --abs {abs} \
    --np {np} \
    ampFrags_KDE_dif_DBL_inc.pkl \
    comm.txt \
    fracs.txt \
    > OTU_abs{abs}.txt
    
echo '# Simulating PCR'
SIPSim OTU_PCR \
    OTU_abs{abs}.txt \
    > OTU_abs{abs}_PCR.txt    
    
echo '# Subsampling from the OTU table (simulating sequencing of the DNA pool)'
SIPSim OTU_subsample \
    --dist {subsample_dist} \
    --dist_params mean:{subsample_mean},sigma:{subsample_scale} \
    --min_size {subsample_min} \
    --max_size {subsample_max} \
    OTU_abs{abs}_PCR.txt \
    > OTU_abs{abs}_PCR_sub.txt
        
echo '# Making a wide-formatted table'
SIPSim OTU_wideLong -w \
    OTU_abs{abs}_PCR_sub.txt \
    > OTU_abs{abs}_PCR_sub_w.txt
    
echo '# Making metadata (phyloseq: sample_data)'
SIPSim OTU_sampleData \
    OTU_abs{abs}_PCR_sub.txt \
    > OTU_abs{abs}_PCR_sub_meta.txt
       

#-- removing large intermediate files --#
rm -f ampFrags_KDE_dif.pkl
rm -f ampFrags_KDE_dif_DBL.pkl
rm -f ampFrags_KDE_dif_DBL_inc.pkl


Writing /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/SIPSimRun_SIPSim-pipeline.sh

In [ ]:
!chmod 777 $bashFileTmp
!cd $workDir; \
    nestrun --template-file $bashFileTmp -d $buildDir --log-file SIPSim_pipeline.log -j 2


2016-05-02 13:31:21,097 * INFO * Template: ./SIPSimRun_SIPSim-pipeline.sh
2016-05-02 13:31:21,099 * INFO * [155808] Started ./SIPSimRun_SIPSim-pipeline.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/30/7
2016-05-02 13:31:21,101 * INFO * [155809] Started ./SIPSimRun_SIPSim-pipeline.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/30/9
2016-05-02 14:14:19,194 * INFO * [155808] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/30/7 Finished with 0
2016-05-02 14:14:19,243 * INFO * [164247] Started ./SIPSimRun_SIPSim-pipeline.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/30/8
2016-05-02 14:14:25,173 * INFO * [155809] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/30/9 Finished with 0
2016-05-02 14:14:25,176 * INFO * [164347] Started ./SIPSimRun_SIPSim-pipeline.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/30/3
2016-05-02 14:52:37,762 * INFO * [164347] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/30/3 Finished with 0
2016-05-02 14:52:37,790 * INFO * [171645] Started ./SIPSimRun_SIPSim-pipeline.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/30/10
2016-05-02 14:53:36,256 * INFO * [164247] /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/30/8 Finished with 0
2016-05-02 14:53:36,288 * INFO * [171891] Started ./SIPSimRun_SIPSim-pipeline.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/microBetaDiv_morePerm/80/30/2

In [ ]:
%pushnote SIPSim pipeline complete: $buildDir

Summary of simulated data


In [ ]:
bashFileTmp = os.path.splitext(bashFile)[0] + '_SIPSim-summary.sh'
bashFileTmp

In [ ]:
%%writefile $bashFileTmp
#!/bin/bash
   
# plotting 'raw' taxon abundances
SIPSimR OTU_taxonAbund \
    OTU_abs{abs}.txt \
    -r {topTaxaToPlot} \
    -o OTU_abs{abs}

# plotting 'sequenced' taxon abundances
SIPSimR OTU_taxonAbund \
    OTU_abs{abs}_PCR_sub.txt \
    -r {topTaxaToPlot} \
    -o OTU_abs{abs}_PCR_sub

In [ ]:
!chmod 777 $bashFileTmp
!cd $workDir; \
    nestrun --template-file $bashFileTmp -d $buildDir --log-file SIPSim_summary.log -j 10

HR-SIP: DESeq2


In [ ]:
bashFileTmp = os.path.splitext(bashFile)[0] + '_HRSIP.sh'
bashFileTmp

In [ ]:
%%writefile $bashFileTmp
#!/bin/bash

# phyloseq
## making phyloseq object from OTU table
SIPSimR phyloseq_make \
    OTU_abs{abs}_PCR_sub_w.txt \
    -s OTU_abs{abs}_PCR_sub_meta.txt \
    > OTU_abs{abs}_PCR_sub.physeq

## filtering phyloseq object to just 'heavy' fractions
SIPSimR phyloseq_edit \
    OTU_abs{abs}_PCR_sub.physeq \
    --BD_min {heavy_BD_min} \
    --BD_max {heavy_BD_max} \
    > OTU_abs{abs}_PCR_sub_filt.physeq

## making ordination
SIPSimR phyloseq_ordination \
    OTU_abs{abs}_PCR_sub_filt.physeq \
    OTU_abs{abs}_PCR_sub_filt_bray-NMDS.pdf

# DESeq2
SIPSimR phyloseq_DESeq2 \
    --log2 {log2} \
    --hypo greater \
    --cont 1,3,5 \
    --treat 2,4,6 \
    OTU_abs{abs}_PCR_sub_filt.physeq \
    > OTU_abs{abs}_PCR_sub_filt_DESeq2

In [ ]:
!chmod 777 $bashFileTmp
!cd $workDir; \
    nestrun --template-file $bashFileTmp -d $buildDir --log-file HR-SIP.log -j 10

In [ ]:
%pushnote HR-SIP complete: $buildDir

qSIP


In [ ]:
bashFileTmp = os.path.splitext(bashFile)[0] + '_qSIP.sh'
bashFileTmp

In [ ]:
%%writefile $bashFileTmp
#!/bin/bash


# qSIP
SIPSim qSIP \
    OTU_abs{abs}.txt \
    OTU_abs{abs}_PCR_sub.txt \
    > OTU_abs{abs}_PCR_sub_qSIP.txt
        

# qSIP: atom excess
SIPSim qSIP_atomExcess \
    --np {np} \
    OTU_abs{abs}_PCR_sub_qSIP.txt \
    {exp_design} \
    > OTU_abs{abs}_PCR_sub_qSIP_atom.txt

In [ ]:
!chmod 777 $bashFileTmp
!cd $workDir; \
    nestrun --template-file $bashFileTmp -d $buildDir --log-file qSIP.log -j 2

In [ ]:
%pushnote qSIP complete: $buildDir

deltaBD


In [ ]:
bashFileTmp = os.path.splitext(bashFile)[0] + '_dBD.sh'
bashFileTmp

In [ ]:
%%writefile $bashFileTmp
#!/bin/bash

#deltaBD 
SIPSim deltaBD \
    OTU_abs{abs}_PCR_sub.txt \
    {exp_design} \
    > OTU_abs{abs}_PCR_sub_dBD.txt

In [ ]:
!chmod 777 $bashFileTmp
!cd $workDir; \
    nestrun --template-file $bashFileTmp -d $buildDir --log-file deltaBD.log -j 4

In [ ]:
%pushnote deltaBD complete: $buildDir

Making confusion matrices


In [ ]:
bashFileTmp = os.path.splitext(bashFile)[0] + '_cMtx.sh'
bashFileTmp

In [ ]:
%%writefile $bashFileTmp
#!/bin/bash

# HR-SIP
SIPSimR DESeq2_confuseMtx \
    --libs 2,4,6 \
    --padj {padj} \
    BD-shift_stats.txt \
    OTU_abs{abs}_PCR_sub_filt_DESeq2

# HR-SIP multiple 'heavy' BD windows
SIPSimR DESeq2_confuseMtx \
    --libs 2,4,6 \
    --padj {padj} \
    -o DESeq2_multi-cMtx \
    BD-shift_stats.txt \
    OTU_abs1e9_PCR_sub_BD3_DESeq2    
    
# qSIP    
SIPSimR qSIP_confuseMtx \
    --libs 2,4,6 \
    BD-shift_stats.txt \
    OTU_abs{abs}_PCR_sub_qSIP_atom.txt

# heavy-SIP    
SIPSimR heavy_confuseMtx \
    --libs 2,4,6 \
    BD-shift_stats.txt \
    OTU_abs{abs}_PCR_sub.txt

In [ ]:
!chmod 777 $bashFileTmp
!cd $workDir; \
    nestrun --template-file $bashFileTmp -d $buildDir --log-file cMtx.log -j 10

Aggregating the confusion matrix data


In [ ]:
def agg_cMtx(prefix):
    # all data
    #!nestagg delim \
    #    -d $buildDir \
    #    -k percIncorp,percTaxa,rep \
    #    -o $prefix-cMtx_data.txt \
    #    --tab \
    #    $prefix-cMtx_data.txt

    # overall
    x = prefix + '-cMtx_overall.txt'
    !nestagg delim \
        -d $buildDir \
        -k shared_perc,perm_perc,rep \
        -o $x \
        --tab \
        $x

    # by class
    x = prefix + '-cMtx_byClass.txt'
    !nestagg delim \
        -d $buildDir \
        -k shared_perc,perm_perc,rep \
        -o $x \
        --tab \
        $x
        
agg_cMtx('DESeq2')
agg_cMtx('qSIP') 
agg_cMtx('heavy')

In [ ]:
%pushnote microBetaDiv complete!

--End of simulation--


Plotting results


In [ ]:
F = os.path.join(buildDir, '*-cMtx_byClass.txt')
files = glob.glob(F)
files

In [ ]:
%%R -i files

df_byClass = list()
for (f in files){
    ff = strsplit(f, '/') %>% unlist
    fff = ff[length(ff)]
    df_byClass[[fff]] = read.delim(f, sep='\t')
}

df_byClass = do.call(rbind, df_byClass)
df_byClass$file = gsub('\\.[0-9]+$', '', rownames(df_byClass))
df_byClass$method = gsub('-.+', '', df_byClass$file)
rownames(df_byClass) = 1:nrow(df_byClass)

df_byClass %>% head(n=3)

In [ ]:
%%R
# renaming method
rename = data.frame(method = c('DESeq2', 'DESeq2_multi', 'heavy', 'qSIP'), 
                   method_new = c('HR-SIP', 'MW-HR-SIP', 'Heavy-SIP', 'qSIP'))

df_byClass = inner_join(df_byClass, rename, c('method'='method')) %>%
    select(-method) %>%
    rename('method' = method_new) 

df_byClass %>% head(n=3)

In [ ]:
%%R -w 800 -h 550
# summarize by SIPSim rep & library rep
df_byClass.s = df_byClass %>%
    group_by(method, shared_perc, perm_perc, variables) %>%
    summarize(mean_value = mean(values),
              sd_value = sd(values))

# plotting
ggplot(df_byClass.s, aes(variables, mean_value, color=method,
                         ymin=mean_value-sd_value,
                         ymax=mean_value+sd_value)) +
    geom_pointrange(alpha=0.8, size=0.2) +
    labs(y='Value') +
    facet_grid(shared_perc ~ perm_perc) +
    theme_bw() +
    theme(
        text = element_text(size=16),
        axis.title.x = element_blank(),
        axis.text.x = element_text(angle=45, hjust=1)
    )

In [ ]:
%%R -w 800 -h 600
# summarize by SIPSim rep & library rep
vars = c('Balanced Accuracy', 'Sensitivity', 'Specificity')
df_byClass.s.f = df_byClass.s %>%
    filter(variables %in% vars) 

# plotting
ggplot(df_byClass.s.f, aes(variables, mean_value, fill=method,
                         ymin=mean_value-sd_value,
                         ymax=mean_value+sd_value)) +
    #geom_pointrange(alpha=0.8, size=0.2) +
    geom_bar(stat='identity', position='dodge', width=0.8) +
    geom_errorbar(stat='identity', position='dodge', width=0.8) +
    scale_y_continuous(breaks=seq(0, 1, 0.2)) +
    labs(y='Value') +
    facet_grid(shared_perc ~ perm_perc) +
    theme_bw() +
    theme(
        text = element_text(size=16),
        axis.title.x = element_blank(),
        axis.text.x = element_text(angle=50, hjust=1)
    )

In [ ]:
%%R -w 800 -h 450
# summarize by SIPSim rep & library rep
vars = c('Balanced Accuracy', 'Sensitivity', 'Specificity')
df_byClass.s.f = df_byClass.s %>%
    filter(variables %in% vars) %>%
    ungroup() %>%
    mutate(perm_perc = perm_perc %>% as.character,
           perm_perc = perm_perc %>% reorder(perm_perc %>% as.numeric))

# plotting
ggplot(df_byClass.s.f, aes(shared_perc, mean_value, 
                           color=perm_perc, 
                           ymin=mean_value-sd_value,
                           ymax=mean_value+sd_value)) +
    geom_pointrange(alpha=0.8, size=0.2) +
    geom_line() +
    scale_color_discrete('% of rank\nabundances\npermuted') +
    labs(x='% taxa shared among replicate pre-fraction communities') +
    facet_grid(variables ~ method, scales='free_y') +
    theme_bw() +
    theme(
        text = element_text(size=16),
        axis.title.y = element_blank()
    )

atom excess estimates

qSIP


In [60]:
atomX_files = glob.glob('*/*/*/*_atom.txt')
len(atomX_files)


Out[60]:
150

In [61]:
%%R -i atomX_files

df_atomX = list()
for(F in atomX_files){
    tmp = read.delim(F, sep='\t') 
    FF = strsplit(F, '/') %>% unlist
    tmp$shared_perc = FF[1]
    tmp$perm_perc = FF[2]
    tmp$rep = FF[3]
    tmp$file = FF[4]
    df_atomX[[F]] = tmp
}

df_atomX = do.call(rbind, df_atomX)
rownames(df_atomX) = 1:nrow(df_atomX)
df_atomX %>% head(n=3)


                                 taxon  control treatment       BD_diff
1       Acaryochloris_marina_MBIC11017 1.708081  1.708174  9.318302e-05
2 Acetobacter_pasteurianus_IFO_3283-03 1.711174  1.715641  4.466102e-03
3       Acetobacterium_woodii_DSM_1030 1.703430  1.699272 -4.157865e-03
  control_GC control_MW treatment_max_MW treatment_MW atom_fraction_excess
1  0.7427450   308.0594         317.6635     308.0762           0.00173042
2  0.7797938   308.0778         317.6634     308.8818           0.08295063
3  0.6870484   308.0318         317.6637     307.2799          -0.07719267
   atom_CI_low atom_CI_high shared_perc perm_perc rep
1 -0.020004565   0.02127429          80        30   7
2  0.008058996   0.14939521          80        30   7
3 -0.176977372   0.02339350          80        30   7
                              file
1 OTU_abs1e9_PCR_sub_qSIP_atom.txt
2 OTU_abs1e9_PCR_sub_qSIP_atom.txt
3 OTU_abs1e9_PCR_sub_qSIP_atom.txt

In [62]:
BDshift_files = glob.glob('*/*/*/BD-shift_stats.txt')
len(BDshift_files)


Out[62]:
150

In [63]:
%%R -i BDshift_files

df_shift = list()
for(F in BDshift_files){
    tmp = read.delim(F, sep='\t') 
    FF = strsplit(F, '/') %>% unlist
    tmp$shared_perc = FF[1]
    tmp$perm_perc = FF[2]
    tmp$rep = FF[3]
    tmp$file = FF[4]
    df_shift[[F]] = tmp
}

df_shift = do.call(rbind, df_shift)
rownames(df_shift) = 1:nrow(df_shift)

df_shift = df_shift %>%
    filter(library %in% c(2,4,6)) %>%
    group_by(taxon, shared_perc, perm_perc, rep) %>%
    summarize(median = median(median)) %>%
    ungroup() %>%
    rename('median_true_BD_shift' = median)

df_shift %>% head(n=3)


Source: local data frame [3 x 5]

                           taxon shared_perc perm_perc   rep
                          (fctr)       (chr)     (chr) (chr)
1 Acaryochloris_marina_MBIC11017         100        30     1
2 Acaryochloris_marina_MBIC11017         100        30    10
3 Acaryochloris_marina_MBIC11017         100        30     2
Variables not shown: median_true_BD_shift (dbl)

In [64]:
%%R
df.j = inner_join(df_atomX, df_shift, c('taxon' = 'taxon',
                                       'shared_perc'='shared_perc',
                                       'perm_perc'='perm_perc',
                                       'rep'='rep')) %>%
    filter(!is.na(BD_diff)) %>%
    mutate(true_incorporator = ifelse(median_true_BD_shift > 0.002, TRUE, FALSE),
           true_atom_fraction_excess = median_true_BD_shift / 0.036,
           atom_fraction_excess = ifelse(is.na(atom_CI_low), 0, atom_fraction_excess))
df.j %>% head(n=3)


                                 taxon  control treatment       BD_diff
1       Acaryochloris_marina_MBIC11017 1.708081  1.708174  9.318302e-05
2 Acetobacter_pasteurianus_IFO_3283-03 1.711174  1.715641  4.466102e-03
3       Acetobacterium_woodii_DSM_1030 1.703430  1.699272 -4.157865e-03
  control_GC control_MW treatment_max_MW treatment_MW atom_fraction_excess
1  0.7427450   308.0594         317.6635     308.0762           0.00173042
2  0.7797938   308.0778         317.6634     308.8818           0.08295063
3  0.6870484   308.0318         317.6637     307.2799          -0.07719267
   atom_CI_low atom_CI_high shared_perc perm_perc rep
1 -0.020004565   0.02127429          80        30   7
2  0.008058996   0.14939521          80        30   7
3 -0.176977372   0.02339350          80        30   7
                              file median_true_BD_shift true_incorporator
1 OTU_abs1e9_PCR_sub_qSIP_atom.txt                    0             FALSE
2 OTU_abs1e9_PCR_sub_qSIP_atom.txt                    0             FALSE
3 OTU_abs1e9_PCR_sub_qSIP_atom.txt                    0             FALSE
  true_atom_fraction_excess
1                         0
2                         0
3                         0

In [ ]:
%%R
# free memory
df_shift = df_atomX = NULL

In [65]:
%%R -h 250 
# difference between true and estimated
df.j.dis.qSIP = df.j %>%
    mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100) %>%
    group_by(shared_perc, perm_perc, true_incorporator) %>%
    summarize(mean_delta_excess = mean(delta_excess),
              sd_delta_excess = sd(delta_excess)) %>%
    ungroup() %>%
    mutate(shared_perc = shared_perc %>% reorder(shared_perc %>% as.numeric),
           perm_perc = perm_perc %>% reorder(perm_perc %>% as.numeric)) %>%
    filter(true_incorporator == TRUE)
   
# plotting
ggplot(df.j.dis.qSIP, aes(shared_perc, mean_delta_excess, 
                      color=perm_perc, group=perm_perc,
                      ymin=mean_delta_excess-sd_delta_excess,
                     ymax=mean_delta_excess+sd_delta_excess)) +
    geom_point() +
    geom_linerange(alpha=0.5, size=0.8) +
    geom_line() +
    scale_color_discrete('% taxa ranks\npermuted') +
    labs(x='% taxa shared', y='atom % excess\n(truth - estimate)') +
    theme_bw() +
    theme(
        text = element_text(size=16)
    )


deltaBD


In [66]:
dBD_files = glob.glob('*/*/*/*_dBD.txt')
len(dBD_files)


Out[66]:
150

In [67]:
%%R -i dBD_files

df_dBD = list()
for(F in dBD_files){
    tmp = read.delim(F, sep='\t') 
    FF = strsplit(F, '/') %>% unlist
    tmp$shared_perc = FF[1]
    tmp$perm_perc = FF[2]
    tmp$rep = FF[3]
    tmp$file = FF[4]
    df_dBD[[F]] = tmp
}

df_dBD = do.call(rbind, df_dBD)
rownames(df_dBD) = 1:nrow(df_dBD)
df_dBD %>% head(n=3)


                                 taxon mean_CM_control mean_CM_treatment
1       Acaryochloris_marina_MBIC11017        1.715176          1.706250
2 Acetobacter_pasteurianus_IFO_3283-03        1.723370          1.720191
3       Acetobacterium_woodii_DSM_1030        1.702231          1.697283
  stdev_CM_control stdev_CM_treatment     delta_BD shared_perc perm_perc rep
1      0.001707632        0.002344381 -0.008926406          80        30   7
2      0.002839386        0.006674992 -0.003179290          80        30   7
3      0.002627010        0.004448974 -0.004947882          80        30   7
                        file
1 OTU_abs1e9_PCR_sub_dBD.txt
2 OTU_abs1e9_PCR_sub_dBD.txt
3 OTU_abs1e9_PCR_sub_dBD.txt

In [ ]:
BDshift_files = glob.glob('*/*/*/BD-shift_stats.txt')
len(BDshift_files)


Out[ ]:
150

In [ ]:
%%R -i BDshift_files

df_shift = list()
for(F in BDshift_files){
    tmp = read.delim(F, sep='\t') 
    FF = strsplit(F, '/') %>% unlist
    tmp$shared_perc = FF[1]
    tmp$perm_perc = FF[2]
    tmp$rep = FF[3]
    tmp$file = FF[4]
    df_shift[[F]] = tmp
}

df_shift = do.call(rbind, df_shift)
rownames(df_shift) = 1:nrow(df_shift)

df_shift = df_shift %>%
    filter(library %in% c(2,4,6)) %>%
    group_by(taxon, shared_perc, perm_perc, rep) %>%
    summarize(median = median(median)) %>%
    ungroup() %>%
    rename('median_true_BD_shift' = median)

df_shift %>% head(n=3)

In [ ]:
%%R
df.j = inner_join(df_dBD, df_shift, c('taxon' = 'taxon',
                                       'shared_perc'='shared_perc',
                                       'perm_perc'='perm_perc',
                                       'rep'='rep')) %>%
    filter(!is.na(delta_BD)) %>%
    mutate(true_incorporator = ifelse(median_true_BD_shift > 0.002, TRUE, FALSE),
           true_atom_fraction_excess = median_true_BD_shift / 0.036,
           atom_fraction_excess = delta_BD / 0.036)
df.j %>% head(n=3)

In [ ]:
%%R
# free memory
df_shift = df_dBD = NULL

In [ ]:
%%R -h 250 
# difference between true and estimated
df.j.dis.dBD = df.j %>%
    mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100) %>%
    group_by(shared_perc, perm_perc, true_incorporator) %>%
    summarize(mean_delta_excess = mean(delta_excess),
              sd_delta_excess = sd(delta_excess)) %>%
    ungroup() %>%
    mutate(shared_perc = shared_perc %>% reorder(shared_perc %>% as.numeric),
           perm_perc = perm_perc %>% reorder(perm_perc %>% as.numeric)) %>%
    filter(true_incorporator == TRUE)
   
# plotting
ggplot(df.j.dis.dBD, aes(shared_perc, mean_delta_excess, 
                      color=perm_perc, group=perm_perc,
                      ymin=mean_delta_excess-sd_delta_excess,
                     ymax=mean_delta_excess+sd_delta_excess)) +
    geom_point() +
    geom_linerange(alpha=0.5, size=0.8) +
    geom_line() +
    scale_color_discrete('% taxa ranks\npermuted') +
    labs(x='% taxa shared', y='atom % excess\n(truth - estimate)') +
    theme_bw() +
    theme(
        text = element_text(size=16)
    )

combined plot


In [ ]:
%%R -w 670 -h 250
df.jj = rbind(df.j.dis.qSIP %>% mutate(method='qSIP'),
              df.j.dis.dBD %>% mutate(method='ΔBD'))

# plotting
ggplot(df.jj, aes(shared_perc, mean_delta_excess, 
                      color=perm_perc, group=perm_perc,
                      ymin=mean_delta_excess-sd_delta_excess,
                     ymax=mean_delta_excess+sd_delta_excess)) +
    geom_point() +
    geom_linerange(alpha=0.5, size=0.5) +
    geom_line() +
    geom_line() +
    scale_color_discrete('% taxa ranks\npermuted') +
    labs(x='% taxa shared', y='13C atom % excess\n(truth - estimate)') +
    facet_grid(. ~ method) +
    theme_bw() +
    theme(
        text = element_text(size=16)
    )

In [ ]: