Accuracy as a function of isotope incorporation & community similarity among replicate microcosms
In [1]:
# paths
import os
workDir = '/ebio/abt3_projects/methanogen_host_evo/SIPSim_pt2/data/bac_genome1147/'
buildDir = os.path.join(workDir, 'microBetaDiv')
R_dir = '/ebio/abt3_projects/methanogen_host_evo/SIPSim_pt2/SIPSimR/scripts/'
fragFile = '/ebio/abt3_projects/methanogen_host_evo/SIPSim_pt2/data/bac_genome1147/ampFrags_kde.pkl'
genome_index = '/ebio/abt3_projects/methanogen_host_evo/SIPSim_pt2/data/bac_genome1147/genome_index.txt'
In [2]:
import glob
import itertools
import nestly
In [3]:
%load_ext pushmsg
In [4]:
if not os.path.isdir(buildDir):
os.makedirs(buildDir)
%cd $buildDir
In [5]:
## min G+C cutoff
min_GC = 13.5
## max G+C cutoff
max_GC = 80
## max G+C shift
max_13C_shift_in_BD = 0.036
min_BD = min_GC/100.0 * 0.098 + 1.66
max_BD = max_GC/100.0 * 0.098 + 1.66
max_BD = max_BD + max_13C_shift_in_BD
print('Min BD: {}'.format(min_BD))
print('Max BD: {}'.format(max_BD))
In [7]:
# making an experimental design file for qSIP
x = range(1,7)
y = ['control', 'treatment']
expDesignFile = os.path.join(buildDir, 'qSIP_exp_design.txt')
with open(expDesignFile, 'wb') as outFH:
for i,z in itertools.izip(x,itertools.cycle(y)):
line = '\t'.join([str(i),z])
outFH.write(line + '\n')
!head $expDesignFile
In [8]:
# building tree structure
nest = nestly.Nest()
# varying params
nest.add('shared_perc', [80, 85, 90, 95, 100])
nest.add('perm_perc', [0, 5, 10, 15, 20])
nest.add('rep', range(1,11))
## set params
nest.add('percIncorp', [50], create_dir=False)
nest.add('percTaxa', [10], create_dir=False)
nest.add('abs', ['1e9'], create_dir=False)
#nest.add('abs', ['1e7'], create_dir=False) # TEST
nest.add('np', [10], create_dir=False)
nest.add('Monte_rep', [100000], create_dir=False)
nest.add('subsample_dist', ['lognormal'], create_dir=False)
nest.add('subsample_mean', [9.432], create_dir=False)
nest.add('subsample_scale', [0.5], create_dir=False)
nest.add('subsample_min', [10000], create_dir=False)
nest.add('subsample_max', [30000], create_dir=False)
nest.add('min_BD', [min_BD], create_dir=False)
nest.add('max_BD', [max_BD], create_dir=False)
nest.add('DBL_scaling', [0.5], create_dir=False)
nest.add('bandwidth', [0.8], create_dir=False)
nest.add('heavy_BD_min', [1.71], create_dir=False)
nest.add('heavy_BD_max', [1.75], create_dir=False)
nest.add('topTaxaToPlot', [100], create_dir=False)
nest.add('padj', [0.1], create_dir=False)
nest.add('log2', [0.25], create_dir=False)
### input/output files
nest.add('buildDir', [buildDir], create_dir=False)
nest.add('R_dir', [R_dir], create_dir=False)
nest.add('genome_index', [genome_index], create_dir=False)
nest.add('fragFile', [fragFile], create_dir=False)
nest.add('exp_design', [expDesignFile], create_dir=False)
# building directory tree
nest.build(buildDir)
# bash file to run
bashFile = os.path.join(buildDir, 'SIPSimRun.sh')
In [9]:
bashFileTmp = os.path.splitext(bashFile)[0] + '_expDesign.sh'
bashFileTmp
Out[9]:
In [10]:
%%writefile $bashFileTmp
#!/bin/bash
source activate SIPSim
# OPENBLAS threads
export OMP_NUM_THREADS=1
echo '#-- Experimental design --#'
echo '# Making an isotope incorporation config file'
echo '## 3 replicate gradients for control & treatment'
SIPSim incorp_config_example \
--percIncorpUnif {percIncorp} \
--n_reps 3 \
> incorp.config
echo '# Selecting incorporator taxa'
echo '## This is to make the gradient replicates consistent (qSIP finds mean among replicates)'
SIPSim KDE_select_taxa \
-p {percTaxa} \
{fragFile} \
> incorporators.txt
echo '# Creating a community file (3 replicate control, 3 replicate treatment)'
SIPSim communities \
--richness 950 \
--config incorp.config \
--shared_perc {shared_perc} \
--perm_perc {perm_perc} \
{genome_index} \
> comm.txt
echo '# simulating gradient fractions'
SIPSim gradient_fractions \
--BD_min {min_BD} \
--BD_max {max_BD} \
comm.txt \
> fracs.txt
In [11]:
!chmod 755 $bashFileTmp
In [12]:
%%bash -s $workDir $bashFileTmp $buildDir
# offset job start to prevent conda activate errors
sleep $[ ( $RANDOM % 10 ) + 1 ]s
source activate py2_ley0.4
# change to working dir
cd $1
# run job
nestrun --template-file $2 -d $3 --log-file exp_design.log -j 20
In [13]:
%pushmsg "exp_design complete: $buildDir"
In [15]:
bashFileTmp = os.path.splitext(bashFile)[0] + '_SIPSim-pipeline.sh'
bashFileTmp
Out[15]:
In [16]:
%%writefile $bashFileTmp
#!/bin/bash
# offset job start to prevent conda activate errors
sleep $[ ( $RANDOM % 10 ) + 1 ]s
source activate SIPSim
# OPENBLAS threads
export OMP_NUM_THREADS=1
echo '#-- SIPSim pipeline --#'
echo '# Adding diffusion'
SIPSim diffusion \
-n {Monte_rep} \
--bw {bandwidth} \
--np {np} \
{fragFile} \
> ampFrags_KDE_dif.pkl
echo '# Adding DBL contamination; abundance-weighted smearing'
SIPSim DBL \
-n {Monte_rep} \
--comm comm.txt \
--commx {DBL_scaling} \
--np {np} \
ampFrags_KDE_dif.pkl \
> ampFrags_KDE_dif_DBL.pkl
echo '# Adding isotope incorporation to BD distribution'
SIPSim isotope_incorp \
-n {Monte_rep} \
--comm comm.txt \
--taxa incorporators.txt \
--np {np} \
ampFrags_KDE_dif_DBL.pkl \
incorp.config \
> ampFrags_KDE_dif_DBL_inc.pkl
echo '# Simulating an OTU table'
SIPSim OTU_table \
--abs {abs} \
--np {np} \
ampFrags_KDE_dif_DBL_inc.pkl \
comm.txt \
fracs.txt \
> OTU_abs{abs}.txt
echo '# Simulating PCR'
SIPSim OTU_PCR \
OTU_abs{abs}.txt \
> OTU_abs{abs}_PCR.txt
echo '# Subsampling from the OTU table (simulating sequencing of the DNA pool)'
SIPSim OTU_subsample \
--dist {subsample_dist} \
--dist_params mean:{subsample_mean},sigma:{subsample_scale} \
--min_size {subsample_min} \
--max_size {subsample_max} \
OTU_abs{abs}_PCR.txt \
> OTU_abs{abs}_PCR_sub.txt
echo '# Making a wide-formatted table'
SIPSim OTU_wide_long -w \
OTU_abs{abs}_PCR_sub.txt \
> OTU_abs{abs}_PCR_sub_w.txt
echo '# Making metadata (phyloseq: sample_data)'
SIPSim OTU_sample_data \
OTU_abs{abs}_PCR_sub.txt \
> OTU_abs{abs}_PCR_sub_meta.txt
#-- removing large intermediate files --#
rm -f ampFrags_KDE_dif.pkl
rm -f ampFrags_KDE_dif_DBL.pkl
rm -f ampFrags_KDE_dif_DBL_inc.pkl
In [17]:
!chmod 755 $bashFileTmp
In [ ]:
%%bash -s $workDir $bashFileTmp $buildDir
source activate py2_ley0.4
cd $1
nestrun --template-file $2 -d $3 --log-file SIPSim_pipeline.log -j 6 --stop-on-error
In [ ]:
%pushmsg "SIPSim pipeline complete: $buildDir"
In [ ]:
bashFileTmp = os.path.splitext(bashFile)[0] + '_SIPSim-summary.sh'
bashFileTmp
In [ ]:
%%writefile $bashFileTmp
#!/bin/bash
# offset job start to prevent conda activate errors
sleep $[ ( $RANDOM % 10 ) + 1 ]s
source activate SIPSim
echo "# Plotting taxon abundances"
# plotting 'raw' taxon abundances
Rscript {R_dir}OTU_taxonAbund.R \
OTU_abs{abs}.txt \
-r {topTaxaToPlot} \
-o OTU_abs{abs}
# plotting 'sequenced' taxon abundances
Rscript {R_dir}OTU_taxonAbund.R \
OTU_abs{abs}_PCR_sub.txt \
-r {topTaxaToPlot} \
-o OTU_abs{abs}_PCR_sub
In [ ]:
!chmod 755 $bashFileTmp
In [ ]:
%%bash -s $workDir $bashFileTmp $buildDir
source activate py2_ley0.4
cd $1
nestrun --template-file $2 -d $3 --log-file SIPSim_summary.log -j 20
In [ ]:
bashFileTmp = os.path.splitext(bashFile)[0] + '_HRSIP.sh'
bashFileTmp
In [ ]:
%%writefile $bashFileTmp
#!/bin/bash
# offset job start to prevent conda activate errors
sleep $[ ( $RANDOM % 10 ) + 1 ]s
source activate SIPSim
# phyloseq
## making phyloseq object from OTU table
Rscript {R_dir}phyloseq_make.R \
OTU_abs{abs}_PCR_sub_w.txt \
-s OTU_abs{abs}_PCR_sub_meta.txt \
> OTU_abs{abs}_PCR_sub.physeq
## filtering phyloseq object to just 'heavy' fractions
Rscript {R_dir}phyloseq_edit.R \
OTU_abs{abs}_PCR_sub.physeq \
--BD_min {heavy_BD_min} \
--BD_max {heavy_BD_max} \
> OTU_abs{abs}_PCR_sub_filt.physeq
## making ordination
Rscript {R_dir}phyloseq_ordination.R \
OTU_abs{abs}_PCR_sub_filt.physeq \
OTU_abs{abs}_PCR_sub_filt_bray-NMDS.pdf
# DESeq2
Rscript {R_dir}phyloseq_DESeq2.R \
--log2 {log2} \
--hypo greater \
--cont 1,3,5 \
--treat 2,4,6 \
OTU_abs{abs}_PCR_sub_filt.physeq \
> OTU_abs{abs}_PCR_sub_filt_DESeq2
In [ ]:
!chmod 755 $bashFileTmp
In [ ]:
%%bash -s $workDir $bashFileTmp $buildDir
source activate py2_ley0.4
cd $1
nestrun --template-file $2 -d $3 --log-file HR-SIP.log -j 20
In [ ]:
%pushmsg "HR-SIP complete: $buildDir"
In [ ]:
bashFileTmp = os.path.splitext(bashFile)[0] + '_MWHRSIP.sh'
bashFileTmp
In [ ]:
%%writefile $bashFileTmp
#!/bin/bash
# offset job start to prevent conda activate errors
sleep $[ ( $RANDOM % 10 ) + 1 ]s
source activate SIPSim
## HR SIP pipeline
Rscript {R_dir}phyloseq_DESeq2.R \
--log2 {log2} \
--hypo greater \
--cont 1,3,5 \
--treat 2,4,6 \
--occur_all 0.0,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5 \
-w 1.70-1.73,1.72-1.75,1.74-1.77 \
--all OTU_abs1e9_PCR_sub_MW-all.txt \
OTU_abs{abs}_PCR_sub.physeq \
> OTU_abs{abs}_PCR_sub_filt_MW_DESeq2
In [ ]:
!chmod 755 $bashFileTmp
In [ ]:
%%bash -s $workDir $bashFileTmp $buildDir
source activate py2_ley0.4
cd $1
nestrun --template-file $2 -d $3 --log-file MW-HR-SIP.log -j 20
In [ ]:
%pushmsg "MW-HR-SIP complete: $buildDir"
In [ ]:
bashFileTmp = os.path.splitext(bashFile)[0] + '_qSIP.sh'
bashFileTmp
In [ ]:
%%writefile $bashFileTmp
#!/bin/bash
# offset job start to prevent conda activate errors
sleep $[ ( $RANDOM % 10 ) + 1 ]s
source activate SIPSim
# OPENBLAS threads
export OMP_NUM_THREADS=1
# qSIP
SIPSim qSIP \
OTU_abs{abs}.txt \
OTU_abs{abs}_PCR_sub.txt \
> OTU_abs{abs}_PCR_sub_qSIP.txt
# qSIP: atom excess
SIPSim qSIP_atom_excess \
--np {np} \
OTU_abs{abs}_PCR_sub_qSIP.txt \
{exp_design} \
> OTU_abs{abs}_PCR_sub_qSIP_atom.txt
In [ ]:
!chmod 755 $bashFileTmp
In [ ]:
%%bash -s $workDir $bashFileTmp $buildDir
source activate py2_ley0.4
cd $1
nestrun --template-file $2 -d $3 --log-file qSIP.log -j 6
In [ ]:
%pushmsg "q-SIP complete: $buildDir"
In [ ]:
bashFileTmp = os.path.splitext(bashFile)[0] + '_dBD.sh'
bashFileTmp
In [ ]:
%%writefile $bashFileTmp
#!/bin/bash
# offset job start to prevent conda activate errors
sleep $[ ( $RANDOM % 10 ) + 1 ]s
source activate SIPSim
# OPENBLAS threads
export OMP_NUM_THREADS=1
#deltaBD
SIPSim deltaBD \
OTU_abs{abs}_PCR_sub.txt \
{exp_design} \
> OTU_abs{abs}_PCR_sub_dBD.txt
In [ ]:
!chmod 755 $bashFileTmp
In [ ]:
%%bash -s $workDir $bashFileTmp $buildDir
source activate py2_ley0.4
cd $1
nestrun --template-file $2 -d $3 --log-file deltaBD.log -j 20
In [ ]:
%pushmsg "deltaBD complete: $buildDir"
In [ ]:
bashFileTmp = os.path.splitext(bashFile)[0] + '_cMtx.sh'
bashFileTmp
In [ ]:
%%writefile $bashFileTmp
#!/bin/bash
# offset job start to prevent conda activate errors
sleep $[ ( $RANDOM % 10 ) + 1 ]s
source activate SIPSim
# HR-SIP
Rscript {R_dir}DESeq2_confuseMtx.R \
--libs 2,4,6 \
--padj {padj} \
BD-shift_stats.txt \
OTU_abs{abs}_PCR_sub_filt_DESeq2
# HR-SIP multiple 'heavy' BD windows
Rscript {R_dir}DESeq2_confuseMtx.R \
--libs 2,4,6 \
--padj {padj} \
-o DESeq2_multi-cMtx \
BD-shift_stats.txt \
OTU_abs{abs}_PCR_sub_filt_MW_DESeq2
# qSIP
Rscript {R_dir}qSIP_confuseMtx.R \
--libs 2,4,6 \
BD-shift_stats.txt \
OTU_abs{abs}_PCR_sub_qSIP_atom.txt
# heavy-SIP
Rscript {R_dir}heavy_confuseMtx.R \
--treat 2,4,6 \
--con 1,3,5 \
--method 1 \
BD-shift_stats.txt \
OTU_abs{abs}_PCR_sub.txt
In [ ]:
!chmod 755 $bashFileTmp
In [ ]:
%%bash -s $workDir $bashFileTmp $buildDir
source activate py2_ley0.4
cd $1
nestrun --template-file $2 -d $3 --log-file cMtx.log -j 20
In [ ]:
def agg_cMtx(prefix):
# all data
x = prefix + '-cMtx_data.txt'
!nestagg delim \
-d $buildDir \
-k shared_perc,perm_perc,rep \
-o $x \
--tab \
$x
# overall
x = prefix + '-cMtx_overall.txt'
!nestagg delim \
-d $buildDir \
-k shared_perc,perm_perc,rep \
-o $x \
--tab \
$x
# by class
x = prefix + '-cMtx_byClass.txt'
!nestagg delim \
-d $buildDir \
-k shared_perc,perm_perc,rep \
-o $x \
--tab \
$x
agg_cMtx('DESeq2')
agg_cMtx('DESeq2_multi')
agg_cMtx('qSIP')
agg_cMtx('heavy')
In [ ]:
%pushmsg "microBetaDiv complete!"
In [60]:
bashFileTmp = os.path.splitext(bashFile)[0] + '_betaDiv.sh'
bashFileTmp
Out[60]:
In [61]:
%%writefile $bashFileTmp
#!/bin/bash
source activate SIPSim
# beta diversity
Rscript {R_dir}comm_beta_div.R \
comm.txt \
> comm_betaDiv.txt
In [62]:
!chmod 755 $bashFileTmp
In [63]:
%%bash -s $workDir $bashFileTmp $buildDir
source activate py2_ley0.4
cd $1
nestrun --template-file $2 -d $3 --log-file comm_betaDiv.log -j 10
In [64]:
F = os.path.join(buildDir, '*-cMtx_byClass.txt')
files = glob.glob(F)
files
Out[64]:
In [65]:
# checking for errors
!find $buildDir -name "*log" | wc -l
!find $buildDir -name "*log" | xargs grep -i error
In [ ]: