Goal

  • Follow-up to: atomIncorp_taxaIncorp
  • Determining the effect of 'heavy' BD window (number of windows & window sizes) on HR-SIP accuracy
  • Apply a sparsity cutoff prior to selecting 'heavy' fraction samples
    • In other words, taxa must be present in most of the gradient fractions across the whole gradient

Variable parameters:

  • 'heavy' BD window sizes

Init


In [2]:
import os
import glob
import itertools
import nestly

In [3]:
%load_ext rpy2.ipython
%load_ext pushnote

In [4]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)


/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: 
Attaching package: ‘dplyr’


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following objects are masked from ‘package:stats’:

    filter, lag


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


  res = super(Function, self).__call__(*new_args, **new_kwargs)

BD min/max


In [5]:
## min G+C cutoff
min_GC = 13.5
## max G+C cutoff
max_GC = 80
## max G+C shift
max_13C_shift_in_BD = 0.036


min_BD = min_GC/100.0 * 0.098 + 1.66    
max_BD = max_GC/100.0 * 0.098 + 1.66    

max_BD = max_BD + max_13C_shift_in_BD

print 'Min BD: {}'.format(min_BD)
print 'Max BD: {}'.format(max_BD)


Min BD: 1.67323
Max BD: 1.7744

Nestly

  • assuming fragments already simulated

In [6]:
# paths
workDir = '/home/nick/notebook/SIPSim/dev/bac_genome1147/'
buildDir = os.path.join(workDir, 'atomIncorp_taxaIncorp_MW-HR-SIP_preSpar')
dataDir = os.path.join(workDir, 'atomIncorp_taxaIncorp')

In [7]:
if not os.path.isdir(buildDir):
    os.makedirs(buildDir)
%cd $buildDir


/home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar

In [8]:
# making an experimental design file for qSIP
x = range(1,7)
y = ['control', 'treatment']

expDesignFile = os.path.join(buildDir, 'qSIP_exp_design.txt')
with open(expDesignFile, 'wb') as outFH:
    for i,z in itertools.izip(x,itertools.cycle(y)):
        line = '\t'.join([str(i),z])
        outFH.write(line + '\n')

!head $expDesignFile


1	control
2	treatment
3	control
4	treatment
5	control
6	treatment

Nestly params


In [8]:
# building tree structure
nest = nestly.Nest()

# varying params
nest.add('percIncorp', [0, 15, 25, 50, 100])
nest.add('percTaxa', [1, 5, 10, 25, 50])
nest.add('rep', range(1,11))

## set params
nest.add('abs', ['1e9'], create_dir=False)
nest.add('np', [10], create_dir=False)
nest.add('Monte_rep', [100000], create_dir=False)
nest.add('subsample_dist', ['lognormal'], create_dir=False)
nest.add('subsample_mean', [9.432], create_dir=False)
nest.add('subsample_scale', [0.5], create_dir=False)
nest.add('subsample_min', [10000], create_dir=False)
nest.add('subsample_max', [30000], create_dir=False)
nest.add('min_BD', [min_BD], create_dir=False)
nest.add('max_BD', [max_BD], create_dir=False)
nest.add('DBL_scaling', [0.5], create_dir=False)
nest.add('bandwidth', [0.8], create_dir=False)
nest.add('heavy_BD_min', [1.71], create_dir=False)
nest.add('heavy_BD_max', [1.75], create_dir=False)
nest.add('topTaxaToPlot', [100], create_dir=False)
nest.add('padj', [0.1], create_dir=False)
nest.add('log2', [0.25], create_dir=False)
nest.add('occurs', ['0.0,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5'], create_dir=False)

### input/output files
nest.add('buildDir', [buildDir], create_dir=False)
nest.add('exp_design', [expDesignFile], create_dir=False)


# building directory tree
nest.build(buildDir)

# bash file to run
bashFile = os.path.join(buildDir, 'SIPSimRun.sh')

Copying input files


In [9]:
files = !find . -name "*.json"
dirs = [os.path.split(x)[0] for x in files]

srcFiles = ['OTU_abs1e9_PCR_sub_w.txt', 'OTU_abs1e9_PCR_sub_meta.txt', 'BD-shift_stats.txt']

for d in dirs:
    for f in srcFiles:
        f1 = os.path.join(dataDir, d, f)
        f2 = os.path.join(buildDir, d, f)
        cmd = 'cp -f {} {}'.format(f1, f2)
        !$cmd

Multi-window HR-SIP


In [10]:
bashFileTmp = os.path.splitext(bashFile)[0] + '_HRSIP_multi.sh'
bashFileTmp


Out[10]:
'/home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/SIPSimRun_HRSIP_multi.sh'

In [ ]:
%%writefile $bashFileTmp
#!/bin/bash

# phyloseq
## making phyloseq object from OTU table
SIPSimR phyloseq_make \
    OTU_abs{abs}_PCR_sub_w.txt \
    -s OTU_abs{abs}_PCR_sub_meta.txt \
    > OTU_abs{abs}_PCR_sub.physeq   
    
## HR SIP pipeline
SIPSimR phyloseq_DESeq2 \
    --log2 {log2} \
    --hypo greater \
    --cont 1,3,5 \
    --treat 2,4,6 \
    --occur_all {occurs} \
    -w 1.71-1.75 \
    --all OTU_abs1e9_PCR_sub_MW1_all.txt \
    OTU_abs{abs}_PCR_sub.physeq \
    > OTU_abs1e9_PCR_sub_MW1_DS2.txt
       
SIPSimR phyloseq_DESeq2 \
    --log2 {log2} \
    --hypo greater \
    --cont 1,3,5 \
    --treat 2,4,6 \
    --occur_all {occurs} \
    -w 1.71-1.78 \
    --all OTU_abs1e9_PCR_sub_MW2_all.txt \
    OTU_abs{abs}_PCR_sub.physeq \
    > OTU_abs1e9_PCR_sub_MW2_DS2.txt
    
SIPSimR phyloseq_DESeq2 \
    --log2 {log2} \
    --hypo greater \
    --cont 1,3,5 \
    --treat 2,4,6 \
    --occur_all {occurs} \
    -w 1.69-1.74,1.73-1.78 \
    --all OTU_abs1e9_PCR_sub_MW3_all.txt \
    OTU_abs{abs}_PCR_sub.physeq \
    > OTU_abs1e9_PCR_sub_MW3_DS2.txt
       
SIPSimR phyloseq_DESeq2 \
    --log2 {log2} \
    --hypo greater \
    --cont 1,3,5 \
    --treat 2,4,6 \
    --occur_all {occurs} \
    -w 1.70-1.73,1.72-1.75,1.74-1.77 \
    --all OTU_abs1e9_PCR_sub_MW4_all.txt \
    OTU_abs{abs}_PCR_sub.physeq \
    > OTU_abs1e9_PCR_sub_MW4_DS2.txt
  
SIPSimR phyloseq_DESeq2 \
    --log2 {log2} \
    --hypo greater \
    --cont 1,3,5 \
    --treat 2,4,6 \
    --occur_all {occurs} \
    -w 1.69-1.73,1.72-1.76,1.75-1.79 \
    --all OTU_abs1e9_PCR_sub_MW5_all.txt \
    OTU_abs{abs}_PCR_sub.physeq \
    > OTU_abs1e9_PCR_sub_MW5_DS2.txt


Writing /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/SIPSimRun_HRSIP_multi.sh

In [ ]:
!chmod 777 $bashFileTmp
!cd $workDir; \
    nestrun --template-file $bashFileTmp -d $buildDir --log-file HR-SIP_multi.log -j 10


2016-04-17 16:49:46,189 * INFO * Template: ./SIPSimRun_HRSIP_multi.sh
2016-04-17 16:49:46,191 * INFO * [191741] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/50/7
2016-04-17 16:49:46,192 * INFO * [191742] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/50/9
2016-04-17 16:49:46,193 * INFO * [191744] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/50/8
2016-04-17 16:49:46,195 * INFO * [191746] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/50/3
2016-04-17 16:49:46,196 * INFO * [191748] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/50/10
2016-04-17 16:49:46,197 * INFO * [191750] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/50/2
2016-04-17 16:49:46,198 * INFO * [191752] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/50/1
2016-04-17 16:49:46,199 * INFO * [191754] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/50/4
2016-04-17 16:49:46,201 * INFO * [191756] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/50/5
2016-04-17 16:49:46,206 * INFO * [191758] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/50/6
2016-04-17 16:59:41,365 * INFO * [191750] /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/50/2 Finished with 0
2016-04-17 16:59:41,367 * INFO * [198252] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/25/7
2016-04-17 16:59:48,006 * INFO * [191758] /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/50/6 Finished with 0
2016-04-17 16:59:48,009 * INFO * [198277] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/25/9
2016-04-17 17:00:07,051 * INFO * [191756] /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/50/5 Finished with 0
2016-04-17 17:00:07,056 * INFO * [198374] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/25/8
2016-04-17 17:00:21,585 * INFO * [191742] /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/50/9 Finished with 0
2016-04-17 17:00:21,587 * INFO * [198489] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/25/3
2016-04-17 17:00:24,853 * INFO * [191754] /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/50/4 Finished with 0
2016-04-17 17:00:24,855 * INFO * [198514] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/25/10
2016-04-17 17:00:32,431 * INFO * [191752] /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/50/1 Finished with 0
2016-04-17 17:00:32,433 * INFO * [198586] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/25/2
2016-04-17 17:00:41,976 * INFO * [191748] /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/50/10 Finished with 0
2016-04-17 17:00:41,978 * INFO * [198672] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/25/1
2016-04-17 17:00:42,815 * INFO * [191741] /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/50/7 Finished with 0
2016-04-17 17:00:42,818 * INFO * [198698] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/25/4
2016-04-17 17:01:18,580 * INFO * [191746] /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/50/3 Finished with 0
2016-04-17 17:01:18,582 * INFO * [199064] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/25/5
2016-04-17 17:01:23,782 * INFO * [191744] /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/50/8 Finished with 0
2016-04-17 17:01:23,784 * INFO * [199100] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/25/6
2016-04-17 17:10:25,316 * INFO * [198374] /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/25/8 Finished with 0
2016-04-17 17:10:25,318 * INFO * [203518] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/10/7
2016-04-17 17:10:35,839 * INFO * [198252] /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/25/7 Finished with 0
2016-04-17 17:10:35,841 * INFO * [203565] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/10/9
2016-04-17 17:11:04,335 * INFO * [198514] /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/25/10 Finished with 0
2016-04-17 17:11:04,337 * INFO * [203675] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/10/8
2016-04-17 17:11:14,606 * INFO * [198277] /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/25/9 Finished with 0
2016-04-17 17:11:14,608 * INFO * [203755] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/10/3
2016-04-17 17:11:20,741 * INFO * [198489] /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/25/3 Finished with 0
2016-04-17 17:11:20,743 * INFO * [203788] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/10/10
2016-04-17 17:11:23,561 * INFO * [198672] /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/25/1 Finished with 0
2016-04-17 17:11:23,563 * INFO * [203819] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/10/2
2016-04-17 17:11:33,901 * INFO * [198586] /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/25/2 Finished with 0
2016-04-17 17:11:33,904 * INFO * [204409] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/10/1
2016-04-17 17:11:53,727 * INFO * [198698] /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/25/4 Finished with 0
2016-04-17 17:11:53,729 * INFO * [204636] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/10/4
2016-04-17 17:12:01,198 * INFO * [199064] /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/25/5 Finished with 0
2016-04-17 17:12:01,200 * INFO * [204673] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/10/5
2016-04-17 17:12:29,854 * INFO * [199100] /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/25/6 Finished with 0
2016-04-17 17:12:29,856 * INFO * [204882] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/10/6
2016-04-17 17:20:51,079 * INFO * [203675] /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/10/8 Finished with 0
2016-04-17 17:20:51,081 * INFO * [208847] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/1/7
2016-04-17 17:21:25,099 * INFO * [203518] /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/10/7 Finished with 0
2016-04-17 17:21:25,101 * INFO * [208998] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/1/9
2016-04-17 17:21:43,116 * INFO * [203755] /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/10/3 Finished with 0
2016-04-17 17:21:43,118 * INFO * [209506] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/1/8
2016-04-17 17:21:51,365 * INFO * [203565] /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/10/9 Finished with 0
2016-04-17 17:21:51,367 * INFO * [209585] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/1/3
2016-04-17 17:22:04,142 * INFO * [203788] /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/10/10 Finished with 0
2016-04-17 17:22:04,145 * INFO * [209706] Started ./SIPSimRun_HRSIP_multi.sh in /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/50/1/10
**OUTPUT MUTED**

In [ ]:
%pushnote preSpar MW-HR-SIP complete

Making confusion matrices


In [ ]:
bashFileTmp = os.path.splitext(bashFile)[0] + '_cMtx.sh'
bashFileTmp

In [ ]:
%%writefile $bashFileTmp
#!/bin/bash

# HR-SIP multiple 'heavy' BD windows
SIPSimR DESeq2_confuseMtx \
    --libs 2,4,6 \
    --padj {padj} \
    -o DESeq2_MW1-cMtx \
    BD-shift_stats.txt \
    OTU_abs1e9_PCR_sub_MW1_DS2.txt

SIPSimR DESeq2_confuseMtx \
    --libs 2,4,6 \
    --padj {padj} \
    -o DESeq2_MW2-cMtx \
    BD-shift_stats.txt \
    OTU_abs1e9_PCR_sub_MW2_DS2.txt    
    
SIPSimR DESeq2_confuseMtx \
    --libs 2,4,6 \
    --padj {padj} \
    -o DESeq2_MW3-cMtx \
    BD-shift_stats.txt \
    OTU_abs1e9_PCR_sub_MW3_DS2.txt
    
    
SIPSimR DESeq2_confuseMtx \
    --libs 2,4,6 \
    --padj {padj} \
    -o DESeq2_MW4-cMtx \
    BD-shift_stats.txt \
    OTU_abs1e9_PCR_sub_MW4_DS2.txt
    
    
SIPSimR DESeq2_confuseMtx \
    --libs 2,4,6 \
    --padj {padj} \
    -o DESeq2_MW5-cMtx \
    BD-shift_stats.txt \
    OTU_abs1e9_PCR_sub_MW5_DS2.txt

In [ ]:
!chmod 777 $bashFileTmp
!cd $workDir; \
    nestrun --template-file $bashFileTmp -d $buildDir --log-file cMtx.log -j 10

Aggregating the confusion matrix data


In [ ]:
def agg_cMtx(prefix):
    # all data
    #!nestagg delim \
    #    -d $buildDir \
    #    -k percIncorp,percTaxa,rep \
    #    -o $prefix-cMtx_data.txt \
    #    --tab \
    #    $prefix-cMtx_data.txt

    # overall
    x = prefix + '-cMtx_overall.txt'
    !nestagg delim \
        -d $buildDir \
        -k percIncorp,percTaxa,rep \
        -o $x \
        --tab \
        $x

    # by class
    x = prefix + '-cMtx_byClass.txt'
    !nestagg delim \
        -d $buildDir \
        -k percIncorp,percTaxa,rep \
        -o $x \
        --tab \
        $x
        
agg_cMtx('DESeq2_MW1')
agg_cMtx('DESeq2_MW2')
agg_cMtx('DESeq2_MW3')
agg_cMtx('DESeq2_MW4')
agg_cMtx('DESeq2_MW5')

In [ ]:
%pushnote preSpar MW-HR-SIP run complete!

--End of simulation--


Plotting results


In [31]:
F = os.path.join(buildDir, '*-cMtx_byClass.txt')
files = glob.glob(F)
files


Out[31]:
['/home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/DESeq2_MW1-cMtx_byClass.txt',
 '/home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/DESeq2_MW3-cMtx_byClass.txt',
 '/home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/DESeq2_MW4-cMtx_byClass.txt',
 '/home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/DESeq2_MW2-cMtx_byClass.txt',
 '/home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/DESeq2_MW5-cMtx_byClass.txt']

In [32]:
%%R -i files

df_byClass = list()
for (f in files){
    ff = strsplit(f, '/') %>% unlist
    fff = ff[length(ff)]
    df_byClass[[fff]] = read.delim(f, sep='\t')
}

df_byClass = do.call(rbind, df_byClass)
df_byClass$file = gsub('\\.[0-9]+$', '', rownames(df_byClass))
df_byClass$method = gsub('-cMtx.+', '', df_byClass$file)
rownames(df_byClass) = 1:nrow(df_byClass)

df_byClass %>% head(n=3)


  library      variables    values percIncorp percTaxa rep
1       2    Sensitivity 0.6757991         50       50   7
2       2    Specificity 1.0000000         50       50   7
3       2 Pos Pred Value 1.0000000         50       50   7
                         file     method
1 DESeq2_MW1-cMtx_byClass.txt DESeq2_MW1
2 DESeq2_MW1-cMtx_byClass.txt DESeq2_MW1
3 DESeq2_MW1-cMtx_byClass.txt DESeq2_MW1

In [33]:
%%R
# renaming method
rename = data.frame(method = c('DESeq2_MW1', 'DESeq2_MW2', 'DESeq2_MW3', 'DESeq2_MW4', 'DESeq2_MW4'), 
                    method_new = c('1.71-1.75',
                                   '1.71-1.78',
                                   '1.69-1.74,\n1.73-1.78',
                                   '1.70-1.73,\n1.72-1.75,\n1.74-1.77',
                                   '1.69-1.73,\n1.72-1.76,\n1.75-1.79'))

df_byClass = inner_join(df_byClass, rename, c('method'='method')) %>%
    select(-method) %>%
    rename('method' = method_new) 

df_byClass$method = factor(df_byClass$method, levels=rename$method_new %>% as.vector)

df_byClass %>% head(n=3)


  library      variables    values percIncorp percTaxa rep
1       2    Sensitivity 0.6757991         50       50   7
2       2    Specificity 1.0000000         50       50   7
3       2 Pos Pred Value 1.0000000         50       50   7
                         file    method
1 DESeq2_MW1-cMtx_byClass.txt 1.71-1.75
2 DESeq2_MW1-cMtx_byClass.txt 1.71-1.75
3 DESeq2_MW1-cMtx_byClass.txt 1.71-1.75

In [34]:
%%R -w 800 -h 550
# summarize by SIPSim rep & library rep
df_byClass.s = df_byClass %>%
    group_by(method, percIncorp, percTaxa, variables) %>%
    summarize(mean_value = mean(values),
              sd_value = sd(values))

# plotting
ggplot(df_byClass.s, aes(variables, mean_value, color=method,
                         ymin=mean_value-sd_value,
                         ymax=mean_value+sd_value)) +
    geom_pointrange(alpha=0.8, size=0.2) +
    labs(y='Value') +
    facet_grid(percTaxa ~ percIncorp) +
    theme_bw() +
    theme(
        text = element_text(size=16),
        axis.title.x = element_blank(),
        axis.text.x = element_text(angle=45, hjust=1)
    )



In [35]:
%%R -w 850 -h 600
# summarize by SIPSim rep & library rep
vars = c('Balanced Accuracy', 'Sensitivity', 'Specificity')
df_byClass.s.f = df_byClass.s %>%
    filter(variables %in% vars) 

# plotting
ggplot(df_byClass.s.f, aes(variables, mean_value, fill=method,
                         ymin=mean_value-sd_value,
                         ymax=mean_value+sd_value)) +
    #geom_pointrange(alpha=0.8, size=0.2) +
    geom_bar(stat='identity', position='dodge', width=0.8) +
    geom_errorbar(stat='identity', position='dodge', width=0.8) +
    scale_y_continuous(breaks=seq(0, 1, 0.2)) +
    scale_fill_discrete('"Heavy" BD window(s)') +
    facet_grid(percTaxa ~ percIncorp) +
    theme_bw() +
    theme(
        text = element_text(size=16),
        axis.title.x = element_blank(),
        axis.text.x = element_text(angle=45, hjust=1),
        axis.title.y = element_blank()
    )



In [53]:
%%R -w 750 -h 550
# summarize by SIPSim rep & library rep
vars = c('Balanced Accuracy', 'Sensitivity', 'Specificity')
df_byClass.s.f = df_byClass.s %>%
    filter(variables %in% vars) %>%
    ungroup() %>%
    mutate(percTaxa = percTaxa %>% as.character,
           percTaxa = percTaxa %>% reorder(percTaxa %>% as.numeric))


# plotting
p.pnt = ggplot(df_byClass.s.f, aes(percIncorp, mean_value, 
                           color=percTaxa, 
                           group=percTaxa,
                           ymin=mean_value-sd_value,
                           ymax=mean_value+sd_value)) +
    geom_pointrange(alpha=0.8, size=0.2) +
    geom_line() +
    scale_color_discrete('% incorp-\norators') +
    labs(x='% taxa shared among replicate unfractionated communities') +
    facet_grid(method ~ variables) +
    theme_bw() +
    theme(
        text = element_text(size=16),
        axis.title.y = element_blank()
    )
p.pnt



In [54]:
%%R -i workDir
outFile = 'atomIncorp_taxaIncorp_MW-HR-SIP.pdf'
ggsave(outFile, p.pnt, width=9, height=7.3)
cat('File written:', file.path(getwd(), outFile), '\n')


File written: /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp_MW-HR-SIP_preSpar/atomIncorp_taxaIncorp_MW-HR-SIP.pdf 

Checking that specificity is not always 1 (perfect)


In [37]:
%%R -h 250 -w 650
df_byClass.sf = df_byClass %>% 
    filter(variables == 'Specificity') 

max_val = max(df_byClass.sf$values, na.rm=TRUE)

ggplot(df_byClass.sf, aes(values)) +
    geom_histogram() +
    scale_y_log10() +
    labs(x='Specificity') +
    theme_bw() +
    theme(
        text = element_text(size=16)
    )


/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

  res = super(Function, self).__call__(*new_args, **new_kwargs)

In [ ]:


In [ ]: