In [138]:
# paths
import os
workDir = '/home/nick/notebook/SIPSim/dev/bac_genome1147/'
buildDir = os.path.join(workDir, 'atomIncorp_taxaIncorp')
R_dir = '/home/nick/notebook/SIPSim/lib/R/'
fragFile = '/home/nick/notebook/SIPSim/dev/bac_genome1147/validation/ampFrags_kde.pkl'
genome_index = '/var/seq_data/ncbi_db/genome/Jan2016/bac_complete_spec-rep1_rn/genome_index.txt'
In [139]:
import glob
import itertools
import nestly
In [140]:
%load_ext rpy2.ipython
%load_ext pushnote
In [141]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)
In [142]:
if not os.path.isdir(buildDir):
os.makedirs(buildDir)
%cd $buildDir
In [143]:
## min G+C cutoff
min_GC = 13.5
## max G+C cutoff
max_GC = 80
## max G+C shift
max_13C_shift_in_BD = 0.036
min_BD = min_GC/100.0 * 0.098 + 1.66
max_BD = max_GC/100.0 * 0.098 + 1.66
max_BD = max_BD + max_13C_shift_in_BD
print 'Min BD: {}'.format(min_BD)
print 'Max BD: {}'.format(max_BD)
In [144]:
# making an experimental design file for qSIP
x = range(1,7)
y = ['control', 'treatment']
expDesignFile = os.path.join(buildDir, 'qSIP_exp_design.txt')
with open(expDesignFile, 'wb') as outFH:
for i,z in itertools.izip(x,itertools.cycle(y)):
line = '\t'.join([str(i),z])
outFH.write(line + '\n')
!head $expDesignFile
In [170]:
# building tree structure
nest = nestly.Nest()
# varying params: test
#nest.add('percIncorp', [100])
#nest.add('percTaxa', [10])
#nest.add('rep', range(1,4))
# varying params
nest.add('percIncorp', [0, 15, 25, 50, 100])
nest.add('percTaxa', [1, 5, 10, 25, 50])
nest.add('rep', range(1,11))
## set params
nest.add('abs', ['1e9'], create_dir=False)
nest.add('np', [10], create_dir=False)
nest.add('Monte_rep', [100000], create_dir=False)
nest.add('subsample_dist', ['lognormal'], create_dir=False)
nest.add('subsample_mean', [9.432], create_dir=False)
nest.add('subsample_scale', [0.5], create_dir=False)
nest.add('subsample_min', [10000], create_dir=False)
nest.add('subsample_max', [30000], create_dir=False)
nest.add('min_BD', [min_BD], create_dir=False)
nest.add('max_BD', [max_BD], create_dir=False)
nest.add('DBL_scaling', [0.5], create_dir=False)
nest.add('bandwidth', [0.8], create_dir=False)
nest.add('heavy_BD_min', [1.71], create_dir=False)
nest.add('heavy_BD_max', [1.75], create_dir=False)
nest.add('topTaxaToPlot', [100], create_dir=False)
nest.add('padj', [0.1], create_dir=False)
nest.add('log2', [0.25], create_dir=False)
### input/output files
nest.add('buildDir', [buildDir], create_dir=False)
nest.add('R_dir', [R_dir], create_dir=False)
nest.add('genome_index', [genome_index], create_dir=False)
nest.add('fragFile', [fragFile], create_dir=False)
nest.add('exp_design', [expDesignFile], create_dir=False)
# building directory tree
nest.build(buildDir)
# bash file to run
bashFile = os.path.join(buildDir, 'SIPSimRun.sh')
In [179]:
bashFileTmp = os.path.splitext(bashFile)[0] + '_expDesign.sh'
bashFileTmp
Out[179]:
In [180]:
%%writefile $bashFileTmp
#!/bin/bash
echo '#-- Experimental design --#'
echo '# Making an isotope incorporation config file'
echo '## 3 replicate gradients for control & treatment'
SIPSim incorpConfigExample \
--percIncorpUnif {percIncorp} \
--n_reps 3 \
> incorp.config
echo '# Selecting incorporator taxa'
echo '## This is to make the gradient replicates consistent (qSIP finds mean among replicates)'
SIPSim KDE_selectTaxa \
-p {percTaxa} \
{fragFile} \
> incorporators.txt
echo '# Creating a community file (3 replicate control, 3 replicate treatment)'
SIPSim communities \
--config incorp.config \
{genome_index} \
> comm.txt
echo '# simulating gradient fractions'
SIPSim gradient_fractions \
--BD_min {min_BD} \
--BD_max {max_BD} \
comm.txt \
> fracs.txt
In [51]:
!chmod 777 $bashFileTmp
!cd $workDir; \
nestrun --template-file $bashFileTmp -d $buildDir --log-file exp_design.log -j 10
In [52]:
%pushnote exp_design complete: $buildDir
In [182]:
bashFileTmp = os.path.splitext(bashFile)[0] + '_SIPSim-pipeline.sh'
bashFileTmp
Out[182]:
In [183]:
%%writefile $bashFileTmp
#!/bin/bash
echo '#-- SIPSim pipeline --#'
echo '# Adding diffusion'
SIPSim diffusion \
-n {Monte_rep} \
--bw {bandwidth} \
--np {np} \
{fragFile} \
> ampFrags_KDE_dif.pkl
echo '# Adding DBL contamination; abundance-weighted smearing'
SIPSim DBL \
-n {Monte_rep} \
--comm comm.txt \
--commx {DBL_scaling} \
--np {np} \
ampFrags_KDE_dif.pkl \
> ampFrags_KDE_dif_DBL.pkl
echo '# Adding isotope incorporation to BD distribution'
SIPSim isotope_incorp \
-n {Monte_rep} \
--comm comm.txt \
--taxa incorporators.txt \
--np {np} \
ampFrags_KDE_dif_DBL.pkl \
incorp.config \
> ampFrags_KDE_dif_DBL_inc.pkl
echo '# Simulating an OTU table'
SIPSim OTU_table \
--abs {abs} \
--np {np} \
ampFrags_KDE_dif_DBL_inc.pkl \
comm.txt \
fracs.txt \
> OTU_abs{abs}.txt
echo '# Simulating PCR'
SIPSim OTU_PCR \
OTU_abs{abs}.txt \
> OTU_abs{abs}_PCR.txt
echo '# Subsampling from the OTU table (simulating sequencing of the DNA pool)'
SIPSim OTU_subsample \
--dist {subsample_dist} \
--dist_params mean:{subsample_mean},sigma:{subsample_scale} \
--min_size {subsample_min} \
--max_size {subsample_max} \
OTU_abs{abs}_PCR.txt \
> OTU_abs{abs}_PCR_sub.txt
echo '# Making a wide-formatted table'
SIPSim OTU_wideLong -w \
OTU_abs{abs}_PCR_sub.txt \
> OTU_abs{abs}_PCR_sub_w.txt
echo '# Making metadata (phyloseq: sample_data)'
SIPSim OTU_sampleData \
OTU_abs{abs}_PCR_sub.txt \
> OTU_abs{abs}_PCR_sub_meta.txt
#-- removing large intermediate files --#
rm -f ampFrags_KDE_dif.pkl
rm -f ampFrags_KDE_dif_DBL.pkl
rm -f ampFrags_KDE_dif_DBL_inc.pkl
In [ ]:
!chmod 777 $bashFileTmp
!cd $workDir; \
nestrun --template-file $bashFileTmp -d $buildDir --log-file SIPSim_pipeline.log -j 2
In [ ]:
%pushnote SIPSim pipeline complete: $buildDir
In [198]:
bashFileTmp = os.path.splitext(bashFile)[0] + '_SIPSim-summary.sh'
bashFileTmp
Out[198]:
In [199]:
%%writefile $bashFileTmp
#!/bin/bash
# plotting 'raw' taxon abundances
SIPSimR OTU_taxonAbund \
OTU_abs{abs}.txt \
-r {topTaxaToPlot} \
-o OTU_abs{abs}
# plotting 'sequenced' taxon abundances
SIPSimR OTU_taxonAbund \
OTU_abs{abs}_PCR_sub.txt \
-r {topTaxaToPlot} \
-o OTU_abs{abs}_PCR_sub
In [ ]:
!chmod 777 $bashFileTmp
!cd $workDir; \
nestrun --template-file $bashFileTmp -d $buildDir --log-file SIPSim_summary.log -j 10
In [176]:
bashFileTmp = os.path.splitext(bashFile)[0] + '_HRSIP.sh'
bashFileTmp
Out[176]:
In [177]:
%%writefile $bashFileTmp
#!/bin/bash
# phyloseq
## making phyloseq object from OTU table
SIPSimR phyloseq_make \
OTU_abs{abs}_PCR_sub_w.txt \
-s OTU_abs{abs}_PCR_sub_meta.txt \
> OTU_abs{abs}_PCR_sub.physeq
## filtering phyloseq object to just 'heavy' fractions
SIPSimR phyloseq_edit \
OTU_abs{abs}_PCR_sub.physeq \
--BD_min {heavy_BD_min} \
--BD_max {heavy_BD_max} \
> OTU_abs{abs}_PCR_sub_filt.physeq
## making ordination
SIPSimR phyloseq_ordination \
OTU_abs{abs}_PCR_sub_filt.physeq \
OTU_abs{abs}_PCR_sub_filt_bray-NMDS.pdf
# DESeq2
SIPSimR phyloseq_DESeq2 \
--log2 {log2} \
--hypo greater \
--cont 1,3,5 \
--treat 2,4,6 \
OTU_abs{abs}_PCR_sub_filt.physeq \
> OTU_abs{abs}_PCR_sub_filt_DESeq2
In [178]:
!chmod 777 $bashFileTmp
!cd $workDir; \
nestrun --template-file $bashFileTmp -d $buildDir --log-file HR-SIP.log -j 14
In [179]:
%pushnote HR-SIP complete: $buildDir
In [161]:
bashFileTmp = os.path.splitext(bashFile)[0] + '_MWHRSIP.sh'
bashFileTmp
Out[161]:
In [162]:
%%writefile $bashFileTmp
#!/bin/bash
## HR SIP pipeline
SIPSimR phyloseq_DESeq2 \
--log2 {log2} \
--hypo greater \
--cont 1,3,5 \
--treat 2,4,6 \
--occur_all 0.0,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5 \
-w 1.70-1.73,1.72-1.75,1.74-1.77 \
--all OTU_abs1e9_PCR_sub_MW-all.txt \
OTU_abs{abs}_PCR_sub.physeq \
> OTU_abs{abs}_PCR_sub_filt_MW_DESeq2
In [163]:
!chmod 777 $bashFileTmp
!cd $workDir; \
nestrun --template-file $bashFileTmp -d $buildDir --log-file MW-HR-SIP.log -j 14
In [164]:
%pushnote MW-HR-SIP complete: $buildDir
In [105]:
bashFileTmp = os.path.splitext(bashFile)[0] + '_qSIP.sh'
bashFileTmp
Out[105]:
In [106]:
%%writefile $bashFileTmp
#!/bin/bash
# qSIP
SIPSim qSIP \
OTU_abs{abs}.txt \
OTU_abs{abs}_PCR_sub.txt \
> OTU_abs{abs}_PCR_sub_qSIP.txt
# qSIP: atom excess
SIPSim qSIP_atomExcess \
--np {np} \
OTU_abs{abs}_PCR_sub_qSIP.txt \
{exp_design} \
> OTU_abs{abs}_PCR_sub_qSIP_atom.txt
In [ ]:
!chmod 777 $bashFileTmp
!cd $workDir; \
nestrun --template-file $bashFileTmp -d $buildDir --log-file qSIP.log -j 2
In [ ]:
%pushnote qSIP complete: $buildDir
In [86]:
bashFileTmp = os.path.splitext(bashFile)[0] + '_dBD.sh'
bashFileTmp
Out[86]:
In [87]:
%%writefile $bashFileTmp
#!/bin/bash
#deltaBD
SIPSim deltaBD \
OTU_abs{abs}_PCR_sub.txt \
{exp_design} \
> OTU_abs{abs}_PCR_sub_dBD.txt
In [89]:
!chmod 777 $bashFileTmp
!cd $workDir; \
nestrun --template-file $bashFileTmp -d $buildDir --log-file deltaBD.log -j 10
In [90]:
%pushnote deltaBD complete: $buildDir
In [185]:
bashFileTmp = os.path.splitext(bashFile)[0] + '_cMtx.sh'
bashFileTmp
Out[185]:
In [186]:
%%writefile $bashFileTmp
#!/bin/bash
# HR-SIP
SIPSimR DESeq2_confuseMtx \
--libs 2,4,6 \
--padj {padj} \
BD-shift_stats.txt \
OTU_abs{abs}_PCR_sub_filt_DESeq2
# HR-SIP multiple 'heavy' BD windows
SIPSimR DESeq2_confuseMtx \
--libs 2,4,6 \
--padj {padj} \
-o DESeq2_multi-cMtx \
BD-shift_stats.txt \
OTU_abs{abs}_PCR_sub_filt_MW_DESeq2
# qSIP
SIPSimR qSIP_confuseMtx \
--libs 2,4,6 \
BD-shift_stats.txt \
OTU_abs{abs}_PCR_sub_qSIP_atom.txt
# heavy-SIP
SIPSimR heavy_confuseMtx \
--libs 2,4,6 \
BD-shift_stats.txt \
OTU_abs{abs}_PCR_sub.txt
In [187]:
!chmod 777 $bashFileTmp
!cd $workDir; \
nestrun --template-file $bashFileTmp -d $buildDir --log-file cMtx.log -j 14
In [537]:
def agg_cMtx(prefix):
# all data
x = prefix + '-cMtx_data.txt'
!nestagg delim \
-d $buildDir \
-k percIncorp,percTaxa,rep \
-o $x \
--tab \
$x
# overall
x = prefix + '-cMtx_overall.txt'
!nestagg delim \
-d $buildDir \
-k percIncorp,percTaxa,rep \
-o $x \
--tab \
$x
# by class
x = prefix + '-cMtx_byClass.txt'
!nestagg delim \
-d $buildDir \
-k percIncorp,percTaxa,rep \
-o $x \
--tab \
$x
agg_cMtx('DESeq2')
agg_cMtx('DESeq2_multi')
agg_cMtx('qSIP')
agg_cMtx('heavy')
In [189]:
%pushnote atomIncorp_taxaIncorp complete!
In [774]:
F = os.path.join(buildDir, '*-cMtx_byClass.txt')
files = glob.glob(F)
files
Out[774]:
In [775]:
%%R -i files
df_byClass = list()
for (f in files){
ff = strsplit(f, '/') %>% unlist
fff = ff[length(ff)]
df_byClass[[fff]] = read.delim(f, sep='\t')
}
df_byClass = do.call(rbind, df_byClass)
df_byClass$file = gsub('\\.[0-9]+$', '', rownames(df_byClass))
df_byClass$method = gsub('-.+', '', df_byClass$file)
rownames(df_byClass) = 1:nrow(df_byClass)
df_byClass %>% head(n=3)
In [776]:
%%R
# renaming method
rename = data.frame(method = c('DESeq2', 'DESeq2_multi', 'heavy', 'qSIP'),
method_new = c('HR-SIP', 'MW-HR-SIP', 'Heavy-SIP', 'q-SIP'))
df_byClass = inner_join(df_byClass, rename, c('method'='method')) %>%
select(-method) %>%
rename('method' = method_new)
df_byClass %>% head(n=3)
In [777]:
%%R -w 800 -h 550
# summarize by SIPSim rep & library rep
df_byClass.s = df_byClass %>%
group_by(method, percIncorp, percTaxa, variables) %>%
summarize(mean_value = mean(values, na.rm=TRUE),
sd_value = sd(values, na.rm=TRUE))
# plotting
ggplot(df_byClass.s, aes(variables, mean_value, color=method,
ymin=mean_value-sd_value,
ymax=mean_value+sd_value)) +
geom_pointrange(alpha=0.8, size=0.2) +
labs(y='Value') +
facet_grid(percTaxa ~ percIncorp) +
theme_bw() +
theme(
text = element_text(size=16),
axis.title.x = element_blank(),
axis.text.x = element_text(angle=45, hjust=1)
)
In [778]:
%%R -w 800 -h 600
# summarize by SIPSim rep & library rep
vars = c('Balanced Accuracy', 'Sensitivity', 'Specificity')
df_byClass.s.f = df_byClass.s %>%
filter(variables %in% vars)
# plotting
ggplot(df_byClass.s.f, aes(variables, mean_value, fill=method,
ymin=mean_value-sd_value,
ymax=mean_value+sd_value)) +
#geom_pointrange(alpha=0.8, size=0.2) +
geom_bar(stat='identity', position='dodge', width=0.8) +
geom_errorbar(stat='identity', position='dodge', width=0.8) +
scale_y_continuous(breaks=seq(0, 1, 0.2)) +
labs(y='Value') +
facet_grid(percTaxa ~ percIncorp) +
theme_bw() +
theme(
text = element_text(size=16),
axis.title.x = element_blank(),
axis.text.x = element_text(angle=50, hjust=1)
)
In [779]:
%%R -w 600 -h 350
# check of qSIP
df_byClass.f = df_byClass %>%
filter(method=='q-SIP',
variables %in% c('Specificity', 'Sensitivity')) %>%
mutate(percTaxa = percTaxa %>% as.character,
percTaxa = percTaxa %>% reorder(percTaxa %>% as.numeric),
percIncorp = percIncorp %>% as.character,
percIncorp = percIncorp %>% reorder(percIncorp %>% as.numeric))
# plotting
ggplot(df_byClass.f, aes(percIncorp, values, color=percTaxa)) +
geom_boxplot() +
scale_color_discrete('% taxa as\nincorporators') +
labs(x='13C atom % excess', y='value') +
facet_grid(variables ~ .) +
theme_bw() +
theme(
text = element_text(size=16)
)
In [780]:
%%R -w 800 -h 450
# summarize by SIPSim rep & library rep
vars = c('Balanced Accuracy', 'Sensitivity', 'Specificity')
df_byClass.s.f = df_byClass.s %>%
filter(variables %in% vars) %>%
ungroup() %>%
mutate(percTaxa = percTaxa %>% as.character,
percTaxa = percTaxa %>% reorder(percTaxa %>% as.numeric))
# plotting
p.pnt = ggplot(df_byClass.s.f, aes(percIncorp, mean_value,
color=percTaxa,
group=percTaxa,
ymin=mean_value-sd_value,
ymax=mean_value+sd_value)) +
geom_point(alpha=0.8) +
geom_linerange(alpha=0.8, size=0.5) +
geom_line() +
scale_color_discrete('% incorp-\norators') +
labs(x='atom % excess 13C') +
facet_grid(variables ~ method, scale='free_y') +
theme_bw() +
theme(
text = element_text(size=16),
axis.title.y = element_blank()
)
p.pnt
In [781]:
%%R
outFile = 'atomIncorp_taxaIncorp_acc.pdf'
ggsave(outFile, p.pnt, width=10, height=6)
cat('File written:', file.path(getwd(), outFile), '\n')
In [852]:
%%R
n_taxa = 1100
perc_incorp = 50
perc_taxa = 5
RP = n_taxa * (perc_taxa / 100)
RN = n_taxa - RP
In [853]:
%%R
# FP
df_byClass.s.f.f = df_byClass.s.f %>%
filter(variables=='Specificity',
percIncorp==perc_incorp,
percTaxa==perc_taxa) %>%
mutate(FP = RN * (1-mean_value),
FP_sd = FP * sd_value)
df_byClass.s.f.f %>% as.data.frame
In [854]:
%%R
# TP
df_byClass.s.f.f = df_byClass.s.f %>%
filter(variables=='Sensitivity',
percIncorp==perc_incorp,
percTaxa==perc_taxa) %>%
mutate(TP = RP * mean_value,
TP_sd = RP * sd_value)
df_byClass.s.f.f %>% as.data.frame
In [855]:
%%R
n_taxa = 1100
perc_incorp = 25
perc_taxa = 5
In [856]:
%%R
# FP
df_byClass.s.f.f = df_byClass.s.f %>%
filter(variables=='Specificity',
percIncorp==perc_incorp,
percTaxa==perc_taxa) %>%
mutate(FP = RN * (1-mean_value),
FP_sd = FP * sd_value)
df_byClass.s.f.f %>% as.data.frame
In [857]:
%%R
# TP
df_byClass.s.f.f = df_byClass.s.f %>%
filter(variables=='Sensitivity',
percIncorp==perc_incorp,
percTaxa==perc_taxa) %>%
mutate(TP = RP * mean_value,
TP_sd = RP * sd_value)
df_byClass.s.f.f %>% as.data.frame
In [782]:
F = os.path.join(buildDir, '*-cMtx_data.txt')
files = glob.glob(F)
files
Out[782]:
In [783]:
%%R -i files
# calling TP,FP,TN,FN
Clsfy = function(known, pred){
#print(known == TRUE)
#print(pred == TRUE)
if(known == TRUE & pred == TRUE){
return('TP')
} else
if(known == TRUE & pred == FALSE){
return('FN')
} else
if(known == FALSE & pred == FALSE){
return('TN')
} else
if(known == FALSE & pred == TRUE){
return('FP')
} else {
stop('logic error')
}
}
# loading data
df_data = list()
for (f in files){
ff = strsplit(f, '/') %>% unlist
fff = ff[length(ff)]
df_data[[fff]] = read.delim(f, sep='\t') %>%
rowwise() %>%
mutate(clsfy = Clsfy(incorp.known, incorp.pred)) %>%
dplyr::select(library, taxon, incorp.known, incorp.pred,
clsfy, percIncorp, percTaxa, rep) %>%
group_by(library, percIncorp, percTaxa, rep, clsfy) %>%
summarize(n_clsfy = n()) %>%
ungroup()
}
# combining data
df_data = do.call(rbind, df_data)
df_data$file = gsub('\\.[0-9]+$', '', rownames(df_data))
df_data$method = gsub('-.+', '', df_data$file)
rownames(df_data) = 1:nrow(df_data)
# status
df_data %>% head(n=3)
In [784]:
%%R
# renaming method
rename = data.frame(method = c('DESeq2', 'DESeq2_multi', 'heavy', 'qSIP'),
method_new = c('HR-SIP', 'MW-HR-SIP', 'Heavy-SIP', 'q-SIP'))
df_data = inner_join(df_data, rename, c('method'='method')) %>%
select(-method) %>%
rename('method' = method_new)
# status
df_data %>% head(n=3)
In [785]:
%%R -w 800 -h 250
as.Num = function(x) x %>% as.character %>% as.numeric
# summarizing
df_data_s = df_data %>%
spread(clsfy, n_clsfy, fill=0) %>%
mutate(FP = FP %>% as.Num,
TP = TP %>% as.Num,
FN = FN %>% as.Num,
FN = ifelse(is.na(FN), 0, FN),
FP = ifelse(is.na(FP), 0, FP),
TP = ifelse(is.na(TP), 0, TP))%>%
rowwise %>%
mutate(FP = ifelse(FP / (FP+TN) < 0.01, 0, FP),
FP_frac = FP / (FP+TP),
FP_frac = ifelse(is.na(FP_frac), 0, FP_frac)) %>%
group_by(method, percIncorp, percTaxa) %>%
summarize(mean_FP_frac = mean(FP_frac, na.rm=TRUE),
sd_FP_frac = sd(FP_frac, na.rm=TRUE)) %>%
ungroup() %>%
mutate(percTaxa = percTaxa %>% as.character,
percTaxa = percTaxa %>% reorder(percTaxa %>% as.numeric))
#df_data_s %>% tail %>% as.data.frame
y.lab = expression(frac('False incorporators', 'Total incorporators'))
# plotting
p.FP = ggplot(df_data_s, aes(percIncorp, mean_FP_frac,
color=percTaxa, group=percTaxa,
ymin=mean_FP_frac-sd_FP_frac,
ymax=mean_FP_frac+sd_FP_frac)) +
geom_point(alpha=0.8) +
geom_linerange(alpha=0.8, size=0.5) +
geom_line() +
#scale_y_continuous(limits=c(-0.16,1.00)) +
scale_color_discrete('% incorp-\norators') +
labs(x='atom % excess 13C',
y=y.lab) +
#y='false incorporators\nthat are false') +
facet_grid( ~ method, scales='free_y') +
theme_bw() +
theme(
text = element_text(size=16)
)
p.FP
In [786]:
%%R -w 800 -h 250
# theoretical number of taxa
df.ntaxa = data.frame(
'variables' = rep('Specificity', 3),
'ntaxa' = c(500, 1100, 2000)
)
df_byClass.j = inner_join(df_byClass, df.ntaxa, c('variables'='variables')) %>%
mutate(nFP = (1-values) * ntaxa) %>%
group_by(method, percIncorp, percTaxa, ntaxa) %>%
summarize(mean_nFP = mean(nFP, na.rm=TRUE),
sd_nFP = sd(nFP, na.rm=TRUE)) %>%
ungroup() %>%
mutate(percTaxa = percTaxa %>% as.character,
percTaxa = percTaxa %>% reorder(percTaxa %>% as.numeric),
ntaxa = ntaxa %>% as.character) %>%
filter(ntaxa == 1100)
# plotting
p.FP = ggplot(df_byClass.j, aes(percIncorp, mean_nFP,
color=percTaxa, group=percTaxa,
ymin=mean_nFP-sd_nFP,
ymax=mean_nFP+sd_nFP)) +
geom_point(alpha=0.8) +
geom_linerange(alpha=0.8, size=0.5) +
geom_line() +
scale_color_discrete('% incorp-\norators') +
labs(x='13C atom % excess',
y='False incorporators') +
facet_grid( ~ method, scales='free_y') +
theme_bw() +
theme(
text = element_text(size=16)
)
p.FP
In [787]:
%%R
g_legend = function(a.gplot){
tmp = ggplot_gtable(ggplot_build(a.gplot))
leg = which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
legend = tmp$grobs[[leg]]
return(legend)
}
p.leg = g_legend(p.pnt)
In [788]:
%%R -w 800 -h 650
p.pnt.e = p.pnt + theme(legend.position='none')
p.FP.e = p.FP + theme(legend.position='none')
p.comb = cowplot::ggdraw() +
geom_rect(aes(xmin=0, ymin=0, xmax=1, ymax=1), color='white', fill='white') +
#cowplot::draw_plot(p.pnt.e, 0.055, 0.31, 0.84, 0.69) +
cowplot::draw_plot(p.pnt.e, 0.03, 0.31, 0.865, 0.69) +
cowplot::draw_plot(p.FP.e, 0, 0, 0.87, 0.3) +
cowplot::draw_plot(p.leg, 0.90, 0.38, 0.08, 0.3) +
cowplot::draw_plot_label(c('A)', 'B)'), c(0, 0), c(1, 0.32))
p.comb
In [789]:
%%R
outFile = 'atomIncorp_taxaIncorp_acc-FP.pdf'
ggsave(outFile, p.comb, width=10, height=8.5)
cat('File written:', file.path(getwd(), outFile), '\n')
In [718]:
atomX_files = glob.glob('*/*/*/*_atom.txt')
len(atomX_files)
Out[718]:
In [719]:
%%R -i atomX_files
df_atomX = list()
for(F in atomX_files){
tmp = read.delim(F, sep='\t')
FF = strsplit(F, '/') %>% unlist
tmp$percIncorp = FF[1]
tmp$percTaxa = FF[2]
tmp$rep = FF[3]
tmp$file = FF[4]
df_atomX[[F]] = tmp
}
df_atomX = do.call(rbind, df_atomX)
rownames(df_atomX) = 1:nrow(df_atomX)
df_atomX %>% head(n=3)
In [720]:
BDshift_files = glob.glob('*/*/*/BD-shift_stats.txt')
len(BDshift_files)
Out[720]:
In [721]:
%%R -i BDshift_files
df_shift = list()
for(F in BDshift_files){
tmp = read.delim(F, sep='\t')
FF = strsplit(F, '/') %>% unlist
tmp$percIncorp = FF[1]
tmp$percTaxa = FF[2]
tmp$rep = FF[3]
tmp$file = FF[4]
df_shift[[F]] = tmp
}
df_shift = do.call(rbind, df_shift)
rownames(df_shift) = 1:nrow(df_shift)
df_shift = df_shift %>%
filter(library %in% c(2,4,6)) %>%
group_by(taxon, percIncorp, percTaxa, rep) %>%
summarize(median = median(median)) %>%
ungroup() %>%
rename('median_true_BD_shift' = median)
df_shift %>% head(n=3)
In [722]:
%%R
# table join
df_atomX %>% nrow %>% print
df.j = left_join(df_atomX, df_shift, c('taxon' = 'taxon',
'percIncorp'='percIncorp',
'percTaxa'='percTaxa',
'rep'='rep')) %>%
filter(!is.na(BD_diff)) %>%
mutate(true_incorporator = ifelse(median_true_BD_shift > 0.002, TRUE, FALSE),
true_atom_fraction_excess = median_true_BD_shift / 0.036,
atom_fraction_excess = ifelse(is.na(atom_CI_low), 0, atom_fraction_excess))
df.j %>% nrow %>% print
df.j %>% head(n=3)
In [723]:
%%R
# free memory
df_shift = df_atomX = NULL
# saving dataframe (if needed)
df.j.qSIP = df.j
In [724]:
%%R -h 250
# difference between true and estimated
df.j.dis.qSIP = df.j %>%
mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100) %>%
group_by(percIncorp, percTaxa, true_incorporator) %>%
summarize(mean_delta_excess = mean(delta_excess),
sd_delta_excess = sd(delta_excess)) %>%
ungroup() %>%
mutate(percIncorp = percIncorp %>% reorder(percIncorp %>% as.numeric),
percTaxa = percTaxa %>% reorder(percTaxa %>% as.numeric)) %>%
filter(true_incorporator == TRUE)
# plotting
ggplot(df.j.dis.qSIP, aes(percIncorp, mean_delta_excess,
color=percTaxa, group=percTaxa,
ymin=mean_delta_excess-sd_delta_excess,
ymax=mean_delta_excess+sd_delta_excess)) +
geom_pointrange(alpha=0.8, size=0.2) +
geom_line() +
scale_color_discrete('% taxa as\nincorporators') +
labs(x='13C atom % excess (truth)',
y='13C atom % excess\n(truth - estimate)') +
theme_bw() +
theme(
text = element_text(size=16)
)
In [725]:
%%R
# how much of an under-estimate?
tmp = df.j.dis.qSIP %>%
mutate(percIncorp = percIncorp %>% as.character %>% as.numeric) %>%
mutate(X = (-mean_delta_excess)/percIncorp * 100)
tmp$X %>% summary
In [726]:
%%R -w 600 -h 300
df.j.s = df.j %>%
#filter(!is.na(atom_CI_low)) %>%
mutate(pred_incorp = ifelse(atom_CI_low > 0, TRUE, FALSE),
pred_incorp = ifelse(is.na(pred_incorp), FALSE, pred_incorp),
FP = ifelse((pred_incorp==TRUE & true_incorporator==FALSE), 1, 0),
TP = ifelse((pred_incorp==TRUE & true_incorporator==TRUE), 1, 0),
FN = ifelse((pred_incorp==FALSE & true_incorporator==TRUE), 1, 0),
TN = ifelse((pred_incorp==FALSE & true_incorporator==FALSE), 1, 0)) %>%
group_by(percIncorp,percTaxa,rep) %>%
summarize(n_FP = sum(FP),
n_TP = sum(TP),
n_FN = sum(FN),
n_TN = sum(TN),
n = n(),
specificity = n_TN / (n_TN+n_FP),
sensitivity = n_TP / (n_TP+n_FN)) %>%
ungroup() %>%
mutate(percIncorp = percIncorp %>% as.character,
percIncorp = percIncorp %>% reorder(percIncorp %>% as.numeric),
percTaxa = percTaxa %>% reorder(percTaxa %>% as.numeric))
# plotting
ggplot(df.j.s, aes(percIncorp, specificity, color=percTaxa)) +
geom_boxplot() +
scale_y_continuous(limits=c(0.55, 1)) +
scale_color_discrete('% taxa as\nincorporators') +
labs(x='13C atom % excess', y='specificity') +
theme_bw() +
theme(
text = element_text(size=16)
)
In [727]:
dBD_files = glob.glob('*/*/*/*_dBD.txt')
len(dBD_files)
Out[727]:
In [728]:
%%R -i dBD_files
df_dBD = list()
for(F in dBD_files){
tmp = read.delim(F, sep='\t')
FF = strsplit(F, '/') %>% unlist
tmp$percIncorp = FF[1]
tmp$percTaxa = FF[2]
tmp$rep = FF[3]
tmp$file = FF[4]
df_dBD[[F]] = tmp
}
df_dBD = do.call(rbind, df_dBD)
rownames(df_dBD) = 1:nrow(df_dBD)
df_dBD %>% head(n=3)
In [729]:
BDshift_files = glob.glob('*/*/*/BD-shift_stats.txt')
len(BDshift_files)
Out[729]:
In [730]:
%%R -i BDshift_files
df_shift = list()
for(F in BDshift_files){
tmp = read.delim(F, sep='\t')
FF = strsplit(F, '/') %>% unlist
tmp$percIncorp = FF[1]
tmp$percTaxa = FF[2]
tmp$rep = FF[3]
tmp$file = FF[4]
df_shift[[F]] = tmp
}
df_shift = do.call(rbind, df_shift)
rownames(df_shift) = 1:nrow(df_shift)
df_shift = df_shift %>%
filter(library %in% c(2,4,6)) %>%
group_by(taxon, percIncorp, percTaxa, rep) %>%
summarize(median = median(median)) %>%
ungroup() %>%
rename('median_true_BD_shift' = median)
df_shift %>% head(n=3)
In [731]:
%%R
df.j = inner_join(df_dBD, df_shift, c('taxon' = 'taxon',
'percIncorp'='percIncorp',
'percTaxa'='percTaxa',
'rep'='rep')) %>%
filter(!is.na(delta_BD)) %>%
mutate(true_incorporator = ifelse(median_true_BD_shift > 0.002, TRUE, FALSE),
true_atom_fraction_excess = median_true_BD_shift / 0.036,
atom_fraction_excess = delta_BD / 0.036)
df.j %>% head(n=3)
In [732]:
%%R
# free memory
df_shift = df_dBD = NULL
# saving dataframe (if needed)
df.j.dBD = df.j
In [733]:
%%R -h 250
# difference between true and estimated
df.j.dis.dBD = df.j %>%
mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100) %>%
group_by(percIncorp, percTaxa, true_incorporator) %>%
summarize(mean_delta_excess = mean(delta_excess),
sd_delta_excess = sd(delta_excess)) %>%
ungroup() %>%
mutate(percIncorp = percIncorp %>% reorder(percIncorp %>% as.numeric),
percTaxa = percTaxa %>% reorder(percTaxa %>% as.numeric)) %>%
filter(true_incorporator == TRUE)
# plotting
ggplot(df.j.dis.dBD, aes(percIncorp, mean_delta_excess,
color=percTaxa, group=percTaxa,
ymin=mean_delta_excess-sd_delta_excess,
ymax=mean_delta_excess+sd_delta_excess)) +
geom_pointrange(alpha=0.8, size=0.2) +
geom_line() +
scale_color_discrete('% taxa as\nincorporators') +
labs(x='13C atom % excess (truth)', y='13C atom % excess\n(truth - estimate)') +
theme_bw() +
theme(
text = element_text(size=16)
)
In [734]:
%%R
# how much of an under-estimate?
tmp = df.j.dis.dBD %>%
filter(percIncorp==100) %>%
mutate(percIncorp = percIncorp %>% as.character %>% as.numeric) %>%
mutate(X = (-mean_delta_excess)/percIncorp * 100)
tmp$X %>% summary
In [746]:
%%R -w 650 -h 250
df.jj = rbind(df.j.dis.qSIP %>% mutate(method='qSIP'),
df.j.dis.dBD %>% mutate(method='Delta BD')) %>%
mutate(method = gsub('qSIP', 'q-SIP', method))
# plotting
p.comb = ggplot(df.jj, aes(percIncorp, mean_delta_excess,
color=percTaxa, group=percTaxa,
ymin=mean_delta_excess-sd_delta_excess,
ymax=mean_delta_excess+sd_delta_excess)) +
geom_point() +
geom_linerange(alpha=0.5, size=1) +
geom_line() +
scale_color_discrete('% taxa as\nincorp-\norators') +
labs(x='atom % excess 13C (truth)', y='atom % excess 13C\n(truth - estimate)') +
facet_grid(. ~ method) +
theme_bw() +
theme(
text = element_text(size=16)
)
p.comb
In [747]:
%%R
df.jj = rbind(df.j.qSIP %>%
dplyr::select(atom_fraction_excess, true_atom_fraction_excess, percIncorp,
true_incorporator, percTaxa) %>%
mutate(method='qSIP'),
df.j.dBD %>%
dplyr::select(atom_fraction_excess, true_atom_fraction_excess, percIncorp,
true_incorporator, percTaxa) %>%
mutate(method='Delta BD')) %>%
filter(percTaxa == '10',
true_incorporator==TRUE,
atom_fraction_excess!=0) %>%
mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100,
percIncorp = percIncorp %>% reorder(percIncorp %>% as.numeric),
percTaxa = percTaxa %>% reorder(percTaxa %>% as.numeric),
atom_perc_excess = atom_fraction_excess * 100,
true_atom_perc_excess = true_atom_fraction_excess * 100,
method = gsub('qSIP', 'q-SIP', method))
df.jj %>% head(n=3)
In [753]:
%%R -w 700 -h 300
tmp = df.jj %>%
dplyr::select(true_atom_perc_excess, percIncorp, method) %>%
distinct(percIncorp, method)
p.dens = ggplot(df.jj, aes(atom_perc_excess)) +
geom_density(fill='grey70') +
geom_vline(data=tmp, aes(xintercept=true_atom_perc_excess), linetype='dashed', alpha=0.5) +
scale_x_continuous(limits=c(-20, 120)) +
labs(x='atom % excess 13C (estimate)', y='Probability density') +
facet_grid(method ~ percIncorp) +
theme_bw() +
theme(
text = element_text(size=16)
)
p.dens
In [757]:
%%R -w 700 -h 500
p.comb2 = cowplot::ggdraw() +
geom_rect(aes(xmin=0, ymin=0, xmax=1, ymax=1), color='white', fill='white') +
cowplot::draw_plot(p.comb, 0.02, 0.61, 0.98, 0.38) +
cowplot::draw_plot(p.dens, 0, 0, 0.98, 0.60) +
cowplot::draw_plot_label(c('A)', 'B)'), c(0, 0), c(1, 0.61))
p.comb2
In [758]:
%%R
outFile = 'atomIncorp_taxaIncorp_qSIP-dBD.pdf'
ggsave(outFile, p.comb2, width=10, height=7.15)
cat('File written:', file.path(getwd(), outFile), '\n')
In [93]:
import glob
In [194]:
def find_missing_files(filepath, empty_cut=10000):
# directories
dirpath = os.path.split(filepath)[0]
chk = os.path.join(buildDir, dirpath) + '/'
D = set([os.path.split(x)[0] for x in glob.glob(chk)])
# files
chk = os.path.join(buildDir, filepath)
F = glob.glob(chk)
# check for missing files
Fd = set([os.path.split(f)[0] for f in F])
missing = D - Fd
print 'Union length: {}'.format(len(D | Fd))
print 'Number of missing: {}'.format(len(missing))
# check for empty files
empties = [os.path.split(f)[0] for f in F if os.path.getsize(f) < empty_cut]
print 'Number of empties: {}'.format(len(empties))
# ret
return {'missing' : list(missing), 'empty' : empties}
find_missing_files('*/*/*/OTU_abs1e9.txt')
Out[194]:
In [195]:
pipeline_me = find_missing_files('*/*/*/OTU_abs1e9.txt')
In [196]:
pipeline_me
Out[196]:
In [197]:
# rerunning
exe = '/home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp/100/25/6/SIPSimRun_SIPSim-pipeline.sh'
exe += ' 2> /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp/100/25/6/SIPSim_pipeline.log'
!$exe
%pushnote SIPSim pipeline rerun complete
In [205]:
exe = '/home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp/100/25/6/SIPSimRun_SIPSim-summary.sh'
exe += ' 2> /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp/100/25/6/SIPSim_summary.log'
!$exe
%pushnote pipeline summary rerun complete
In [202]:
# exe = '/home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp/100/25/6/SIPSimRun_HRSIP.sh'
# exe += ' 2> /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp/100/25/6/HR-SIP.log'
# !$exe
# %pushnote HRSIP rerun complete
In [203]:
pipeline_me = find_missing_files('*/*/*/OTU_abs1e9.txt')
In [231]:
HRSIP_me = find_missing_files('*/*/*/OTU_abs1e9_PCR_sub_filt_DESeq2')
In [238]:
def run_HRSIP(dirpath):
cmd = 'cd {}; {} 2> {}'.format(dirpath, './SIPSimRun_HRSIP.sh', 'HR-SIP.log')
!$cmd
for FL in HRSIP_me.values():
for D in FL:
print 'Processing: {}'.format(D)
run_HRSIP(D)
In [239]:
HRSIP_me = find_missing_files('*/*/*/OTU_abs1e9_PCR_sub_filt_DESeq2')
In [212]:
qSIP_me = find_missing_files('*/*/*/*qSIP_atom.txt')
In [ ]:
def run_qSIP(dirpath):
#cmd = os.path.join(dirpath, 'SIPSimRun_qSIP.sh')
#log = os.path.join(dirpath, 'qSIP.log')
cmd = 'cd {}; perl -pi -e "s/--np 10/--np 20/" {}'.format(dirpath, 'SIPSimRun_qSIP.sh')
!$cmd
cmd = 'cd {}; {} 2> {}'.format(dirpath, './SIPSimRun_qSIP.sh', 'qSIP.log')
!$cmd
for FL in qSIP_me.values():
for D in FL:
print 'Processing: {}'.format(D)
run_qSIP(D)
In [ ]:
%pushnote qSIP rerun complete
In [228]:
cMtx_me = find_missing_files('*/*/*/qSIP-cMtx_overall.txt', empty_cut=200)
In [242]:
def run_cMtx(dirpath):
cmd = 'cd {}; {} 2> {}'.format(dirpath, './SIPSimRun_cMtx.sh', 'cMtx.log')
!$cmd
for FL in cMtx_me.values():
for D in FL:
print 'Processing: {}'.format(D)
run_cMtx(D)
In [243]:
cMtx_me = find_missing_files('*/*/*/qSIP-cMtx_overall.txt', empty_cut=200)
In [244]:
%pushnote cMtx rerun complete
In [ ]:
In [ ]: