Question: how is incorporator identification accuracy affected by the percent isotope incorporation of taxa?
Using genome dataset created in the "dataset" notebook
Simulates isotope dilution or short incubations
In [37]:
workDir = '/home/nick/notebook/SIPSim/dev/bac_genome1210/'
genomeDir = '/home/nick/notebook/SIPSim/dev/bac_genome1210/genomes/'
R_dir = '/home/nick/notebook/SIPSim/lib/R/'
In [38]:
import glob
from os.path import abspath
import nestly
In [39]:
%load_ext rpy2.ipython
In [40]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)
In [118]:
# building tree structure
nest = nestly.Nest()
## varying params
nest.add('percIncorp', range(0,101,20))
## set params
nest.add('np_many', [24], create_dir=False)
nest.add('np_few', [8], create_dir=False)
nest.add('percTaxa', [25], create_dir=False)
nest.add('abs', ['1e10'], create_dir=False)
nest.add('subsample', [20000], create_dir=False)
nest.add('BD_min', [1.71], create_dir=False)
nest.add('BD_max', [2], create_dir=False)
nest.add('padj', [0.1], create_dir=False)
nest.add('log2', [0.25], create_dir=False)
nest.add('topTaxaToPlot', [100], create_dir=False)
## input/output files
nest.add('fileName', ['ampFrags'], create_dir=False)
nest.add('genome_index', [os.path.join(genomeDir, 'genome_index.txt')], create_dir=False)
nest.add('genome_dir', [genomeDir], create_dir=False)
nest.add('primers', [os.path.join(workDir, '../', '515F-806R.fna')], create_dir=False)
nest.add('R_dir', [R_dir], create_dir=False)
# building directory tree
buildDir = os.path.join(workDir, 'percIncorpUnif')
nest.build(buildDir)
In [119]:
bashFile = os.path.join(workDir, 'SIPSimRun.sh')
In [120]:
%%writefile $bashFile
#!/bin/bash
# simulating fragments
SIPSim fragments \
{genome_index} \
--fp {genome_dir} \
--fr {primers} \
--fld skewed-normal,9000,2500,-5 \
--flr None,None \
--nf 10000 \
--np {np_many} \
2> {fileName}.log \
> {fileName}.pkl
# converting to kde object
SIPSim fragment_kde \
{fileName}.pkl \
> {fileName}_kde.pkl
# adding diffusion
SIPSim diffusion \
{fileName}_kde.pkl \
--np {np_many} \
> {fileName}_kde_dif.pkl
# creating a community file
SIPSim gradientComms \
{genome_index} \
--n_comm 2 \
> comm.txt
# making incorp file
SIPSim incorpConfigExample \
--percTaxa {percTaxa} \
--percIncorpUnif {percIncorp} \
> {percTaxa}_{percIncorp}.config
# adding isotope incorporation to BD distribution
SIPSim isoIncorp \
{fileName}_kde_dif.pkl \
{percTaxa}_{percIncorp}.config \
--comm comm.txt \
--np {np_many} \
> {fileName}_kde_dif_incorp.pkl
# calculating BD shift from isotope incorporation
SIPSim BD_shift \
{fileName}_kde_dif.pkl \
{fileName}_kde_dif_incorp.pkl \
--np {np_few} \
> {fileName}_kde_dif_incorp_BD-shift.txt
# simulating gradient fractions
SIPSim fractions \
comm.txt \
> fracs.txt
# simulating an OTU table
SIPSim OTU_table \
{fileName}_kde_dif_incorp.pkl \
comm.txt \
fracs.txt \
--abs {abs} \
--np {np_few} \
> OTU_abs{abs}.txt
# subsampling from the OTU table (simulating sequencing of the DNA pool)
SIPSim OTU_subsample \
--dist_params low:{subsample},high:{subsample} \
OTU_abs{abs}.txt \
> OTU_n2_abs{abs}_sub{subsample}.txt
# making a wide table
SIPSim OTU_wideLong -w \
OTU_n2_abs{abs}_sub{subsample}.txt \
> OTU_n2_abs{abs}_sub{subsample}_w.txt
# making metadata (phyloseq: sample_data)
SIPSim OTU_sampleData \
OTU_n2_abs{abs}_sub{subsample}.txt \
> OTU_n2_abs{abs}_sub{subsample}_meta.txt
#-- R analysis --#
export PATH={R_dir}:$PATH
# plotting taxon abundances
OTU_taxonAbund.r \
OTU_n2_abs{abs}_sub{subsample}.txt \
-r {topTaxaToPlot} \
-o OTU_n2_abs{abs}_sub{subsample}
# running DeSeq2 and making confusion matrix on predicting incorporators
## making phyloseq object from OTU table
phyloseq_make.r \
OTU_n2_abs{abs}_sub{subsample}_w.txt \
-s OTU_n2_abs{abs}_sub{subsample}_meta.txt \
> OTU_n2_abs{abs}_sub{subsample}.physeq
## filtering phyloseq object to just taxa/samples of interest
phyloseq_edit.r \
OTU_n2_abs{abs}_sub{subsample}.physeq \
--BD_min {BD_min} --BD_max {BD_max} \
> OTU_n2_abs{abs}_sub{subsample}_filt.physeq
## making ordination
phyloseq_ordination.r \
OTU_n2_abs{abs}_sub{subsample}_filt.physeq \
OTU_n2_abs{abs}_sub{subsample}_bray-NMDS.pdf
## DESeq2
phyloseq_DESeq2.r \
OTU_n2_abs{abs}_sub{subsample}_filt.physeq \
> OTU_n2_abs{abs}_sub{subsample}_DESeq2
## Confusion matrix
DESeq2_confuseMtx.r \
{fileName}_kde_dif_incorp_BD-shift.txt \
OTU_n2_abs{abs}_sub{subsample}_DESeq2 \
--padj {padj} --log2 {log2}
In [122]:
%%writefile $bashFile
#!/bin/bash
## just R analysis
#-- R analysis --#
export PATH={R_dir}:$PATH
# plotting taxon abundances
OTU_taxonAbund.r \
OTU_n2_abs{abs}_sub{subsample}.txt \
-r {topTaxaToPlot} \
-o OTU_n2_abs{abs}_sub{subsample}
# running DeSeq2 and making confusion matrix on predicting incorporators
## making phyloseq object from OTU table
phyloseq_make.r \
OTU_n2_abs{abs}_sub{subsample}_w.txt \
-s OTU_n2_abs{abs}_sub{subsample}_meta.txt \
> OTU_n2_abs{abs}_sub{subsample}.physeq
## filtering phyloseq object to just taxa/samples of interest
phyloseq_edit.r \
OTU_n2_abs{abs}_sub{subsample}.physeq \
--BD_min {BD_min} --BD_max {BD_max} \
> OTU_n2_abs{abs}_sub{subsample}_filt.physeq
## making ordination
phyloseq_ordination.r \
OTU_n2_abs{abs}_sub{subsample}_filt.physeq \
OTU_n2_abs{abs}_sub{subsample}_bray-NMDS.pdf
## DESeq2
phyloseq_DESeq2.r \
OTU_n2_abs{abs}_sub{subsample}_filt.physeq \
--log2 {log2} \
> OTU_n2_abs{abs}_sub{subsample}_DESeq2
## Confusion matrix
DESeq2_confuseMtx.r \
{fileName}_kde_dif_incorp_BD-shift.txt \
OTU_n2_abs{abs}_sub{subsample}_DESeq2 \
--padjBH {padj}
In [123]:
!chmod 775 $bashFile
In [124]:
!cd $workDir; \
nestrun -j 6 --template-file $bashFile -d percIncorpUnif --log-file logR.txt
In [125]:
# aggregating confusion matrix data
## table
!cd $workDir; \
nestagg delim \
-d percIncorpUnif \
-k percIncorp \
-o ./percIncorpUnif/DESeq2-cMtx_table.csv \
DESeq2-cMtx_table.csv
## overall
!cd $workDir; \
nestagg delim \
-d percIncorpUnif \
-k percIncorp \
-o ./percIncorpUnif/DESeq2-cMtx_overall.csv \
DESeq2-cMtx_overall.csv
## byClass
!cd $workDir; \
nestagg delim \
-d percIncorpUnif \
-k percIncorp \
-o ./percIncorpUnif/DESeq2-cMtx_byClass.csv \
DESeq2-cMtx_byClass.csv
In [127]:
%%R -i workDir -w 600 -h 600
setwd(workDir)
byClass = read.csv('./percIncorpUnif/DESeq2-cMtx_byClass.csv')
byClass$byClass[is.na(byClass$byClass)] = 0
byClass$percIncorp = factor(byClass$percIncorp, levels=as.character(unique(sort(byClass$percIncorp))))
p = ggplot(byClass, aes(X, byClass, color=percIncorp)) +
geom_point() +
labs(y='value') +
theme_bw() +
theme(
text=element_text(size=16),
axis.text.x=element_text(angle=90, hjust=1, vjust=0.5),
axis.title.x=element_blank()
)
p
In [130]:
%%R -w 600 -h 400
p + scale_y_continuous(limits=c(0,0.15))
In [243]:
%%R -i workDir
setwd(workDir)
tbl.c = read.csv('percIncorpUnif/50/DESeq2-cMtx_data.csv')
tbl.c.f = tbl.c %>%
filter(incorp.known==TRUE & incorp.pred==FALSE) %>%
select(taxon)
write.table(tbl.c.f, file='percIncorpUnif/50/DESeq2-cMtx_FN', row.names=F, col.names=F, quote=F)
In [250]:
!cd $workDir'percIncorpUnif/50/'; \
$R_dir/OTU_taxonAbund.r OTU_n2_abs1e8_sub30000.txt -t DESeq2-cMtx_FN -l -o DESeq2-cMtx_FN --width 14
In [252]:
%%R -i workDir
setwd(workDir)
tbl.c = read.csv('percIncorpUnif/50/DESeq2-cMtx_data.csv')
tbl.c.f = tbl.c %>%
filter(incorp.known==FALSE & incorp.pred==TRUE) %>%
select(taxon)
write.table(tbl.c.f, file='percIncorpUnif/50/DESeq2-cMtx_FP', row.names=F, col.names=F, quote=F)
In [253]:
!cd $workDir'percIncorpUnif/50/'; \
$R_dir/OTU_taxonAbund.r OTU_n2_abs1e8_sub30000.txt -t DESeq2-cMtx_FP -l -o DESeq2-cMtx_FP --width 14
In [256]:
%%R
tbl.c %>% filter(taxon %in% tbl.c.f$taxon)
In [200]:
%%R -i workDir
setwd(workDir)
tbl.c = read.csv('percIncorpUnif/100/DESeq2-cMtx_data.csv')
tbl.otu = read.delim('percIncorpUnif/100/OTU_n2_abs1e10_sub20000.txt', sep='\t')
In [201]:
%%R
# OTU total counts
tbl.otu.sum = tbl.otu %>%
group_by(library, taxon) %>%
summarize(total_count = sum(count))
tbl.otu.sum %>% head
In [202]:
%%R
#
label.tp.fn = function(known, pred){
if(is.na(known) | is.na(pred)){
return(NA)
} else
if(known==TRUE & pred==TRUE){
return('TP')
} else
if(known==TRUE & pred==FALSE){
return('FN')
} else {
return(NA)
}
}
tbl.c.tp.fn = tbl.c %>%
mutate(tp.fn = mapply(label.tp.fn, incorp.known, incorp.pred)) %>%
filter(! is.na(tp.fn))
tbl.tp.fn = inner_join(tbl.c.tp.fn, tbl.otu.sum, c('taxon' = 'taxon'))
tbl.tp.fn %>% head
In [203]:
%%R
# how many TP & FN?
tbl.tp.fn %>%
group_by(library, tp.fn) %>%
summarize(n = n())
In [204]:
%%R -h 350
tbl.tp.fn$library = as.character(tbl.tp.fn$library)
ggplot(tbl.tp.fn, aes(library, total_count, color=tp.fn)) +
geom_boxplot() +
labs(y='Total count') +
theme(
text = element_text(size=18)
)
In [209]:
%%R -h 700 -w 900
tbl.tp.fn$library = as.character(tbl.tp.fn$library)
ggplot(tbl.tp.fn, aes(taxon, total_count, group=taxon, color=library)) +
geom_point(size=3) +
geom_line() +
scale_y_log10() +
labs(y='Total count') +
facet_grid(. ~ tp.fn, scales='free_x') +
theme(
text = element_text(size=18),
axis.text.x = element_text(angle=90, hjust=1)
)
Notes:
In [210]:
%%R
# OTU total counts
heavy.cut = 1.71
tbl.otu.sum = tbl.otu %>%
filter(! grepl('inf', fraction)) %>%
separate(fraction, into = c('BD_min','BD_max'), sep='-', convert=TRUE) %>%
filter(BD_min >= heavy.cut & BD_max <= 2) %>%
group_by(library, taxon) %>%
summarize(total_count = sum(count))
tbl.otu.sum %>% head
In [211]:
%%R
tbl.tp.fn = inner_join(tbl.c.tp.fn, tbl.otu.sum, c('taxon' = 'taxon'))
tbl.tp.fn %>% head
In [212]:
%%R -h 350
tbl.tp.fn$library = as.character(tbl.tp.fn$library)
ggplot(tbl.tp.fn, aes(library, total_count, color=tp.fn)) +
geom_boxplot() +
labs(y='Total count') +
theme(
text = element_text(size=18)
)
In [215]:
%%R -h 700 -w 900
tbl.tp.fn$library = as.character(tbl.tp.fn$library)
ggplot(tbl.tp.fn, aes(taxon, total_count, group=taxon, color=library)) +
geom_point(size=3) +
geom_line() +
scale_y_log10() +
labs(y='Total count') +
facet_grid(. ~ tp.fn, scales='free_x') +
theme(
text = element_text(size=18),
axis.text.x = element_text(angle=90, hjust=1)
)
Notes:
In [73]:
# building tree structure
from os.path import abspath
nest = nestly.Nest()
## values
vals = [str(x) for x in range(1,5)]
nest.add('vals', vals)
## input files
nest.add('--np', [1], create_dir=False)
buildDir = '/home/nick/t/nestly/' #os.path.join(workDir, 'vals')
nest.build(buildDir)
In [74]:
%%writefile /home/nick/t/example.sh
#!/bin/bash
export TIME='elapsed,maxmem,exitstatus\n%e,%M,%x'
echo {--np} > {--np}_test.txt
In [75]:
!cd /home/nick/t/; \
chmod 777 example.sh
In [76]:
!cd /home/nick/t/; \
nestrun -j 2 --template-file example.sh -d nestly
In [ ]:
In [ ]: