Question: how is incorporator identification accuracy affected by the percent isotope incorporation of taxa?
Using genome dataset created in the "dataset" notebook
Simulates isotope dilution or short incubations
In [9]:
workDir = '/home/nick/notebook/SIPSim/dev/bac_genome1210/'
buildDir = os.path.join(workDir, 'percIncorpUnif')
genomeDir = '/home/nick/notebook/SIPSim/dev/bac_genome1210/genomes/'
R_dir = '/home/nick/notebook/SIPSim/lib/R/'
In [4]:
import glob
from os.path import abspath
import nestly
from IPython.display import Image, display
In [5]:
%load_ext rpy2.ipython
In [6]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)
In [7]:
if not os.path.isdir(buildDir):
os.makedirs(buildDir)
In [169]:
!cd $buildDir; \
SIPSim fragments \
$genomeDir/genome_index.txt \
--fp $genomeDir \
--fr ../../515F-806R.fna \
--fld skewed-normal,9000,2500,-5 \
--flr None,None \
--nf 10000 \
--np 24 \
2> ampFrags.log \
> ampFrags.pkl
In [170]:
!cd $buildDir; \
SIPSim fragment_kde \
ampFrags.pkl \
> ampFrags_kde.pkl
In [172]:
!cd $buildDir; \
SIPSim diffusion \
ampFrags_kde.pkl \
--np 24 \
> ampFrags_kde_dif.pkl
In [173]:
!cd $buildDir; \
SIPSim communities \
$genomeDir/genome_index.txt \
--n_comm 2 \
> comm.txt
In [192]:
# building tree structure
nest = nestly.Nest()
## varying params
nest.add('percIncorp', range(0,101,20))
## set params
nest.add('np_many', [24], create_dir=False)
nest.add('np_few', [8], create_dir=False)
nest.add('percTaxa', [25], create_dir=False)
nest.add('abs', ['1e10'], create_dir=False)
#nest.add('subsample', [20000], create_dir=False)
nest.add('subsample_mean', [30000], create_dir=False)
nest.add('subsample_scale', [5000], create_dir=False)
nest.add('BD_min', [1.71], create_dir=False)
nest.add('BD_max', [1.78], create_dir=False)
nest.add('padj', [0.1], create_dir=False)
nest.add('log2', [0.25], create_dir=False)
nest.add('topTaxaToPlot', [100], create_dir=False)
## input/output files
nest.add('buildDir', [buildDir], create_dir=False)
nest.add('frag_file', ['ampFrags_kde_dif'], create_dir=False)
nest.add('comm_file', ['comm.txt'], create_dir=False)
nest.add('genome_index', [os.path.join(genomeDir, 'genome_index.txt')], create_dir=False)
nest.add('genome_dir', [genomeDir], create_dir=False)
nest.add('primers', [os.path.join(workDir, '../', '515F-806R.fna')], create_dir=False)
nest.add('R_dir', [R_dir], create_dir=False)
# building directory tree
nest.build(buildDir)
In [193]:
bashFile = os.path.join(workDir, 'SIPSimRun.sh')
In [194]:
%%writefile $bashFile
#!/bin/bash
# symlinking input files
ln -s {buildDir}/{frag_file}.pkl {frag_file}.pkl
ln -s {buildDir}/{comm_file} {comm_file}
# making incorp file
SIPSim incorpConfigExample \
--percTaxa {percTaxa} \
--percIncorpUnif {percIncorp} \
> {percTaxa}_{percIncorp}.config
# adding isotope incorporation to BD distribution
SIPSim isotope_incorp \
{frag_file}.pkl \
{percTaxa}_{percIncorp}.config \
--comm {comm_file} \
--np {np_many} \
> {frag_file}_incorp.pkl
# calculating BD shift from isotope incorporation
SIPSim BD_shift \
{frag_file}.pkl \
{frag_file}_incorp.pkl \
--np {np_few} \
> {frag_file}_incorp_BD-shift.txt
# simulating gradient fractions
SIPSim gradient_fractions \
{comm_file} \
> fracs.txt
# simulating an OTU table
SIPSim OTU_table \
{frag_file}_incorp.pkl \
{comm_file} \
fracs.txt \
--abs {abs} \
--np {np_few} \
> OTU_abs{abs}.txt
# subsampling from the OTU table (simulating sequencing of the DNA pool)
SIPSim OTU_subsample \
--dist normal \
--dist_params loc:{subsample_mean},scale:{subsample_scale} \
OTU_abs{abs}.txt \
> OTU_n2_abs{abs}_sub-norm.txt
# making a wide table
SIPSim OTU_wideLong -w \
OTU_n2_abs{abs}_sub-norm.txt \
> OTU_n2_abs{abs}_sub-norm_w.txt
# making metadata (phyloseq: sample_data)
SIPSim OTU_sampleData \
OTU_n2_abs{abs}_sub-norm.txt \
> OTU_n2_abs{abs}_sub-norm_meta.txt
In [195]:
!chmod 775 $bashFile
In [ ]:
!cd $workDir; \
nestrun -j 1 --template-file $bashFile -d percIncorpUnif --log-file log.txt
In [ ]:
%%writefile $bashFile
#!/bin/bash
#-- R analysis --#
export PATH={R_dir}:$PATH
# plotting taxon abundances
OTU_taxonAbund.r \
OTU_n2_abs{abs}.txt \
-r {topTaxaToPlot} \
-o OTU_n2_abs{abs}
# plotting taxon abundances
OTU_taxonAbund.r \
OTU_n2_abs{abs}_sub-norm.txt \
-r {topTaxaToPlot} \
-o OTU_n2_abs{abs}_subsub-norm
# running DeSeq2 and making confusion matrix on predicting incorporators
## making phyloseq object from OTU table
phyloseq_make.r \
OTU_n2_abs{abs}_sub-norm_w.txt \
-s OTU_n2_abs{abs}_sub-norm_meta.txt \
> OTU_n2_abs{abs}_sub-norm.physeq
## filtering phyloseq object to just taxa/samples of interest
phyloseq_edit.r \
OTU_n2_abs{abs}_sub-norm.physeq \
--BD_min {BD_min} --BD_max {BD_max} \
> OTU_n2_abs{abs}_sub-norm_filt.physeq
## making ordination
phyloseq_ordination.r \
OTU_n2_abs{abs}_sub-norm_filt.physeq \
OTU_n2_abs{abs}_sub-norm_bray-NMDS.pdf
## DESeq2
phyloseq_DESeq2.r \
OTU_n2_abs{abs}_sub-norm_filt.physeq \
--log2 {log2} \
--hypo greater \
> OTU_n2_abs{abs}_sub-norm_DESeq2
## Confusion matrix
DESeq2_confuseMtx.r \
{frag_file}_incorp_BD-shift.txt \
OTU_n2_abs{abs}_sub-norm_DESeq2 \
--padj {padj}
In [219]:
!chmod 775 $bashFile
In [220]:
!cd $workDir; \
nestrun -j 20 --template-file $bashFile -d percIncorpUnif --log-file log.txt
In [ ]:
# aggregating confusion matrix data
## table
!cd $workDir; \
nestagg delim \
-d percIncorpUnif \
-k percIncorp \
-o ./percIncorpUnif/DESeq2-cMtx_table.csv \
DESeq2-cMtx_table.csv
## overall
!cd $workDir; \
nestagg delim \
-d percIncorpUnif \
-k percIncorp \
-o ./percIncorpUnif/DESeq2-cMtx_overall.csv \
DESeq2-cMtx_overall.csv
## byClass
!cd $workDir; \
nestagg delim \
-d percIncorpUnif \
-k percIncorp \
-o ./percIncorpUnif/DESeq2-cMtx_byClass.csv \
DESeq2-cMtx_byClass.csv
In [222]:
x = os.path.join(workDir, 'percIncorpUnif', '*', 'OTU*abund.pdf')
abundFiles = glob.glob(x)
for x in abundFiles:
base = os.path.splitext(x)[0]
!cd $buildDir; convert $base\.pdf $base\.png
In [223]:
x = os.path.join(workDir, 'percIncorpUnif', '*', 'OTU_abs*abund.png')
abundFiles = glob.glob(x)
for x in sorted(abundFiles):
print x
img = Image(x)
display(img)
In [208]:
x = os.path.join(workDir, 'percIncorpUnif', '*', 'OTU*_sub*abund.png')
abundFiles = glob.glob(x)
for x in sorted(abundFiles):
print x
img = Image(x)
display(img)
In [224]:
%%R -i workDir -w 600 -h 400
setwd(workDir)
byClass = read.csv('./percIncorpUnif/DESeq2-cMtx_byClass.csv')
byClass$byClass[is.na(byClass$byClass)] = 0
byClass$percIncorp = factor(byClass$percIncorp,
levels=as.character(unique(sort(byClass$percIncorp))))
p = ggplot(byClass, aes(X, byClass, fill=percIncorp)) +
geom_bar(stat='identity', position='dodge', color='black') +
labs(y='value') +
theme_bw() +
theme(
text=element_text(size=16),
axis.text.x=element_text(angle=90, hjust=1, vjust=0.5),
axis.title.x=element_blank()
)
p
Conclusions:
In [12]:
%%R -i workDir
setwd(workDir)
tbl.c = read.csv('percIncorpUnif/100/DESeq2-cMtx_data.csv')
tbl.otu = read.delim('percIncorpUnif/100/OTU_n2_abs1e10_sub-norm.txt', sep='\t')
In [13]:
%%R
# OTU total counts
tbl.otu.sum = tbl.otu %>%
group_by(library, taxon) %>%
summarize(total_count = sum(count))
tbl.otu.sum %>% head
In [14]:
%%R
#
label.tp.fn = function(known, pred){
if(is.na(known) | is.na(pred)){
return(NA)
} else
if(known==TRUE & pred==TRUE){
return('TP')
} else
if(known==TRUE & pred==FALSE){
return('FN')
} else {
return(NA)
}
}
tbl.c.tp.fn = tbl.c %>%
mutate(tp.fn = mapply(label.tp.fn, incorp.known, incorp.pred)) %>%
filter(! is.na(tp.fn))
tbl.tp.fn = inner_join(tbl.c.tp.fn, tbl.otu.sum, c('taxon' = 'taxon'))
tbl.tp.fn %>% head
In [15]:
%%R
# how many TP & FN?
tbl.tp.fn %>%
group_by(library, tp.fn) %>%
summarize(n = n())
In [229]:
%%R -h 350
tbl.tp.fn$library = as.character(tbl.tp.fn$library)
ggplot(tbl.tp.fn, aes(library, total_count, color=tp.fn)) +
geom_boxplot() +
labs(y='Total count') +
theme(
text = element_text(size=18)
)
In [230]:
%%R -h 700 -w 900
tbl.tp.fn$library = as.character(tbl.tp.fn$library)
ggplot(tbl.tp.fn, aes(taxon, total_count, group=taxon, color=library)) +
geom_point(size=3) +
geom_line() +
scale_y_log10() +
labs(y='Total count') +
facet_grid(. ~ tp.fn, scales='free_x') +
theme(
text = element_text(size=18),
axis.text.x = element_text(angle=90, hjust=1)
)
Notes:
In [235]:
%%R
# OTU total counts
BD_min.cut = 1.71
BD_max.cut = 1.78
tbl.otu.sum = tbl.otu %>%
filter(! grepl('inf', fraction)) %>%
separate(fraction, into = c('BD_min','BD_max'), sep='-', convert=TRUE) %>%
filter(BD_min >= BD_min.cut & BD_max <= BD_max.cut) %>%
group_by(library, taxon) %>%
summarize(total_count = sum(count))
tbl.otu.sum %>% head
In [236]:
%%R
tbl.tp.fn = inner_join(tbl.c.tp.fn, tbl.otu.sum, c('taxon' = 'taxon'))
tbl.tp.fn %>% head
In [237]:
%%R -h 350
tbl.tp.fn$library = as.character(tbl.tp.fn$library)
ggplot(tbl.tp.fn, aes(library, total_count, color=tp.fn)) +
geom_boxplot() +
labs(y='Total count') +
theme(
text = element_text(size=18)
)
In [238]:
%%R -h 700 -w 900
tbl.tp.fn$library = as.character(tbl.tp.fn$library)
ggplot(tbl.tp.fn, aes(taxon, total_count, group=taxon, color=library)) +
geom_point(size=3) +
geom_line() +
scale_y_log10() +
labs(y='Total count') +
facet_grid(. ~ tp.fn, scales='free_x') +
theme(
text = element_text(size=18),
axis.text.x = element_text(angle=90, hjust=1)
)
Notes:
In [16]:
%%R -i workDir
setwd(workDir)
tbl.f = tbl.tp.fn %>%
filter(tp.fn == 'TP') %>%
select(taxon) %>%
distinct()
#write.csv(tbl.f, 'percIncorpUnif/100/TP.txt', sep='\t', row.names=F, col.names=F, quote=F)
tbl.f %>% head
In [17]:
%%R
tbl.otu = read.delim('percIncorpUnif/100/OTU_n2_abs1e10_sub-norm.txt', sep='\t')
tbl.otu.f = tbl.otu %>%
filter(taxon %in% tbl.f$taxon)
tbl.otu.f %>% nrow %>% print
tbl.otu.f %>% head
In [18]:
%%R
## BD min/max/mid
tbl.otu.f = tbl.otu.f %>%
mutate(fraction = gsub('^-inf','negInf',fraction)) %>%
separate(fraction, into = c('BD_min','BD_max'), sep='-',
convert=TRUE, remove=FALSE) %>%
mutate(BD_min = as.numeric(gsub('negInf','-inf',BD_min)),
BD_max = as.numeric(gsub('negInf','-inf',BD_max)))
calc_BD_mid = function(BD_min, BD_max){
if(is.infinite(BD_min)){
return(BD_max)
} else
if(is.infinite(BD_max)){
return(BD_min)
} else {
return((BD_max - BD_min) / 2 + BD_min)
}
stop('LOGIC error')
}
tbl.otu.f$BD_mid = mapply(calc_BD_mid,tbl.otu.f$BD_min, tbl.otu.f$BD_max)
calc_BD_width = function(BD_min, BD_max){
if(is.infinite(BD_min)){
return(0.001)
} else
if(is.infinite(BD_max)){
return(0.001)
} else {
return(BD_max - BD_min)
}
stop('LOGIC error')
}
tbl.otu.f$BD_width = mapply(calc_BD_width,tbl.otu.f$BD_min, tbl.otu.f$BD_max)
tbl.otu.f %>% head
In [19]:
%%R
tbl.s = tbl.otu.f %>%
group_by(library, BD_mid) %>%
summarize(total_sample_count = sum(count))
tbl.otu.f = inner_join(tbl.otu.f, tbl.s, c('library'='library','BD_mid'='BD_mid'))
tbl.otu.f = tbl.otu.f %>%
mutate(rel_count = count / total_sample_count)
tbl.otu.f %>% head
In [24]:
%%R -w 700 -h 400
make_frac_plot = function(tbl, BD.GCp0, BD.GCp100, rel=FALSE, legend=FALSE){
p = ggplot(tbl, aes(x=BD_mid, fill=taxon))
if(rel==TRUE){
p = p + geom_area(aes(y=rel_count), stat='identity', alpha=1, position='stack') +
labs(y='Relative abundance')
} else {
p = p + geom_area(aes(y=count), stat='identity', alpha=0.5, position='dodge') +
labs(y='Absolute abundance')
}
p = p + geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
labs(x='Buoyant density') +
scale_x_continuous(expand=c(0.01,0)) +
scale_y_continuous(expand=c(0,0.01)) +
facet_grid(library ~ .) +
theme_bw()
if(legend==TRUE){
p = p + theme(
text = element_text(size=16)
)
} else {
p = p + theme(
text = element_text(size=16),
legend.position = 'none'
)
}
return(p)
}
BD.GCp0 = 0 * 0.098 + 1.66
BD.GCp100 = 1 * 0.098 + 1.66
p.dodge = make_frac_plot(tbl.otu.f, BD.GCp0, BD.GCp100)
p.fill = make_frac_plot(tbl.otu.f, BD.GCp0, BD.GCp100, rel=TRUE)
print(p.dodge)
print(p.fill)
In [ ]:
In [ ]:
In [73]:
# building tree structure
from os.path import abspath
nest = nestly.Nest()
## values
vals = [str(x) for x in range(1,5)]
nest.add('vals', vals)
## input files
nest.add('--np', [1], create_dir=False)
buildDir = '/home/nick/t/nestly/' #os.path.join(workDir, 'vals')
nest.build(buildDir)
In [74]:
%%writefile /home/nick/t/example.sh
#!/bin/bash
export TIME='elapsed,maxmem,exitstatus\n%e,%M,%x'
echo {--np} > {--np}_test.txt
In [75]:
!cd /home/nick/t/; \
chmod 777 example.sh
In [76]:
!cd /home/nick/t/; \
nestrun -j 2 --template-file example.sh -d nestly
In [ ]:
In [ ]: