In [5]:
workDir = '/home/nick/notebook/SIPSim/dev/bac_genome1210/validation/'
genomeDir = '/home/nick/notebook/SIPSim/dev/bac_genome1210/genomes/'
R_dir = '/home/nick/notebook/SIPSim/lib/R/'
figureDir = '/home/nick/notebook/SIPSim/figures/'
In [6]:
import glob
from os.path import abspath
import nestly
from IPython.display import Image
In [7]:
import os
%load_ext rpy2.ipython
In [8]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)
In [9]:
if not os.path.isdir(workDir):
os.makedirs(workDir)
In [30]:
!cd $workDir; \
SIPSim fragments \
$genomeDir/genome_index.txt \
--fp $genomeDir \
--fr ../../515F-806R.fna \
--fld skewed-normal,9000,2500,-5 \
--flr None,None \
--nf 10000 \
--np 24 \
2> ampFrags.log \
> ampFrags.pkl
In [43]:
!cd $workDir; \
grep "Number of amplicons: " ampFrags.log | \
perl -pe 's/.+ +//' | hist
In [45]:
!cd $workDir; \
SIPSim fragment_KDE \
ampFrags.pkl \
> ampFrags_kde.pkl
In [46]:
!cd $workDir; \
SIPSim KDE_info -s ampFrags_kde.pkl \
> ampFrags_kde_info.txt
In [65]:
%%R -i workDir -w 600 -h 300
# loading
inFile = file.path(workDir, 'ampFrags_kde_info.txt')
df = read.delim(inFile, sep='\t')
df.kde1 = df %>%
filter(KDE_ID == 1)
df.kde1 %>% head(n=3)
BD_GC50 = 0.098 * 0.5 + 1.66
In [70]:
%%R -w 600 -h 300
# plotting
p.amp = ggplot(df.kde1, aes(median)) +
geom_histogram(binwidth=0.001) +
geom_vline(xintercept=BD_GC50, linetype='dashed', color='red', alpha=0.7) +
labs(x='Median buoyant density') +
theme_bw() +
theme(
text = element_text(size=16)
)
p.amp
In [60]:
!cd $workDir; \
SIPSim fragments \
$genomeDir/genome_index.txt \
--fp $genomeDir \
--fld skewed-normal,9000,2500,-5 \
--flr None,None \
--nf 10000 \
--np 24 \
> shotFrags.pkl \
2> shotFrags.log
In [61]:
!cd $workDir; \
SIPSim fragment_KDE \
shotFrags.pkl \
> shotFrags_kde.pkl
In [62]:
!cd $workDir; \
SIPSim KDE_info -s shotFrags_kde.pkl \
> shotFrags_kde_info.txt
In [67]:
%%R -i workDir -w 600 -h 300
# loading
inFile = file.path(workDir, 'shotFrags_kde_info.txt')
df = read.delim(inFile, sep='\t')
df.kde1 = df %>%
filter(KDE_ID == 1)
df.kde1 %>% head(n=3)
BD_GC50 = 0.098 * 0.5 + 1.66
In [71]:
%%R -w 600 -h 300
# plotting
p.shot = ggplot(df.kde1, aes(median)) +
geom_histogram(binwidth=0.001) +
geom_vline(xintercept=BD_GC50, linetype='dashed', color='red', alpha=0.7) +
labs(x='Median buoyant density') +
theme_bw() +
theme(
text = element_text(size=16)
)
p.shot
In [69]:
%%R
grid.arrange(p.amp, p.shot, ncol=1)
In [73]:
!cd $workDir; \
SIPSim diffusion \
ampFrags_kde.pkl \
--np 24 \
> ampFrags_kde_dif.pkl \
2> ampFrags_kde_dif.log
In [378]:
!cd $workDir; \
SIPSim DBL \
ampFrags_kde_dif.pkl \
--np 24 \
> ampFrags_kde_dif_DBL.pkl \
2> ampFrags_kde_dif_DBL.log
# checking output
!cd $workDir; \
tail ampFrags_kde_dif_DBL.log
In [400]:
# none
!cd $workDir; \
SIPSim KDE_info \
-s ampFrags_kde.pkl \
> ampFrags_kde_info.txt
# diffusion
!cd $workDir; \
SIPSim KDE_info \
-s ampFrags_kde_dif.pkl \
> ampFrags_kde_dif_info.txt
# diffusion + DBL
!cd $workDir; \
SIPSim KDE_info \
-s ampFrags_kde_dif_DBL.pkl \
> ampFrags_kde_dif_DBL_info.txt
In [405]:
%%R -i workDir
inFile = file.path(workDir, 'ampFrags_kde_info.txt')
df.raw = read.delim(inFile, sep='\t')
df.raw$stage = 'raw'
inFile = file.path(workDir, 'ampFrags_kde_dif_info.txt')
df.dif = read.delim(inFile, sep='\t')
df.dif$stage = 'diffusion'
inFile = file.path(workDir, 'ampFrags_kde_dif_DBL_info.txt')
df.DBL = read.delim(inFile, sep='\t')
df.DBL$stage = 'diffusion +\nDBL'
df = rbind(df.raw, df.dif, df.DBL)
df.dif = ''
df.DBL = ''
df %>% head(n=3)
In [406]:
%%R
df$stage = factor(df$stage, levels=c('raw', 'diffusion', 'diffusion +\nDBL'))
ggplot(df, aes(stage)) +
geom_boxplot(aes(y=min), color='red') +
geom_boxplot(aes(y=median), color='darkgreen') +
geom_boxplot(aes(y=max), color='blue') +
scale_y_continuous(limits=c(1.3, 2)) +
labs(y = 'Buoyant density') +
theme_bw() +
theme(
text = element_text(size=16),
axis.title.x = element_blank()
)
In [407]:
!cd $workDir; \
SIPSim communities \
$genomeDir/genome_index.txt \
--n_comm 2 \
> comm.txt
In [408]:
%%R -w 900 -i workDir
setwd(workDir)
tbl = read.delim('comm.txt', sep='\t')
tbl$library = as.character(tbl$library)
ggplot(tbl, aes(rank, rel_abund_perc, color=library, group=taxon_name)) +
geom_point() +
scale_y_log10() +
labs(x='Rank', y='Relative abundance (%)') +
theme_bw() +
theme(
text=element_text(size=16),
legend.position='none'
)
In [412]:
!cd $workDir; \
SIPSim incorpConfigExample \
--percTaxa 10 \
--percIncorpUnif 100 \
> PT10_PI100.config
# checking output
!cd $workDir; \
head PT10_PI100.config
In [543]:
!cd $workDir; \
SIPSim isotope_incorp \
ampFrags_kde_dif_DBL.pkl \
PT10_PI100.config \
--comm comm.txt \
--np 24 \
> ampFrags_kde_dif_DBL_incorp.pkl \
2> ampFrags_kde_dif_DBL_incorp.log
# checking lag
!cd $workDir; \
tail ampFrags_kde_dif_DBL_incorp.log
In [544]:
!cd $workDir; \
SIPSim BD_shift \
ampFrags_kde_dif_DBL.pkl \
ampFrags_kde_dif_DBL_incorp.pkl \
--np 30 \
> ampFrags_kde_dif_DBL_incorp_BD-shift.txt \
2> ampFrags_kde_dif_DBL_incorp_BD-shift.log
# checking log
!cd $workDir; \
tail ampFrags_kde_dif_DBL_incorp_BD-shift.log
In [545]:
%%R -i workDir -w 800 -h 300
inFile = file.path(workDir, 'ampFrags_kde_dif_DBL_incorp_BD-shift.txt')
tbl = read.csv(inFile, sep='\t')
tbl$lib2 = as.character(tbl$lib2)
ggplot(tbl, aes(BD_shift, fill=lib2)) +
geom_histogram(position='dodge', alpha=0.5, binwidth=0.01) +
theme_bw() +
theme(
text = element_text(size=16)
)
In [546]:
%%R
tbl.s = tbl %>%
mutate(incorporator = ifelse(BD_shift > 0.05, TRUE, FALSE)) %>%
mutate(incorporator = ifelse(is.na(incorporator), 'NA', incorporator)) %>%
group_by(lib2, incorporator) %>%
summarize(n_incorps = n())
ggplot(tbl.s, aes(incorporator, n_incorps)) +
geom_bar(stat='identity') +
labs(y = 'Count', title='Number of incorporators\n(according to BD shift)') +
facet_grid(lib2 ~ .) +
theme(
text = element_text(size=16)
)
In [547]:
!cd $workDir; \
SIPSim gradient_fractions \
comm.txt \
> fracs.txt
In [548]:
%%R -i workDir -w 600 -h 300
setwd(workDir)
tbl = read.delim('fracs.txt', sep='\t')
ggplot(tbl, aes(fraction, fraction_size)) +
geom_point() +
facet_grid(library ~ .) +
labs(y='fraction size') +
theme_bw() +
theme(
text=element_text(size=16)
)
In [549]:
%%R -w 500 -h 300
tbl$library = as.character(tbl$library)
ggplot(tbl, aes(library, fraction_size)) +
geom_boxplot() +
labs(y='fraction size') +
theme_bw() +
theme(
text=element_text(size=16)
)
In [550]:
!cd $workDir; \
SIPSim OTU_table \
ampFrags_kde_dif_DBL_incorp.pkl \
comm.txt \
fracs.txt \
--abs 1e9 \
--np 24 \
> OTU_n2_abs1e9.txt \
2> OTU_n2_abs1e9.log
# checking log
!cd $workDir; \
tail OTU_n2_abs1e9.log
In [981]:
%%R -i workDir
inFile = file.path(workDir, 'OTU_n2_abs1e9.txt')
# loading file
df = read.delim(inFile, sep='\t')
In [982]:
%%R
## BD for G+C of 0 or 100
BD.GCp0 = 0 * 0.098 + 1.66
BD.GCp100 = 1 * 0.098 + 1.66
In [983]:
%%R -w 800 -h 400
# plotting absolute abundances
df.s = df %>%
group_by(library, BD_mid) %>%
summarize(total_count = sum(count))
## plot
p = ggplot(df.s, aes(BD_mid, total_count)) +
geom_point() +
geom_area(stat='identity', alpha=0.3, position='dodge') +
geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
labs(x='Buoyant density', y='Total abundance') +
facet_grid(library ~ .) +
theme_bw() +
theme(
text = element_text(size=16)
)
p
In [984]:
%%R -w 800 -h 500
# plotting number of taxa at each BD
df.nt = df %>%
filter(count > 0) %>%
group_by(library, BD_mid) %>%
summarize(n_taxa = n())
## plot
p = ggplot(df.nt, aes(BD_mid, n_taxa)) +
geom_point() +
geom_area(stat='identity', alpha=0.3, position='dodge') +
#geom_histogram(stat='identity') +
geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
labs(x='Buoyant density', y='Number of taxa') +
facet_grid(library ~ .) +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
p
In [985]:
%%R -w 800 -h 400
# plotting relative abundances
## plot
p = ggplot(df, aes(BD_mid, count, fill=taxon)) +
geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
labs(x='Buoyant density', y='Absolute abundance') +
facet_grid(library ~ .) +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
p + geom_area(stat='identity', position='dodge', alpha=0.5)
In [986]:
%%R -w 800 -h 400
p +
geom_area(stat='identity', position='fill') +
labs(x='Buoyant density', y='Relative abundance')
In [987]:
%%R
shannon_index_long = function(df, abundance_col, ...){
# calculating shannon diversity index from a 'long' formated table
## community_col = name of column defining communities
## abundance_col = name of column defining taxon abundances
cmd = paste0(abundance_col, '/sum(', abundance_col, ')')
df.s = df %>%
group_by_(...) %>%
mutate_(REL_abundance = cmd) %>%
mutate(pi__ln_pi = REL_abundance * log(REL_abundance),
shannon = -sum(pi__ln_pi, na.rm=TRUE)) %>%
ungroup() %>%
dplyr::select(-REL_abundance, -pi__ln_pi) %>%
distinct_(...)
return(df.s)
}
In [990]:
%%R -w 800 -h 400
# calculating shannon
df.shan = shannon_index_long(df, 'count', 'library', 'fraction')
## plot
p = ggplot(df.shan, aes(BD_mid, shannon, color=library, group=library)) +
geom_point() +
geom_line() +
geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
labs(x='Buoyant density', y='Shannon index') +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
p
In [695]:
%%R -w 600 -h 300
max_BD_range = max(df$BD_mid) - min(df$BD_mid)
df.r = df %>%
filter(count > 0) %>%
group_by(taxon) %>%
summarize(mean_count = mean(count),
min_BD = min(BD_mid),
max_BD = max(BD_mid),
BD_range = max_BD - min_BD,
BD_range_perc = BD_range / max_BD_range * 100) %>%
ungroup()
# plotting
ggplot(df.r, aes(mean_count, BD_range_perc, group=taxon)) +
geom_point() +
scale_x_log10() +
labs(x='Pre-fractionation abundance', y='% of total BD range') +
#geom_vline(xintercept=0.001, linetype='dashed', alpha=0.5) +
theme_bw() +
theme(
text = element_text(size=16)
)
In [992]:
!cd $workDir; \
SIPSim OTU_PCR \
OTU_n2_abs1e9.txt \
--debug \
> OTU_n2_abs1e9_PCR.txt
In [996]:
%%R -w 500 -h 450
# loading file
F = file.path(workDir, 'OTU_n2_abs1e9_PCR.txt')
df.SIM = read.delim(F, sep='\t')
ggplot(df.SIM, aes(init_molarity, final_molarity)) +
geom_point() +
labs(x='Initial molarity', y='Final molarity') +
theme_bw() +
theme(
text = element_text(size=16)
)
In [1007]:
!cd $workDir; \
SIPSim OTU_subsample \
--dist normal \
--dist_params loc:30000,scale:5000 \
OTU_n2_abs1e9.txt \
> OTU_n2_abs1e9_subNorm.txt
In [1008]:
%%R -h 300
setwd(workDir)
df = read.csv('OTU_n2_abs1e9_subNorm.txt', sep='\t')
df.s = df %>%
group_by(library, fraction) %>%
summarize(total_count = sum(count)) %>%
ungroup() %>%
mutate(library = as.character(library))
ggplot(df.s, aes(library, total_count)) +
geom_boxplot() +
theme_bw() +
theme(
text = element_text(size=16)
)
In [1014]:
%%R -i workDir
setwd(workDir)
# loading file
df.abs = read.delim('OTU_n2_abs1e9.txt', sep='\t')
df.sub = read.delim('OTU_n2_abs1e9_subNorm.txt', sep='\t')
lib.reval = c('1' = 'control',
'2' = 'treatment')
df.abs = mutate(df.abs, library = plyr::revalue(as.character(library), lib.reval))
df.sub = mutate(df.sub, library = plyr::revalue(as.character(library), lib.reval))
In [1015]:
%%R -w 700 -h 800
# plotting absolute abundances
## plot
p = ggplot(df.abs, aes(BD_mid, count, fill=taxon)) +
geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
labs(x='Buoyant density') +
facet_grid(library ~ .) +
theme_bw() +
theme(
text = element_text(size=16),
axis.title.y = element_text(vjust=1),
axis.title.x = element_blank(),
legend.position = 'none',
plot.margin=unit(c(1,1,0.1,1), "cm")
)
p1 = p + geom_area(stat='identity', position='dodge', alpha=0.5) +
labs(y='Total community\n(absolute abundance)')
# plotting absolute abundances of subsampled
## plot
p = ggplot(df.sub, aes(BD_mid, count, fill=taxon)) +
geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
labs(x='Buoyant density') +
facet_grid(library ~ .) +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
p2 = p + geom_area(stat='identity', position='dodge', alpha=0.5) +
labs(y='Subsampled community\n(absolute abundance)') +
theme(
axis.title.y = element_text(vjust=1),
axis.title.x = element_blank(),
plot.margin=unit(c(0.1,1,0.1,1), "cm")
)
# plotting relative abundances of subsampled
p3 = p + geom_area(stat='identity', position='fill') +
geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
labs(y='Subsampled community\n(relative abundance)') +
theme(
axis.title.y = element_text(vjust=1),
plot.margin=unit(c(0.1,1,1,1.35), "cm")
)
# combining plots
grid.arrange(p1, p2, p3, ncol=1)
In [1016]:
%%R -i figureDir
# saving figure
outFile = paste(c(figureDir, 'abundDist_example.pdf'), collapse='/')
pdf(outFile, width=10.5, height=12)
grid.arrange(p1, p2, p3, ncol=1)
dev.off()
In [1003]:
!cd $workDir; \
SIPSim OTU_wideLong -w \
OTU_n2_abs1e9_PCR_subNorm.txt \
> OTU_n2_abs1e9_PCR_subNorm_w.txt
In [1004]:
!cd $workDir; \
SIPSim OTU_sampleData \
OTU_n2_abs1e9_PCR_subNorm.txt \
> OTU_n2_abs1e9_PCR_subNorm_meta.txt
In [567]:
%%bash -s $workDir
cd $1
export PATH=/home/nick/notebook/SIPSim/lib/R/:$PATH
# making phyloseq object from OTU table
phyloseq_make.r \
OTU_n2_abs1e9_subNorm_w.txt \
-s OTU_n2_abs1e9_subNorm_meta.txt \
> OTU_n2_abs1e9_subNorm.physeq
## making ordination
phyloseq_ordination.r \
OTU_n2_abs1e9_subNorm.physeq \
OTU_n2_abs1e9_subNorm_bray-NMDS.pdf
## filtering phyloseq object to just taxa/samples of interest
phyloseq_edit.r \
OTU_n2_abs1e9_subNorm.physeq \
--BD_min 1.71 --BD_max 1.75 --occur 0.25 \
> OTU_n2_abs1e9_subNorm_filt.physeq
## making ordination
phyloseq_ordination.r \
OTU_n2_abs1e9_subNorm_filt.physeq \
OTU_n2_abs1e9_subNorm_filt_bray-NMDS.pdf
convert OTU_n2_abs1e9_subNorm_bray-NMDS.pdf OTU_n2_abs1e9_subNorm_bray-NMDS.png
convert OTU_n2_abs1e9_subNorm_filt_bray-NMDS.pdf OTU_n2_abs1e9_subNorm_filt_bray-NMDS.png
In [568]:
os.chdir(workDir)
Image(filename='OTU_n2_abs1e9_subNorm_bray-NMDS.png')
Out[568]:
In [569]:
os.chdir(workDir)
Image(filename='OTU_n2_abs1e9_subNorm_filt_bray-NMDS.png')
Out[569]:
In [590]:
%%bash -s $workDir
cd $1
export PATH=/home/nick/notebook/SIPSim/lib/R/:$PATH
# Chuck's method
## DESeq2
phyloseq_DESeq2.r \
OTU_n2_abs1e9_subNorm_filt.physeq \
--log2 0.25 \
> OTU_n2_abs1e9_subNorm_DESeq2
## Confusion matrix
DESeq2_confuseMtx.r \
ampFrags_kde_dif_DBL_incorp_BD-shift.txt \
OTU_n2_abs1e9_subNorm_DESeq2 \
--padjBH 0.1
In [591]:
%%R -i workDir -w 600 -h 400
setwd(workDir)
byClass = read.csv('DESeq2-cMtx_byClass.csv')
ggplot(byClass, aes(X, byClass)) +
geom_point() +
labs(y='value') +
theme_bw() +
theme(
text=element_text(size=16),
axis.text.x=element_text(angle=90, hjust=1, vjust=0.5),
axis.title.x=element_blank()
)
In [592]:
%%bash -s $workDir
cd $1
export PATH=/home/nick/notebook/SIPSim/lib/R/:$PATH
# altHypothesis = 'greater'
## DESeq2
phyloseq_DESeq2.r \
OTU_n2_abs1e9_subNorm_filt.physeq \
--log2 0.25 \
--hypo greater \
> OTU_n2_abs1e9_subNorm_DESeq2
## Confusion matrix
DESeq2_confuseMtx.r \
ampFrags_kde_dif_DBL_incorp_BD-shift.txt \
OTU_n2_abs1e9_subNorm_DESeq2 \
--padj 0.1
In [593]:
%%R -i workDir -w 600 -h 400
setwd(workDir)
byClass = read.csv('DESeq2-cMtx_byClass.csv')
ggplot(byClass, aes(X, byClass)) +
geom_point() +
labs(y='value') +
theme_bw() +
theme(
text=element_text(size=16),
axis.text.x=element_text(angle=90, hjust=1, vjust=0.5),
axis.title.x=element_blank()
)
In [26]:
%%R
clsfy = function(guess,known){
if(is.na(guess) | is.na(known)){
return(NA)
}
if(guess == TRUE){
if(guess == known){
return('True positive')
} else {
return('False positive')
}
} else
if(guess == FALSE){
if(guess == known){
return('True negative')
} else {
return('False negative')
}
} else {
stop('Error: true or false needed')
}
}
In [1028]:
%%R -i workDir -w 1000 -h 450
setwd(workDir)
df = read.csv('DESeq2-cMtx_data.csv')
df = df %>%
filter(! is.na(log2FoldChange)) %>%
mutate(taxon = reorder(taxon, -log2FoldChange),
cls = mapply(clsfy, incorp.pred, incorp.known))
df %>% head(n=3)
In [1034]:
%%R -w 1000 -h 450
df.TN = df %>% filter(cls == 'True negative')
df.TP = df %>% filter(cls == 'True positive')
df.FP = df %>% filter(cls == 'False negative')
ggplot(df, aes(taxon, log2FoldChange, color=cls, alpha=0.7,
ymin=log2FoldChange - lfcSE, ymax=log2FoldChange + lfcSE)) +
geom_pointrange(size=0.6) +
geom_pointrange(data=df.TP, size=0.6, alpha=0.2) +
geom_pointrange(data=df.FP, size=0.6, alpha=0.2) +
labs(x = 'Taxon', y = 'Log2 fold change') +
theme_bw() +
theme(
text = element_text(size=16),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
legend.title=element_blank(),
axis.text.x = element_blank(),
legend.position = 'bottom'
)
In [597]:
%%R -i figureDir
outFile = paste(c(figureDir, 'l2fc_example.pdf'), collapse='/')
ggsave(outFile, width=10, height=4.5)
Notes:
Red circles = true positives
False positives should increase with taxon GC
In [114]:
%%R -i workDir
setwd(workDir)
df.ds = read.csv('DESeq2-cMtx_data.csv')
# loading file
df.otu = read.delim('OTU_n2_abs1e9_subNorm.txt', sep='\t') %>%
filter(BD_min >= 1.71, BD_max <= 1.75) %>%
group_by(library, taxon) %>%
mutate(min_rel_abund = min(rel_abund),
mean_rel_abund = mean(rel_abund)) %>%
ungroup() %>%
distinct(library, taxon)
df.j = inner_join(df.otu, df.ds, c('taxon' = 'taxon'))
df.j %>% head(n=3) %>% as.data.frame
In [115]:
%%R
# classifying
df.j.f = df.j %>%
filter(! is.na(log2FoldChange)) %>%
mutate(cls = mapply(clsfy, incorp.pred, incorp.known)) %>%
filter(cls != 'True negative')
In [116]:
%%R
df.L1 = df.j.f %>% filter(library == 1) %>%
dplyr::select(taxon, min_rel_abund, mean_rel_abund, log2FoldChange, cls) #%>%
# rename('lib1_mean_rel_abund' = mean_rel_abund)
df.L2 = df.j.f %>% filter(library == 2) %>%
dplyr::select(taxon, min_rel_abund, mean_rel_abund, log2FoldChange, cls) #%>%
# rename('lib2_mean_rel_abund' = mean_rel_abund)
df.j = inner_join(df.L1, df.L2, c('taxon' = 'taxon'))
df.j %>% head(n=3) %>% as.data.frame
In [117]:
%%R -w 600 -h 400
ggplot(df.j, aes(mean_rel_abund.x, mean_rel_abund.y, color=cls.x)) +
geom_point(alpha=0.7) +
#geom_density2d() +
scale_x_log10() +
scale_y_log10() +
labs(x = 'Relative abundance\n(control)', y='Relative abundance\n(treatment)') +
theme_bw() +
theme(
text = element_text(size=16)
)
In [118]:
%%R -h 400
ggplot(df.j.f, aes(cls, mean_rel_abund)) +
geom_boxplot() +
scale_y_log10() +
labs(y='relative abundance') +
facet_grid(library ~ .) +
theme_bw() +
theme(
text = element_text(size=16),
axis.title.x = element_blank()
)
In [119]:
%%R -w 600 -h 400
ggplot(df.j.f, aes(mean_rel_abund, log2FoldChange, color=cls)) +
geom_point(alpha=0.7) +
geom_density2d() +
scale_x_log10() +
labs(x = 'Relative abundance', y='log2 fold change') +
theme_bw() +
theme(
text = element_text(size=16)
)
In [120]:
%%R
library(ltm)
levs = c('True positive', 'False negative')
df.j.f %>%
mutate(cls = factor(cls, levels=levs)) %>%
group_by(library) %>%
summarize(biserial_cor = biserial.cor(mean_rel_abund, cls))
In [121]:
%%R
t.test(mean_rel_abund ~ cls, data=df.j.f)
In [67]:
%%R -i workDir
setwd(workDir)
df.ds = read.csv('DESeq2-cMtx_data.csv')
df.comm = read.delim('comm.txt', sep='\t') %>%
mutate(rel_abund_perc = rel_abund_perc / 100) %>%
rename('preFrac_rel_abund' = rel_abund_perc)
df.j = inner_join(df.ds, df.comm, c('taxon' = 'taxon_name'))
df.j %>% head(n=3)
In [70]:
%%R
# classifying
df.j.f = df.j %>%
filter(! is.na(log2FoldChange)) %>%
mutate(cls = mapply(clsfy, incorp.pred, incorp.known)) %>%
filter(cls != 'True negative',
library == 2)
df.j.f %>% head(n=3)
In [73]:
%%R -h 300
ggplot(df.j.f, aes(cls, preFrac_rel_abund)) +
geom_boxplot() +
scale_y_log10() +
labs(y='Relative abundance\n(pre-fractionation)') +
theme_bw() +
theme(
text = element_text(size=16),
axis.title.x = element_blank()
)
In [1057]:
%%R -i workDir -w 1000 -h 450
setwd(workDir)
df.ds = read.csv('DESeq2-cMtx_data.csv')
# loading file
df.otu = read.delim('OTU_n2_abs1e9_subNorm.txt', sep='\t')
#df.otu = read.delim('OTU_n2_abs1e9.txt', sep='\t')
df.j = inner_join(df.otu, df.ds, c('taxon' = 'taxon'))
# edit
lib.reval = c('1' = 'control',
'2' = 'treatment')
df.j = mutate(df.j, library = plyr::revalue(as.character(library), lib.reval))
df.j %>% head(n=3)
In [1058]:
%%R
# DESeq2 params
BD.win.min = 1.71
BD.win.max = 1.75
In [1069]:
%%R -w 800 -h 400
# plotting relative abundances: all
## plot
p = ggplot(df.j, aes(BD_mid, rel_abund, fill=taxon)) +
geom_vline(xintercept=c(BD.win.min, BD.win.max), linetype='dashed', alpha=0.8) +
labs(x='Buoyant density', y='Relative abundance', title='All taxa') +
facet_grid(library ~ ., scales='free_y') +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
p = p + geom_area(stat='identity', position='dodge', alpha=0.5)
p
In [1070]:
%%R -w 800 -h 400
# plotting relative abundances
df.j.TP = df.j %>%
filter(incorp.known == TRUE & incorp.pred == TRUE)
## plot
p = ggplot(df.j.TP, aes(BD_mid, rel_abund, fill=taxon)) +
geom_vline(xintercept=c(BD.win.min, BD.win.max), linetype='dashed', alpha=0.8) +
labs(x='Buoyant density', y='Relative abundance', title='True positives') +
facet_grid(library ~ ., scales='free_y') +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
p1 = p + geom_area(stat='identity', position='dodge', alpha=0.5)
p1
In [1071]:
%%R -w 800 -h 400
# plotting relative abundances
df.j.FN = df.j %>%
filter(incorp.known == TRUE & incorp.pred == FALSE)
## plot
p = ggplot(df.j.FN, aes(BD_mid, rel_abund, fill=taxon)) +
geom_vline(xintercept=c(BD.win.min, BD.win.max), linetype='dashed', alpha=0.8) +
labs(x='Buoyant density', y='Relative abundance', title='False negatives') +
facet_grid(library ~ ., scales='free_y') +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
p2 = p + geom_area(stat='identity', position='dodge', alpha=0.5)
p2
In [1072]:
%%R -i figureDir -h 550 -w 650
outFile = paste(c(figureDir, 'abundDist_TP-FN_example.pdf'), collapse='/')
p1.e = p1 + theme(axis.title.x = element_blank())
pdf(outFile, width=13, height=11)
grid.arrange(p1.e, p2, ncol=1)
dev.off()
grid.arrange(p1.e, p2, ncol=1)
In [1073]:
%%R
# checking on number of incorporators
df.j %>% filter(BD_shift > 0.05) %>% distinct(taxon) %>% nrow %>% print
df.j %>% filter(incorp.known == TRUE) %>% distinct(taxon) %>% nrow %>% print
In [1076]:
%%R
df.j %>% head
In [1077]:
%%R -w 750 -h 6000
# plotting relative abundances
df.j.f = df.j %>%
filter(incorp.known == TRUE) %>%
mutate(taxon= gsub('_', '\n', taxon))
df.j.f$taxon = reorder(df.j.f$taxon, -df.j.f$log2FoldChange)
quant = function(x, p=0.95){
x = x %>% as.numeric
return(quantile(x, probs=c(0.9))[1] %>% as.numeric)
}
df.j.f.txt = df.j.f %>%
group_by(taxon, BD_shift) %>%
summarize(BD_mid = max(BD_mid),
count = quant(count),
rel_abund = quant(rel_abund)) %>%
ungroup()
## plot
p = ggplot(df.j.f, aes(BD_mid, rel_abund)) +
geom_point(aes(color=incorp.pred)) +
geom_text(data=df.j.f.txt, aes(label=BD_shift)) +
scale_color_manual(values=c('darkgreen', 'purple')) +
geom_vline(xintercept=c(BD.win.min, BD.win.max), linetype='dashed', alpha=0.8) +
labs(x='Buoyant density', y='Abundance') +
facet_grid(taxon ~ ., scale='free_y') +
theme_bw() +
theme(
text = element_text(size=16)
)
p2 = p + geom_area(stat='identity', position='dodge', alpha=0.5, aes(fill=library, color=incorp.pred))
p2