In [7]:
import os
import glob
import re
import nestly
In [8]:
%load_ext rpy2.ipython
%load_ext pushnote
In [9]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)
library(phyloseq)
In [10]:
%%R
## min G+C cutoff
min_GC = 13.5
## max G+C cutoff
max_GC = 80
## max G+C shift
max_13C_shift_in_BD = 0.036
min_BD = min_GC/100.0 * 0.098 + 1.66
max_BD = max_GC/100.0 * 0.098 + 1.66
max_BD = max_BD + max_13C_shift_in_BD
cat('Min BD:', min_BD, '\n')
cat('Max BD:', max_BD, '\n')
In [16]:
workDir = '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/'
buildDir = os.path.join(workDir, 'rep3')
R_dir = '/home/nick/notebook/SIPSim/lib/R/'
fragFile= '/home/nick/notebook/SIPSim/dev/bac_genome1147/validation/ampFrags.pkl'
nreps = 3
In [17]:
# building tree structure
nest = nestly.Nest()
# varying params
nest.add('rep', [x + 1 for x in xrange(nreps)])
## set params
nest.add('abs', ['1e9'], create_dir=False)
nest.add('percIncorp', [0], create_dir=False)
nest.add('percTaxa', [0], create_dir=False)
nest.add('np', [2], create_dir=False)
nest.add('subsample_dist', ['lognormal'], create_dir=False)
nest.add('subsample_mean', [9.432], create_dir=False)
nest.add('subsample_scale', [0.5], create_dir=False)
nest.add('subsample_min', [10000], create_dir=False)
nest.add('subsample_max', [30000], create_dir=False)
### input/output files
nest.add('buildDir', [buildDir], create_dir=False)
nest.add('R_dir', [R_dir], create_dir=False)
nest.add('fragFile', [fragFile], create_dir=False)
nest.add('bandwidth', [0.6], create_dir=False)
nest.add('comm_params', ['mean:-7.6836085,sigma:0.9082843'], create_dir=False)
# building directory tree
nest.build(buildDir)
# bash file to run
bashFile = os.path.join(buildDir, 'SIPSimRun.sh')
In [18]:
%%writefile $bashFile
#!/bin/bash
export PATH={R_dir}:$PATH
echo '#-- SIPSim pipeline --#'
echo '# converting fragments to KDE'
SIPSim fragment_KDE \
{fragFile} \
> ampFrags_KDE.pkl
echo '# making a community file'
SIPSim KDE_info \
-t ampFrags_KDE.pkl \
> taxon_names.txt
SIPSim communities \
--abund_dist_p {comm_params} \
taxon_names.txt \
> comm.txt
echo '# adding diffusion'
SIPSim diffusion \
ampFrags_KDE.pkl \
--bw {bandwidth} \
--np {np} \
> ampFrags_KDE_dif.pkl
echo '# adding DBL contamination'
SIPSim DBL \
ampFrags_KDE_dif.pkl \
--bw {bandwidth} \
--np {np} \
> ampFrags_KDE_dif_DBL.pkl
echo '# making incorp file'
SIPSim incorpConfigExample \
--percTaxa {percTaxa} \
--percIncorpUnif {percIncorp} \
> {percTaxa}_{percIncorp}.config
echo '# adding isotope incorporation to BD distribution'
SIPSim isotope_incorp \
ampFrags_KDE_dif_DBL.pkl \
{percTaxa}_{percIncorp}.config \
--comm comm.txt \
--bw {bandwidth} \
--np {np} \
> ampFrags_KDE_dif_DBL_inc.pkl
echo '# simulating gradient fractions'
SIPSim gradient_fractions \
comm.txt \
> fracs.txt
echo '# simulating an OTU table'
SIPSim OTU_table \
ampFrags_KDE_dif_DBL_inc.pkl \
comm.txt \
fracs.txt \
--abs {abs} \
--np {np} \
> OTU_abs{abs}.txt
#-- w/ PCR simulation --#
echo '# simulating PCR'
SIPSim OTU_PCR \
OTU_abs{abs}.txt \
> OTU_abs{abs}_PCR.txt
echo '# subsampling from the OTU table (simulating sequencing of the DNA pool)'
SIPSim OTU_subsample \
--dist {subsample_dist} \
--dist_params mean:{subsample_mean},sigma:{subsample_scale} \
--min_size {subsample_min} \
--max_size {subsample_max} \
OTU_abs{abs}_PCR.txt \
> OTU_abs{abs}_PCR_sub.txt
echo '# making a wide-formatted table'
SIPSim OTU_wideLong -w \
OTU_abs{abs}_PCR_sub.txt \
> OTU_abs{abs}_PCR_sub_w.txt
echo '# making metadata (phyloseq: sample_data)'
SIPSim OTU_sampleData \
OTU_abs{abs}_PCR_sub.txt \
> OTU_abs{abs}_PCR_sub_meta.txt
#-- w/out PCR simulation --#
echo '# subsampling from the OTU table (simulating sequencing of the DNA pool)'
SIPSim OTU_subsample \
--dist {subsample_dist} \
--dist_params mean:{subsample_mean},sigma:{subsample_scale} \
--min_size {subsample_min} \
--max_size {subsample_max} \
OTU_abs{abs}.txt \
> OTU_abs{abs}_sub.txt
echo '# making a wide-formatted table'
SIPSim OTU_wideLong -w \
OTU_abs{abs}_sub.txt \
> OTU_abs{abs}_sub_w.txt
echo '# making metadata (phyloseq: sample_data)'
SIPSim OTU_sampleData \
OTU_abs{abs}_sub.txt \
> OTU_abs{abs}_sub_meta.txt
In [19]:
!chmod 777 $bashFile
!cd $workDir; \
nestrun --template-file $bashFile -d rep3 --log-file log.txt -j 3
In [20]:
%pushnote SIPsim rep3 complete
In [21]:
OTU_files = !find $buildDir -name "OTU_abs1e9_sub.txt"
OTU_files
Out[21]:
In [22]:
%%R -i OTU_files
# loading files
df.SIM = list()
for (x in OTU_files){
SIM_rep = gsub('/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3/', '', x)
SIM_rep = gsub('/OTU_abs1e9_sub.txt', '', SIM_rep)
df.SIM[[SIM_rep]] = read.delim(x, sep='\t')
}
df.SIM = do.call('rbind', df.SIM)
df.SIM$SIM_rep = gsub('\\.[0-9]+$', '', rownames(df.SIM))
rownames(df.SIM) = 1:nrow(df.SIM)
df.SIM %>% head(n=3)
In [23]:
comm_files = !find $buildDir -name "comm.txt"
comm_files
Out[23]:
In [24]:
%%R -i comm_files
df.SIM.comm = list()
for (x in comm_files){
SIM_rep = gsub('/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3/', '', x)
SIM_rep = gsub('/comm.txt', '', SIM_rep)
df.SIM.comm[[SIM_rep]] = read.delim(x, sep='\t')
}
df.SIM.comm = do.call(rbind, df.SIM.comm)
df.SIM.comm$SIM_rep = gsub('\\.[0-9]+$', '', rownames(df.SIM.comm))
rownames(df.SIM.comm) = 1:nrow(df.SIM.comm)
df.SIM.comm = df.SIM.comm %>%
rename('bulk_abund' = rel_abund_perc) %>%
mutate(bulk_abund = bulk_abund / 100)
df.SIM.comm %>% head(n=3)
In [25]:
%%R -w 800 -h 400
# Plotting the pre-fractionation abundances of each taxon
df.SIM.comm.s = df.SIM.comm %>%
group_by(taxon_name) %>%
summarize(median_rank = median(rank),
mean_abund = mean(bulk_abund),
sd_abund = sd(bulk_abund))
df.SIM.comm.s$taxon_name = reorder(df.SIM.comm.s$taxon_name, -df.SIM.comm.s$mean_abund)
ggplot(df.SIM.comm.s, aes(taxon_name, mean_abund,
ymin=mean_abund-sd_abund,
ymax=mean_abund+sd_abund)) +
geom_linerange(alpha=0.4) +
geom_point(alpha=0.6, size=1.2) +
scale_y_log10() +
labs(x='taxon', y='Relative abundance', title='Pre-fractionation abundance') +
theme_bw() +
theme(
text = element_text(size=16),
axis.text.x = element_blank()
)
In [26]:
%%R
## joining SIP & comm (pre-fractionation)
df.SIM.j = inner_join(df.SIM, df.SIM.comm, c('library' = 'library',
'taxon' = 'taxon_name',
'SIM_rep' = 'SIM_rep')) %>%
filter(BD_mid >= min_BD,
BD_mid <= max_BD)
df.SIM.j %>% head(n=3)
In [27]:
%%R
# calculating BD range
df.SIM.j.f = df.SIM.j %>%
filter(count > 0) %>%
group_by(SIM_rep) %>%
mutate(max_BD_range = max(BD_mid) - min(BD_mid)) %>%
ungroup() %>%
group_by(SIM_rep, taxon) %>%
summarize(mean_bulk_abund = mean(bulk_abund),
min_BD = min(BD_mid),
max_BD = max(BD_mid),
BD_range = max_BD - min_BD,
BD_range_perc = BD_range / first(max_BD_range) * 100) %>%
ungroup()
df.SIM.j.f %>% head(n=3) %>% as.data.frame
In [28]:
%%R -h 300 -w 550
## plotting
ggplot(df.SIM.j.f, aes(mean_bulk_abund, BD_range_perc, color=SIM_rep)) +
geom_point(alpha=0.5, shape='O') +
scale_x_log10() +
scale_y_continuous() +
labs(x='Pre-fractionation abundance', y='% of total BD range') +
#geom_vline(xintercept=0.001, linetype='dashed', alpha=0.5) +
theme_bw() +
theme(
text = element_text(size=16),
panel.grid = element_blank(),
legend.position = 'none'
)
In [29]:
%%R
# giving value to missing abundances
min.pos.val = df.SIM.j %>%
filter(rel_abund > 0) %>%
group_by() %>%
mutate(min_abund = min(rel_abund)) %>%
ungroup() %>%
filter(rel_abund == min_abund)
min.pos.val = min.pos.val[1,'rel_abund'] %>% as.numeric
imp.val = min.pos.val / 10
# convert numbers
df.SIM.j[df.SIM.j$rel_abund == 0, 'abundance'] = imp.val
# another closure operation
df.SIM.j = df.SIM.j %>%
group_by(SIM_rep, fraction) %>%
mutate(rel_abund = rel_abund / sum(rel_abund))
# status
cat('Below detection level abundances converted to: ', imp.val, '\n')
In [30]:
%%R
shannon_index_long = function(df, abundance_col, ...){
# calculating shannon diversity index from a 'long' formated table
## community_col = name of column defining communities
## abundance_col = name of column defining taxon abundances
df = df %>% as.data.frame
cmd = paste0(abundance_col, '/sum(', abundance_col, ')')
df.s = df %>%
group_by_(...) %>%
mutate_(REL_abundance = cmd) %>%
mutate(pi__ln_pi = REL_abundance * log(REL_abundance),
shannon = -sum(pi__ln_pi, na.rm=TRUE)) %>%
ungroup() %>%
dplyr::select(-REL_abundance, -pi__ln_pi) %>%
distinct_(...)
return(df.s)
}
In [31]:
%%R -w 800 -h 300
# calculating shannon
df.SIM.shan = shannon_index_long(df.SIM.j, 'count', 'library', 'fraction') %>%
filter(BD_mid >= min_BD,
BD_mid <= max_BD)
df.SIM.shan.s = df.SIM.shan %>%
group_by(BD_bin = ntile(BD_mid, 24)) %>%
summarize(mean_BD = mean(BD_mid),
mean_shannon = mean(shannon),
sd_shannon = sd(shannon))
# plotting
p = ggplot(df.SIM.shan.s, aes(mean_BD, mean_shannon,
ymin=mean_shannon-sd_shannon,
ymax=mean_shannon+sd_shannon)) +
geom_pointrange() +
labs(x='Buoyant density', y='Shannon index') +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
p
In [32]:
%%R -w 800 -h 350
df.SIM.j.var = df.SIM.j %>%
group_by(SIM_rep, fraction) %>%
mutate(variance = var(rel_abund)) %>%
ungroup() %>%
distinct(SIM_rep, fraction) %>%
select(SIM_rep, fraction, variance, BD_mid)
ggplot(df.SIM.j.var, aes(BD_mid, variance, color=SIM_rep)) +
geom_point() +
geom_line() +
theme_bw() +
theme(
text = element_text(size=16)
)
In [33]:
OTU_files = !find $buildDir -name "OTU_abs1e9.txt"
OTU_files
Out[33]:
In [34]:
%%R -i OTU_files
# loading files
df.abs = list()
for (x in OTU_files){
SIM_rep = gsub('/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/rep3/', '', x)
SIM_rep = gsub('/OTU_abs1e9.txt', '', SIM_rep)
df.abs[[SIM_rep]] = read.delim(x, sep='\t')
}
df.abs = do.call('rbind', df.abs)
df.abs$SIM_rep = gsub('\\.[0-9]+$', '', rownames(df.abs))
rownames(df.abs) = 1:nrow(df.abs)
df.abs %>% head(n=3)
In [35]:
%%R -w 800
ggplot(df.abs, aes(BD_mid, count, fill=taxon)) +
geom_area(stat='identity', position='dodge', alpha=0.5) +
labs(x='Buoyant density', y='Subsampled community\n(absolute abundance)') +
facet_grid(SIM_rep ~ .) +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none',
axis.title.y = element_text(vjust=1),
axis.title.x = element_blank()
)
In [36]:
%%R -w 800
p1 = ggplot(df.abs %>% filter(BD_mid < 1.7), aes(BD_mid, count, fill=taxon, color=taxon)) +
labs(x='Buoyant density', y='Subsampled community\n(absolute abundance)') +
facet_grid(SIM_rep ~ .) +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none',
axis.title.y = element_text(vjust=1),
axis.title.x = element_blank()
)
p2 = p1 + geom_line(alpha=0.25) + scale_y_log10()
p1 = p1 + geom_area(stat='identity', position='dodge', alpha=0.5)
grid.arrange(p1, p2, ncol=2)
In [37]:
%%R -w 800
p1 = ggplot(df.abs %>% filter(BD_mid > 1.72), aes(BD_mid, count, fill=taxon, color=taxon)) +
labs(x='Buoyant density', y='Subsampled community\n(absolute abundance)') +
facet_grid(SIM_rep ~ .) +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none',
axis.title.y = element_text(vjust=1),
axis.title.x = element_blank()
)
p2 = p1 + geom_line(alpha=0.25) + scale_y_log10()
p1 = p1 + geom_area(stat='identity', position='dodge', alpha=0.5)
grid.arrange(p1, p2, ncol=2)
In [ ]: