In [1]:
import os
import glob
import re
import nestly
In [2]:
%load_ext rpy2.ipython
%load_ext pushnote
In [3]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)
library(phyloseq)
## BD for G+C of 0 or 100
BD.GCp0 = 0 * 0.098 + 1.66
BD.GCp100 = 1 * 0.098 + 1.66
In [4]:
workDir = '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/'
buildDir = os.path.join(workDir, 'Day1_rep10')
R_dir = '/home/nick/notebook/SIPSim/lib/R/'
fragFile= '/home/nick/notebook/SIPSim/dev/bac_genome1147/validation/ampFrags.pkl'
targetFile = '/home/nick/notebook/SIPSim/dev/fullCyc/CD-HIT/target_taxa.txt'
physeqDir = '/var/seq_data/fullCyc/MiSeq_16SrRNA/515f-806r/lib1-7/phyloseq/'
physeq_bulkCore = 'bulk-core'
physeq_SIP_core = 'SIP-core_unk'
nreps = 10
prefrac_comm_abundance = '1e9'
seq_per_fraction = ['lognormal', 9.432, 0.5, 10000, 30000] # dist, mean, scale, min, max
bulk_days = [1]
nprocs = 16
In [5]:
# building tree structure
nest = nestly.Nest()
## varying params
nest.add('rep', [x + 1 for x in xrange(nreps)])
## set params
nest.add('bulk_day', bulk_days, create_dir=False)
nest.add('abs', [prefrac_comm_abundance], create_dir=False)
nest.add('percIncorp', [0], create_dir=False)
nest.add('percTaxa', [0], create_dir=False)
nest.add('np', [nprocs], create_dir=False)
nest.add('subsample_dist', [seq_per_fraction[0]], create_dir=False)
nest.add('subsample_mean', [seq_per_fraction[1]], create_dir=False)
nest.add('subsample_scale', [seq_per_fraction[2]], create_dir=False)
nest.add('subsample_min', [seq_per_fraction[3]], create_dir=False)
nest.add('subsample_max', [seq_per_fraction[4]], create_dir=False)
### input/output files
nest.add('buildDir', [buildDir], create_dir=False)
nest.add('R_dir', [R_dir], create_dir=False)
nest.add('fragFile', [fragFile], create_dir=False)
nest.add('targetFile', [targetFile], create_dir=False)
nest.add('physeqDir', [physeqDir], create_dir=False)
nest.add('physeq_bulkCore', [physeq_bulkCore], create_dir=False)
# building directory tree
nest.build(buildDir)
# bash file to run
bashFile = os.path.join(buildDir, 'SIPSimRun.sh')
In [6]:
%%writefile $bashFile
#!/bin/bash
export PATH={R_dir}:$PATH
#-- making DNA pool similar to gradient of interest
echo '# Creating comm file from phyloseq'
phyloseq2comm.r {physeqDir}{physeq_bulkCore} -s 12C-Con -d {bulk_day} > {physeq_bulkCore}_comm.txt
printf 'Number of lines: '; wc -l {physeq_bulkCore}_comm.txt
echo '## Adding target taxa to comm file'
comm_add_target.r {physeq_bulkCore}_comm.txt {targetFile} > {physeq_bulkCore}_comm_target.txt
printf 'Number of lines: '; wc -l {physeq_bulkCore}_comm_target.txt
echo '## parsing out genome fragments to make simulated DNA pool resembling the gradient of interest'
## all OTUs without an associated reference genome will be assigned a random reference (of the reference genome pool)
### this is done through --NA-random
SIPSim fragment_KDE_parse {fragFile} {physeq_bulkCore}_comm_target.txt \
--rename taxon_name --NA-random > fragsParsed.pkl
echo '#-- SIPSim pipeline --#'
echo '# converting fragments to KDE'
SIPSim fragment_KDE \
fragsParsed.pkl \
> fragsParsed_KDE.pkl
echo '# adding diffusion'
SIPSim diffusion \
fragsParsed_KDE.pkl \
--np {np} \
> fragsParsed_KDE_dif.pkl
echo '# adding DBL contamination'
SIPSim DBL \
fragsParsed_KDE_dif.pkl \
--np {np} \
> fragsParsed_KDE_dif_DBL.pkl
echo '# making incorp file'
SIPSim incorpConfigExample \
--percTaxa {percTaxa} \
--percIncorpUnif {percIncorp} \
> {percTaxa}_{percIncorp}.config
echo '# adding isotope incorporation to BD distribution'
SIPSim isotope_incorp \
fragsParsed_KDE_dif_DBL.pkl \
{percTaxa}_{percIncorp}.config \
--comm {physeq_bulkCore}_comm_target.txt \
--np {np} \
> fragsParsed_KDE_dif_DBL_inc.pkl
#echo '# calculating BD shift from isotope incorporation'
#SIPSim BD_shift \
# fragsParsed_KDE_dif_DBL.pkl \
# fragsParsed_KDE_dif_DBL_inc.pkl \
# --np {np} \
# > fragsParsed_KDE_dif_DBL_inc_BD-shift.txt
echo '# simulating gradient fractions'
SIPSim gradient_fractions \
{physeq_bulkCore}_comm_target.txt \
> fracs.txt
echo '# simulating an OTU table'
SIPSim OTU_table \
fragsParsed_KDE_dif_DBL_inc.pkl \
{physeq_bulkCore}_comm_target.txt \
fracs.txt \
--abs {abs} \
--np {np} \
> OTU_abs{abs}.txt
echo '# simulating PCR'
SIPSim OTU_PCR \
OTU_abs{abs}.txt \
> OTU_abs{abs}_PCR.txt
echo '# subsampling from the OTU table (simulating sequencing of the DNA pool)'
SIPSim OTU_subsample \
--dist {subsample_dist} \
--dist_params mean:{subsample_mean},sigma:{subsample_scale} \
--min_size {subsample_min} \
--max_size {subsample_max} \
OTU_abs{abs}_PCR.txt \
> OTU_abs{abs}_PCR_sub.txt
echo '# making a wide-formatted table'
SIPSim OTU_wideLong -w \
OTU_abs{abs}_PCR_sub.txt \
> OTU_abs{abs}_PCR_sub_w.txt
echo '# making metadata (phyloseq: sample_data)'
SIPSim OTU_sampleData \
OTU_abs{abs}_PCR_sub.txt \
> OTU_abs{abs}_PCR_sub_meta.txt
In [ ]:
!chmod 777 $bashFile
!cd $workDir; \
nestrun --template-file $bashFile -d Day1_rep10 --log-file log.txt -j 1
In [ ]:
%pushnote Day1_rep10 complete
In [11]:
%%R
## min G+C cutoff
min_GC = 13.5
## max G+C cutoff
max_GC = 80
## max G+C shift
max_13C_shift_in_BD = 0.036
min_BD = min_GC/100.0 * 0.098 + 1.66
max_BD = max_GC/100.0 * 0.098 + 1.66
max_BD = max_BD + max_13C_shift_in_BD
cat('Min BD:', min_BD, '\n')
cat('Max BD:', max_BD, '\n')
In [62]:
%%R
# simulated OTU table file
OTU.table.dir = '/home/nick/notebook/SIPSim/dev/fullCyc/frag_norm_9_2.5_n5/Day1_default_run/1e9/'
OTU.table.file = 'OTU_abs1e9_PCR_sub.txt'
#OTU.table.file = 'OTU_abs1e9_sub.txt'
#OTU.table.file = 'OTU_abs1e9.txt'
In [26]:
%%R -i physeqDir -i physeq_SIP_core -i bulk_days
# bulk core samples
F = file.path(physeqDir, physeq_SIP_core)
physeq.SIP.core = readRDS(F)
physeq.SIP.core.m = physeq.SIP.core %>% sample_data
physeq.SIP.core = prune_samples(physeq.SIP.core.m$Substrate == '12C-Con' &
physeq.SIP.core.m$Day %in% bulk_days,
physeq.SIP.core) %>%
filter_taxa(function(x) sum(x) > 0, TRUE)
physeq.SIP.core.m = physeq.SIP.core %>% sample_data
physeq.SIP.core
In [64]:
%%R
## dataframe
df.EMP = physeq.SIP.core %>% otu_table %>%
as.matrix %>% as.data.frame
df.EMP$OTU = rownames(df.EMP)
df.EMP = df.EMP %>%
gather(sample, abundance, 1:(ncol(df.EMP)-1))
df.EMP = inner_join(df.EMP, physeq.SIP.core.m, c('sample' = 'X.Sample'))
df.EMP.nt = df.EMP %>%
group_by(sample) %>%
mutate(n_taxa = sum(abundance > 0)) %>%
ungroup() %>%
distinct(sample) %>%
filter(Buoyant_density >= min_BD,
Buoyant_density <= max_BD)
df.EMP.nt %>% head(n=3)
In [27]:
%%R
physeq.dir = '/var/seq_data/fullCyc/MiSeq_16SrRNA/515f-806r/lib1-7/phyloseq/'
physeq.bulk = 'bulk-core'
physeq.file = file.path(physeq.dir, physeq.bulk)
physeq.bulk = readRDS(physeq.file)
physeq.bulk.m = physeq.bulk %>% sample_data
physeq.bulk = prune_samples(physeq.bulk.m$Exp_type == 'microcosm_bulk' &
physeq.bulk.m$Day %in% bulk_days, physeq.bulk)
physeq.bulk.m = physeq.bulk %>% sample_data
physeq.bulk
In [28]:
%%R
physeq.bulk.n = transform_sample_counts(physeq.bulk, function(x) x/sum(x))
physeq.bulk.n
In [29]:
%%R
# making long format of each bulk table
bulk.otu = physeq.bulk.n %>% otu_table %>% as.data.frame
ncol = ncol(bulk.otu)
bulk.otu$OTU = rownames(bulk.otu)
bulk.otu = bulk.otu %>%
gather(sample, abundance, 1:ncol)
bulk.otu = inner_join(physeq.bulk.m, bulk.otu, c('X.Sample' = 'sample')) %>%
dplyr::select(OTU, abundance) %>%
rename('bulk_abund' = abundance)
bulk.otu %>% head(n=3)
In [30]:
%%R
# joining tables
df.EMP.j = inner_join(df.EMP, bulk.otu, c('OTU' = 'OTU')) %>%
filter(Buoyant_density >= min_BD,
Buoyant_density <= max_BD)
df.EMP.j %>% head(n=3)
In [31]:
OTU_files = !find $buildDir -name "OTU_abs1e9_PCR_sub.txt"
OTU_files
Out[31]:
In [15]:
%%R -i OTU_files
# loading files
df.SIM = list()
for (x in OTU_files){
SIM_rep = gsub('/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/', '', x)
SIM_rep = gsub('/OTU_abs1e9_PCR_sub.txt', '', SIM_rep)
df.SIM[[SIM_rep]] = read.delim(x, sep='\t')
}
df.SIM = do.call('rbind', df.SIM)
df.SIM$SIM_rep = gsub('\\.[0-9]+$', '', rownames(df.SIM))
rownames(df.SIM) = 1:nrow(df.SIM)
df.SIM %>% head
In [16]:
%%R
## edit table
df.SIM.nt = df.SIM %>%
filter(count > 0) %>%
group_by(SIM_rep, library, BD_mid) %>%
summarize(n_taxa = n()) %>%
filter(BD_mid >= min_BD,
BD_mid <= max_BD)
df.SIM.nt %>% head
In [32]:
# loading comm files
comm_files = !find $buildDir -name "bulk-core_comm_target.txt"
comm_files
Out[32]:
In [33]:
%%R -i comm_files
df.comm = list()
for (f in comm_files){
rep = gsub('.+/Day1_rep10/([0-9]+)/.+', '\\1', f)
df.comm[[rep]] = read.delim(f, sep='\t') %>%
dplyr::select(library, taxon_name, rel_abund_perc) %>%
rename('bulk_abund' = rel_abund_perc) %>%
mutate(bulk_abund = bulk_abund / 100)
}
df.comm = do.call('rbind', df.comm)
df.comm$SIM_rep = gsub('\\.[0-9]+$', '', rownames(df.comm))
rownames(df.comm) = 1:nrow(df.comm)
df.comm %>% head(n=3)
In [34]:
%%R
## joining tables
df.SIM.j = inner_join(df.SIM, df.comm, c('SIM_rep' = 'SIM_rep',
'library' = 'library',
'taxon' = 'taxon_name')) %>%
filter(BD_mid >= min_BD,
BD_mid <= max_BD)
df.SIM.j %>% head(n=3)
In [49]:
%%R
# filtering & combining emperical w/ simulated data
## emperical
max_BD_range = max(df.EMP.j$Buoyant_density) - min(df.EMP.j$Buoyant_density)
df.EMP.j.f = df.EMP.j %>%
filter(abundance > 0) %>%
group_by(OTU) %>%
summarize(mean_rel_abund = mean(bulk_abund),
min_BD = min(Buoyant_density),
max_BD = max(Buoyant_density),
BD_range = max_BD - min_BD,
BD_range_perc = BD_range / max_BD_range * 100) %>%
ungroup() %>%
mutate(dataset = 'emperical',
SIM_rep = NA)
## simulated
max_BD_range = max(df.SIM.j$BD_mid) - min(df.SIM.j$BD_mid)
df.SIM.j.f = df.SIM.j %>%
filter(count > 0) %>%
group_by(SIM_rep, taxon) %>%
summarize(mean_rel_abund = mean(bulk_abund),
min_BD = min(BD_mid),
max_BD = max(BD_mid),
BD_range = max_BD - min_BD,
BD_range_perc = BD_range / max_BD_range * 100) %>%
ungroup() %>%
rename('OTU' = taxon) %>%
mutate(dataset = 'simulated')
## join
df.j = rbind(df.EMP.j.f, df.SIM.j.f) %>%
filter(BD_range_perc > 0,
mean_rel_abund > 0)
df.j$SIM_rep = reorder(df.j$SIM_rep, df.j$SIM_rep %>% as.numeric)
df.j %>% head(n=3)
In [50]:
%%R -h 400
## plotting
ggplot(df.j, aes(mean_rel_abund, BD_range_perc, color=SIM_rep)) +
geom_point(alpha=0.3) +
scale_x_log10() +
scale_y_continuous() +
labs(x='Pre-fractionation abundance', y='% of total BD range') +
facet_grid(dataset ~ .) +
theme_bw() +
theme(
text = element_text(size=16),
panel.grid = element_blank()#,
#legend.position = 'none'
)
In [54]:
%%R -i targetFile
df.target = read.delim(targetFile, sep='\t')
df.target %>% nrow %>% print
df.target %>% head(n=3)
In [55]:
%%R
# filtering to just target taxa
df.j.t = df.j %>%
filter(OTU %in% df.target$OTU)
df.j %>% nrow %>% print
df.j.t %>% nrow %>% print
## plotting
ggplot(df.j.t, aes(mean_rel_abund, BD_range_perc, color=SIM_rep)) +
geom_point(alpha=0.5, shape='O') +
scale_x_log10() +
scale_y_continuous() +
#scale_color_manual(values=c('blue', 'red')) +
labs(x='Pre-fractionation abundance', y='% of total BD range') +
facet_grid(dataset ~ .) +
theme_bw() +
theme(
text = element_text(size=16),
panel.grid = element_blank()#,
#legend.position = 'none'
)
In [58]:
%%R -w 600 -h 500
# formatting data
df.1 = df.j.t %>%
filter(dataset == 'simulated') %>%
select(SIM_rep, OTU, mean_rel_abund, BD_range, BD_range_perc)
df.2 = df.j.t %>%
filter(dataset == 'emperical') %>%
select(SIM_rep, OTU, mean_rel_abund, BD_range, BD_range_perc)
df.12 = inner_join(df.1, df.2, c('OTU' = 'OTU')) %>%
mutate(BD_diff_perc = BD_range_perc.y - BD_range_perc.x)
df.12$SIM_rep.x = reorder(df.12$SIM_rep.x, df.12$SIM_rep.x %>% as.numeric)
## plotting
p1 = ggplot(df.12, aes(mean_rel_abund.x, mean_rel_abund.y)) +
geom_point(alpha=0.5) +
scale_x_log10() +
scale_y_log10() +
labs(x='Relative abundance (simulated)', y='Relative abundance (emperical)') +
facet_wrap(~ SIM_rep.x)
theme_bw() +
theme(
text = element_text(size=16),
panel.grid = element_blank(),
legend.position = 'none'
)
p1
In [66]:
%%R -w 800 -h 500
ggplot(df.12, aes(mean_rel_abund.x, BD_diff_perc)) +
geom_point(alpha=0.5) +
scale_x_log10() +
labs(x='Pre-fractionation relative abundance',
y='Difference in % of gradient spanned\n(emperical - simulated)',
title='Overlapping taxa') +
facet_wrap(~ SIM_rep.x) +
theme_bw() +
theme(
text = element_text(size=16),
panel.grid = element_blank(),
legend.position = 'none'
)
In [338]:
%%R
join_abund_dists = function(df.EMP.j, df.SIM.j, df.target){
## emperical
df.EMP.j.f = df.EMP.j %>%
filter(abundance > 0) %>%
dplyr::select(OTU, sample, abundance, Buoyant_density, bulk_abund) %>%
mutate(dataset = 'emperical', SIM_rep = NA) %>%
filter(OTU %in% df.target$OTU)
## simulated
df.SIM.j.f = df.SIM.j %>%
filter(count > 0) %>%
dplyr::select(taxon, fraction, count, BD_mid, bulk_abund, SIM_rep) %>%
rename('OTU' = taxon,
'sample' = fraction,
'Buoyant_density' = BD_mid,
'abundance' = count) %>%
mutate(dataset = 'simulated') %>%
filter(OTU %in% df.target$OTU)
## getting just intersecting OTUs
OTUs.int = intersect(df.EMP.j.f$OTU, df.SIM.j.f$OTU)
df.j = rbind(df.EMP.j.f, df.SIM.j.f) %>%
filter(OTU %in% OTUs.int) %>%
group_by(sample) %>%
mutate(rel_abund = abundance / sum(abundance))
cat('Number of overlapping OTUs between emperical & simulated:',
df.j$OTU %>% unique %>% length, '\n\n')
return(df.j)
}
df.j = join_abund_dists(df.EMP.j, df.SIM.j, df.target)
df.j %>% head(n=3) %>% as.data.frame
In [339]:
%%R
# closure operation
df.j = df.j %>%
ungroup() %>%
mutate(SIM_rep = SIM_rep %>% as.numeric) %>%
group_by(dataset, SIM_rep, sample) %>%
mutate(rel_abund_c = rel_abund / sum(rel_abund)) %>%
ungroup()
df.j %>% head(n=3) %>% as.data.frame
In [340]:
%%R -h 1500 -w 800
# plotting
plot_abunds = function(df){
p = ggplot(df, aes(Buoyant_density, rel_abund_c, fill=OTU)) +
geom_area(stat='identity', position='dodge', alpha=0.5) +
labs(x='Buoyant density',
y='Subsampled community\n(relative abundance for subset taxa)') +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none',
axis.title.y = element_text(vjust=1),
axis.title.x = element_blank(),
plot.margin=unit(c(0.1,1,0.1,1), "cm")
)
return(p)
}
# simulations
df.j.f = df.j %>%
filter(dataset == 'simulated')
p.SIM = plot_abunds(df.j.f)
p.SIM = p.SIM + facet_grid(SIM_rep ~ .)
# emperical
df.j.f = df.j %>%
filter(dataset == 'emperical')
p.EMP = plot_abunds(df.j.f)
# make figure
grid.arrange(p.EMP, p.SIM, ncol=1, heights=c(1,5))
In [341]:
OTU_files = !find $buildDir -name "OTU_abs1e9.txt"
OTU_files
Out[341]:
In [342]:
%%R -i OTU_files
# loading files
df.SIM.abs = list()
for (x in OTU_files){
SIM_rep = gsub('/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/', '', x)
SIM_rep = gsub('/OTU_abs1e9.txt', '', SIM_rep)
df.SIM.abs[[SIM_rep]] = read.delim(x, sep='\t')
}
df.SIM.abs = do.call('rbind', df.SIM.abs)
df.SIM.abs$SIM_rep = gsub('\\.[0-9]+$', '', rownames(df.SIM.abs))
rownames(df.SIM.abs) = 1:nrow(df.SIM.abs)
df.SIM.abs %>% head
In [343]:
%%R
# subset just overlapping taxa
# & closure operation
df.SIM.abs.t = df.SIM.abs %>%
filter(taxon %in% df.target$OTU) %>%
group_by(SIM_rep, fraction) %>%
mutate(rel_abund_c = count / sum(count)) %>%
rename('Buoyant_density' = BD_mid,
'OTU' = taxon)
df.SIM.abs.t %>% head(n=3) %>% as.data.frame
In [344]:
%%R -w 800 -h 1200
# plotting
p.abs = plot_abunds(df.SIM.abs.t)
p.abs + facet_grid(SIM_rep ~ .)
In [351]:
%%R
center_mass = function(df){
df = df %>%
group_by(dataset, SIM_rep, OTU) %>%
summarize(center_mass = weighted.mean(Buoyant_density, rel_abund_c, na.rm=T)) %>%
ungroup()
return(df)
}
df.j.cm = center_mass(df.j)
In [353]:
%%R
# getting mean cm for all SIM_reps
df.j.cm.s = df.j.cm %>%
group_by(dataset, OTU) %>%
summarize(mean_cm = mean(center_mass, na.rm=T),
stdev_cm = sd(center_mass)) %>%
ungroup() %>%
spread(dataset, mean_cm) %>%
group_by(OTU) %>%
summarize(stdev_cm = mean(stdev_cm, na.rm=T),
emperical = mean(emperical, na.rm=T),
simulated = mean(simulated, na.rm=T)) %>%
ungroup()
# check
cat('Number of OTUs:', df.j.cm.s$OTU %>% unique %>% length, '\n')
# plotting
ggplot(df.j.cm.s, aes(emperical, simulated,
ymin = simulated - stdev_cm,
ymax = simulated + stdev_cm)) +
geom_pointrange() +
stat_function(fun = function(x) x, linetype='dashed', alpha=0.5, color='red') +
scale_x_continuous(limits=c(1.69, 1.74)) +
scale_y_continuous(limits=c(1.7, 1.75)) +
labs(title='Center of mass') +
theme_bw() +
theme(
text = element_text(size=16)
)
In [383]:
%%R
BD_MIN = df.j$Buoyant_density %>% min
BD_MAX = df.j$Buoyant_density %>% max
BD_AVE = mean(c(BD_MIN, BD_MAX))
print(c(BD_MIN, BD_AVE, BD_MAX))
In [354]:
%%R
# formatting table
df.j.cm.j = inner_join(df.j.cm %>%
filter(dataset == 'simulated') %>%
rename('cm_SIM' = center_mass),
df.j.cm %>%
filter(dataset == 'emperical') %>%
rename('cm_EMP' = center_mass),
c('OTU' = 'OTU')) %>%
select(-starts_with('dataset'))
df.j.cm.j %>% head
In [355]:
%%R -w 300 -h 400
# lm()
df.j.cm.j.lm = df.j.cm.j %>%
group_by(SIM_rep.x) %>%
do(fit = lm(cm_EMP ~ cm_SIM, data = .)) %>%
mutate(R2 = summary(fit)$coeff[2],
data = 'simulated')
#df.j.cm.j.lm %>% head
# plotting
ggplot(df.j.cm.j.lm, aes(data, R2)) +
geom_boxplot() +
geom_jitter(height=0, width=0.2, color='red') +
labs(y='R^2', title='simulated ~ emperical') +
theme_bw() +
theme(
text = element_text(size=16)
)
In [384]:
%%R -h 1100 -w 800
# cutoff on which OTU are major outliers (varying between simulated and emperical)
BD.diff.cut = 0.015
# which OTU to plot?
df.j.cm.s.f = df.j.cm.s %>%
mutate(cm_diff = abs(emperical - simulated)) %>%
filter(cm_diff > BD.diff.cut, ! is.na(simulated))
print('OTUs:')
print(df.j.cm.s.f$OTU)
# filtering to just target taxon
## Simulated
df.j.f = df.j %>%
filter(dataset == 'simulated',
OTU %in% df.j.cm.s.f$OTU)
p.SIM = plot_abunds(df.j.f)
p.SIM = p.SIM + facet_grid(SIM_rep ~ .)
## Emperical
df.j.f = df.j %>%
filter(dataset == 'emperical',
OTU %in% df.j.cm.s.f$OTU)
p.EMP = plot_abunds(df.j.f)
# make figure
grid.arrange(p.EMP, p.SIM, ncol=1, heights=c(1,5))
Abundant taxon: OTU.32
In [367]:
%%R
# subset outliers
df.SIM.abs.t = df.SIM.abs %>%
filter(taxon %in% df.target$OTU) %>%
group_by(SIM_rep, fraction) %>%
mutate(rel_abund_c = count / sum(count)) %>%
rename('Buoyant_density' = BD_mid,
'OTU' = taxon) %>%
filter(OTU %in% df.j.cm.s.f$OTU)
df.SIM.abs.t %>% head(n=3) %>% as.data.frame
In [368]:
%%R -w 800 -h 1200
# plotting
p.abs = plot_abunds(df.SIM.abs.t)
p.abs + facet_grid(SIM_rep ~ .)
In [373]:
%%R -w 800
genomes = df.target %>%
filter(OTU %in% df.SIM.abs.t$OTU)
df.genInfo = read.delim('/var/seq_data/ncbi_db/genome/Jan2016/bac_complete_spec-rep1_rn/genome_info.txt')
df.genInfo.f = df.genInfo %>%
filter(seq_name %in% genomes$genome_seqID) %>%
mutate(genome_ID = gsub('\\.fna', '', file_name))
df.genInfo.f$genome_ID = reorder(df.genInfo.f$genome_ID, df.genInfo.f$total_GC)
# plotting
ggplot(df.genInfo.f, aes(genome_ID, total_GC)) +
geom_point() +
theme_bw() +
theme(
text = element_text(size=16),
axis.text.x = element_text(angle=50, hjust=1)
)
In [370]:
%%R -w 800
df.genInfo = read.delim('/var/seq_data/ncbi_db/genome/Jan2016/bac_complete_spec-rep1_rn/genome_info.txt')
df.genInfo.f = df.genInfo %>%
filter(seq_name %in% df.target$genome_seqID) %>%
mutate(genome_ID = gsub('\\.fna', '', file_name))
df.genInfo.f$genome_ID = reorder(df.genInfo.f$genome_ID, df.genInfo.f$total_GC)
# plotting
ggplot(df.genInfo.f, aes(genome_ID, total_GC)) +
geom_point() +
theme_bw() +
theme(
text = element_text(size=16),
axis.text.x = element_blank()
)
In [18]:
%%R
# simulated OTU table file
OTU.table.dir = '/home/nick/notebook/SIPSim/dev/fullCyc/frag_norm_9_2.5_n5/Day1_default_run/1e9/'
OTU.table.file = 'OTU_abs1e9_PCR_sub.txt'
#OTU.table.file = 'OTU_abs1e9_sub.txt'
#OTU.table.file = 'OTU_abs1e9.txt'
In [19]:
%%R -i physeqDir -i physeq_SIP_core -i bulk_days
# bulk core samples
F = file.path(physeqDir, physeq_SIP_core)
physeq.SIP.core = readRDS(F)
physeq.SIP.core.m = physeq.SIP.core %>% sample_data
physeq.SIP.core = prune_samples(physeq.SIP.core.m$Substrate == '12C-Con' &
physeq.SIP.core.m$Day %in% bulk_days,
physeq.SIP.core) %>%
filter_taxa(function(x) sum(x) > 0, TRUE)
physeq.SIP.core.m = physeq.SIP.core %>% sample_data
physeq.SIP.core
In [20]:
%%R -w 800 -h 300
## dataframe
df.EMP = physeq.SIP.core %>% otu_table %>%
as.matrix %>% as.data.frame
df.EMP$OTU = rownames(df.EMP)
df.EMP = df.EMP %>%
gather(sample, abundance, 1:(ncol(df.EMP)-1))
df.EMP = inner_join(df.EMP, physeq.SIP.core.m, c('sample' = 'X.Sample'))
df.EMP.nt = df.EMP %>%
group_by(sample) %>%
mutate(n_taxa = sum(abundance > 0)) %>%
ungroup() %>%
distinct(sample) %>%
filter(Buoyant_density >= min_BD,
Buoyant_density <= max_BD)
## plotting
p = ggplot(df.EMP.nt, aes(Buoyant_density, n_taxa)) +
geom_point(color='blue') +
geom_line(color='blue') +
#geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
labs(x='Buoyant density', y='Number of taxa') +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
p
In [21]:
OTU_files = !find $buildDir -name "OTU_abs1e9_PCR_sub.txt"
OTU_files
Out[21]:
In [38]:
%%R -i OTU_files
# loading files
df.SIM = list()
for (x in OTU_files){
SIM_rep = gsub('/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/', '', x)
SIM_rep = gsub('/OTU_abs1e9_PCR_sub.txt', '', SIM_rep)
df.SIM[[SIM_rep]] = read.delim(x, sep='\t')
}
df.SIM = do.call('rbind', df.SIM)
df.SIM$SIM_rep = gsub('\\.[0-9]+$', '', rownames(df.SIM))
rownames(df.SIM) = 1:nrow(df.SIM)
df.SIM %>% head
In [40]:
%%R
## edit table
df.SIM.nt = df.SIM %>%
filter(count > 0) %>%
group_by(SIM_rep, library, BD_mid) %>%
summarize(n_taxa = n()) %>%
filter(BD_mid >= min_BD,
BD_mid <= max_BD)
df.SIM.nt %>% head
In [53]:
%%R -w 800 -h 300
# plotting
p = ggplot(df.SIM.nt, aes(BD_mid, n_taxa, group=SIM_rep)) +
geom_point(color='red') +
geom_line(color='red', alpha=0.5) +
#geom_smooth(color='red') +
geom_point(data=df.EMP.nt, aes(x=Buoyant_density), color='blue') +
geom_line(data=df.EMP.nt, aes(x=Buoyant_density), color='blue') +
#geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
labs(x='Buoyant density', y='Number of taxa') +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
p
In [54]:
%%R -w 800 -h 300
# normalized by max number of taxa
## edit table
df.SIM.nt = df.SIM.nt %>%
group_by(SIM_rep) %>%
mutate(n_taxa_norm = n_taxa / max(n_taxa))
df.EMP.nt = df.EMP.nt %>%
group_by() %>%
mutate(n_taxa_norm = n_taxa / max(n_taxa))
## plot
p = ggplot(df.SIM.nt, aes(BD_mid, n_taxa_norm, group=SIM_rep)) +
geom_point(color='red') +
geom_line(color='red') +
geom_point(data=df.EMP.nt, aes(x=Buoyant_density), color='blue') +
geom_line(data=df.EMP.nt, aes(x=Buoyant_density), color='blue') +
#geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
scale_y_continuous(limits=c(0, 1)) +
labs(x='Buoyant density', y='Number of taxa\n(fraction of max)') +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
p
In [58]:
%%R -w 800 -h 300
# simulated
df.SIM.s = df.SIM %>%
group_by(SIM_rep, library, BD_mid) %>%
summarize(total_abund = sum(count)) %>%
rename('Day' = library, 'Buoyant_density' = BD_mid) %>%
ungroup() %>%
mutate(dataset='simulated')
# emperical
df.EMP.s = df.EMP %>%
group_by(Day, Buoyant_density) %>%
summarize(total_abund = sum(abundance)) %>%
ungroup() %>%
mutate(dataset='emperical', SIM_rep = NA)
# join
df.j = rbind(df.SIM.s, df.EMP.s) %>%
filter(Buoyant_density >= min_BD,
Buoyant_density <= max_BD)
df.SIM.s = df.EMP.s = ""
# plot
ggplot(df.j, aes(Buoyant_density, total_abund, color=dataset, group=SIM_rep)) +
geom_point() +
geom_line(alpha=0.3) +
geom_line(data=df.j %>% filter(dataset=='emperical')) +
scale_color_manual(values=c('blue', 'red')) +
labs(x='Buoyant density', y='Total sequences per sample') +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
In [59]:
%%R
shannon_index_long = function(df, abundance_col, ...){
# calculating shannon diversity index from a 'long' formated table
## community_col = name of column defining communities
## abundance_col = name of column defining taxon abundances
df = df %>% as.data.frame
cmd = paste0(abundance_col, '/sum(', abundance_col, ')')
df.s = df %>%
group_by_(...) %>%
mutate_(REL_abundance = cmd) %>%
mutate(pi__ln_pi = REL_abundance * log(REL_abundance),
shannon = -sum(pi__ln_pi, na.rm=TRUE)) %>%
ungroup() %>%
dplyr::select(-REL_abundance, -pi__ln_pi) %>%
distinct_(...)
return(df.s)
}
In [60]:
%%R
# calculating shannon
df.SIM.shan = shannon_index_long(df.SIM, 'count', 'SIM_rep', 'library', 'fraction') %>%
filter(BD_mid >= min_BD,
BD_mid <= max_BD)
df.EMP.shan = shannon_index_long(df.EMP, 'abundance', 'sample') %>%
filter(Buoyant_density >= min_BD,
Buoyant_density <= max_BD)
In [62]:
%%R -w 800 -h 300
# plotting
p = ggplot(df.SIM.shan, aes(BD_mid, shannon, group=SIM_rep)) +
geom_point(color='red') +
geom_line(color='red', alpha=0.3) +
geom_point(data=df.EMP.shan, aes(x=Buoyant_density), color='blue') +
geom_line(data=df.EMP.shan, aes(x=Buoyant_density), color='blue') +
scale_y_continuous(limits=c(4, 7.5)) +
labs(x='Buoyant density', y='Shannon index') +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
p
In [65]:
%%R -h 300 -w 800
# simulated
df.SIM.s = df.SIM %>%
filter(rel_abund > 0) %>%
group_by(SIM_rep, BD_mid) %>%
summarize(min_abund = min(rel_abund),
max_abund = max(rel_abund)) %>%
ungroup() %>%
rename('Buoyant_density' = BD_mid) %>%
mutate(dataset = 'simulated')
# emperical
df.EMP.s = df.EMP %>%
group_by(Buoyant_density) %>%
mutate(rel_abund = abundance / sum(abundance)) %>%
filter(rel_abund > 0) %>%
summarize(min_abund = min(rel_abund),
max_abund = max(rel_abund)) %>%
ungroup() %>%
mutate(dataset = 'emperical',
SIM_rep = NA)
df.j = rbind(df.SIM.s, df.EMP.s) %>%
filter(Buoyant_density >= min_BD,
Buoyant_density <= max_BD)
# plotting
ggplot(df.j, aes(Buoyant_density, max_abund, color=dataset, group=SIM_rep)) +
geom_point() +
geom_line(alpha=0.3) +
geom_line(data=df.j %>% filter(dataset=='emperical')) +
scale_color_manual(values=c('blue', 'red')) +
labs(x='Buoyant density', y='Maximum relative abundance') +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
In [67]:
# loading comm files
comm_files = !find $buildDir -name "bulk-core_comm_target.txt"
comm_files
Out[67]:
In [74]:
%%R -i comm_files
df.comm = list()
for (f in comm_files){
rep = gsub('.+/Day1_rep10/([0-9]+)/.+', '\\1', f)
df.comm[[rep]] = read.delim(f, sep='\t') %>%
dplyr::select(library, taxon_name, rel_abund_perc) %>%
rename('bulk_abund' = rel_abund_perc) %>%
mutate(bulk_abund = bulk_abund / 100)
}
df.comm = do.call('rbind', df.comm)
df.comm$SIM_rep = gsub('\\.[0-9]+$', '', rownames(df.comm))
rownames(df.comm) = 1:nrow(df.comm)
df.comm %>% head(n=3)
In [75]:
%%R
## joining
df.SIM.j = inner_join(df.SIM, df.comm, c('SIM_rep' = 'SIM_rep',
'library' = 'library',
'taxon' = 'taxon_name')) %>%
filter(BD_mid >= min_BD,
BD_mid <= max_BD)
df.SIM.j %>% head(n=3)
In [107]:
%%R
bulk_days = c(1)
In [108]:
%%R
physeq.dir = '/var/seq_data/fullCyc/MiSeq_16SrRNA/515f-806r/lib1-7/phyloseq/'
physeq.bulk = 'bulk-core'
physeq.file = file.path(physeq.dir, physeq.bulk)
physeq.bulk = readRDS(physeq.file)
physeq.bulk.m = physeq.bulk %>% sample_data
physeq.bulk = prune_samples(physeq.bulk.m$Exp_type == 'microcosm_bulk' &
physeq.bulk.m$Day %in% bulk_days, physeq.bulk)
physeq.bulk.m = physeq.bulk %>% sample_data
physeq.bulk
In [76]:
%%R
bulk_days = c(1)
In [77]:
%%R
physeq.dir = '/var/seq_data/fullCyc/MiSeq_16SrRNA/515f-806r/lib1-7/phyloseq/'
physeq.bulk = 'bulk-core'
physeq.file = file.path(physeq.dir, physeq.bulk)
physeq.bulk = readRDS(physeq.file)
physeq.bulk.m = physeq.bulk %>% sample_data
physeq.bulk = prune_samples(physeq.bulk.m$Exp_type == 'microcosm_bulk' &
physeq.bulk.m$Day %in% bulk_days, physeq.bulk)
physeq.bulk.m = physeq.bulk %>% sample_data
physeq.bulk
In [78]:
%%R
physeq.bulk.n = transform_sample_counts(physeq.bulk, function(x) x/sum(x))
physeq.bulk.n
In [79]:
%%R
# making long format of each bulk table
bulk.otu = physeq.bulk.n %>% otu_table %>% as.data.frame
ncol = ncol(bulk.otu)
bulk.otu$OTU = rownames(bulk.otu)
bulk.otu = bulk.otu %>%
gather(sample, abundance, 1:ncol)
bulk.otu = inner_join(physeq.bulk.m, bulk.otu, c('X.Sample' = 'sample')) %>%
dplyr::select(OTU, abundance) %>%
rename('bulk_abund' = abundance)
bulk.otu %>% head(n=3)
In [81]:
%%R
# joining tables
df.EMP.j = inner_join(df.EMP, bulk.otu, c('OTU' = 'OTU')) %>%
filter(Buoyant_density >= min_BD,
Buoyant_density <= max_BD)
df.EMP.j %>% head(n=3)
In [102]:
%%R
# filtering & combining emperical w/ simulated data
## emperical
max_BD_range = max(df.EMP.j$Buoyant_density) - min(df.EMP.j$Buoyant_density)
df.EMP.j.f = df.EMP.j %>%
filter(abundance > 0) %>%
group_by(OTU) %>%
summarize(mean_rel_abund = mean(bulk_abund),
min_BD = min(Buoyant_density),
max_BD = max(Buoyant_density),
BD_range = max_BD - min_BD,
BD_range_perc = BD_range / max_BD_range * 100) %>%
ungroup() %>%
mutate(dataset = 'emperical',
SIM_rep = NA)
## simulated
max_BD_range = max(df.SIM.j$BD_mid) - min(df.SIM.j$BD_mid)
df.SIM.j.f = df.SIM.j %>%
filter(count > 0) %>%
group_by(SIM_rep, taxon) %>%
summarize(mean_rel_abund = mean(bulk_abund),
min_BD = min(BD_mid),
max_BD = max(BD_mid),
BD_range = max_BD - min_BD,
BD_range_perc = BD_range / max_BD_range * 100) %>%
ungroup() %>%
rename('OTU' = taxon) %>%
mutate(dataset = 'simulated')
## join
df.j = rbind(df.EMP.j.f, df.SIM.j.f) %>%
filter(BD_range_perc > 0,
mean_rel_abund > 0)
df.j %>% head(n=3)
In [103]:
%%R -h 400
## plotting
ggplot(df.j, aes(mean_rel_abund, BD_range_perc, color=SIM_rep)) +
geom_point(alpha=0.5, shape='O') +
scale_x_log10() +
scale_y_continuous() +
labs(x='Pre-fractionation abundance', y='% of total BD range') +
facet_grid(dataset ~ .) +
theme_bw() +
theme(
text = element_text(size=16),
panel.grid = element_blank()#,
#legend.position = 'none'
)
In [106]:
%%R -h 400
## plotting
ggplot(df.j, aes(mean_rel_abund, BD_range_perc, color=dataset)) +
#geom_point(alpha=0.5, shape='O') +
stat_density2d() +
scale_x_log10() +
scale_y_continuous() +
scale_color_manual(values=c('blue', 'red')) +
labs(x='Pre-fractionation abundance', y='% of total BD range') +
#geom_vline(xintercept=0.001, linetype='dashed', alpha=0.5) +
facet_grid(dataset ~ .) +
theme_bw() +
theme(
text = element_text(size=16),
panel.grid = element_blank(),
legend.position = 'none'
)
In [119]:
%%R -i targetFile
df.target = read.delim(targetFile, sep='\t')
df.target %>% nrow %>% print
df.target %>% head(n=3)
In [145]:
%%R
# filtering to just target taxa
df.j.t = df.j %>%
filter(OTU %in% df.target$OTU)
df.j %>% nrow %>% print
df.j.t %>% nrow %>% print
## plotting
ggplot(df.j.t, aes(mean_rel_abund, BD_range_perc, color=SIM_rep)) +
geom_point(alpha=0.5, shape='O') +
scale_x_log10() +
scale_y_continuous() +
#scale_color_manual(values=c('blue', 'red')) +
labs(x='Pre-fractionation abundance', y='% of total BD range') +
facet_grid(dataset ~ .) +
theme_bw() +
theme(
text = element_text(size=16),
panel.grid = element_blank()#,
#legend.position = 'none'
)
In [150]:
%%R -w 600 -h 500
# formatting data
df.1 = df.j.t %>%
filter(dataset == 'simulated') %>%
select(SIM_rep, OTU, mean_rel_abund, BD_range, BD_range_perc)
df.2 = df.j.t %>%
filter(dataset == 'emperical') %>%
select(SIM_rep, OTU, mean_rel_abund, BD_range, BD_range_perc)
df.12 = inner_join(df.1, df.2, c('OTU' = 'OTU')) %>%
mutate(BD_diff_perc = BD_range_perc.x - BD_range_perc.y)
df.12$SIM_rep.x = reorder(df.12$SIM_rep.x, df.12$SIM_rep.x %>% as.numeric)
## plotting
p1 = ggplot(df.12, aes(mean_rel_abund.x, mean_rel_abund.y)) +
geom_point(alpha=0.5) +
scale_x_log10() +
scale_y_log10() +
labs(x='Relative abundance (simulated)', y='Relative abundance (emperical)') +
facet_wrap(~ SIM_rep.x)
theme_bw() +
theme(
text = element_text(size=16),
panel.grid = element_blank(),
legend.position = 'none'
)
p1
In [151]:
%%R -w 800 -h 500
ggplot(df.12, aes(mean_rel_abund.x, BD_diff_perc)) +
geom_point(alpha=0.5) +
scale_x_log10() +
labs(x='Relative abundance',
y='Difference in % of gradient spanned\n(simulated vs emperical)',
title='Overlapping taxa') +
facet_wrap(~ SIM_rep.x) +
theme_bw() +
theme(
text = element_text(size=16),
panel.grid = element_blank(),
legend.position = 'none'
)
In [ ]:
In [113]:
%%R -w 800 -h 400
plot_abund_dists = function(df.EMP.j, df.SIM.j, sim_rep){
## emperical
df.EMP.j.f = df.EMP.j %>%
filter(abundance > 0) %>%
dplyr::select(OTU, sample, abundance, Buoyant_density, bulk_abund) %>%
mutate(dataset = 'emperical')
## simulated
df.SIM.j.f = df.SIM.j %>%
filter(SIM_rep == sim_rep) %>%
filter(count > 0) %>%
dplyr::select(taxon, fraction, count, BD_mid, bulk_abund) %>%
rename('OTU' = taxon,
'sample' = fraction,
'Buoyant_density' = BD_mid,
'abundance' = count) %>%
mutate(dataset = 'simulated')
df.j = rbind(df.EMP.j.f, df.SIM.j.f) %>%
group_by(sample) %>%
mutate(rel_abund = abundance / sum(abundance))
title = paste0('Simulation Replicate ', sim_rep)
p = ggplot(df.j, aes(Buoyant_density, abundance, fill=OTU)) +
geom_area(stat='identity', position='dodge', alpha=0.5) +
labs(x='Buoyant density', y='Subsampled community\n(absolute abundance)', title=title) +
facet_grid(dataset ~ .) +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none',
axis.title.y = element_text(vjust=1),
axis.title.x = element_blank(),
plot.margin=unit(c(0.1,1,0.1,1), "cm")
)
return(p)
}
plot_abund_dists(df.EMP.j, df.SIM.j, sim_rep=1)
In [114]:
%%R -w 800 -h 400
plot_abund_dists(df.EMP.j, df.SIM.j, sim_rep=2)
In [115]:
%%R -w 800 -h 400
plot_abund_dists(df.EMP.j, df.SIM.j, sim_rep=3)
In [116]:
%%R -w 800 -h 400
plot_abund_dists(df.EMP.j, df.SIM.j, sim_rep=4)
In [117]:
%%R -w 800 -h 400
plot_abund_dists(df.EMP.j, df.SIM.j, sim_rep=5)
In [118]:
%%R -w 800 -h 400
plot_abund_dists(df.EMP.j, df.SIM.j, sim_rep=6)
In [132]:
%%R -w 800 -h 400
plot_abund_dists(df.EMP.j, df.SIM.j, sim_rep=10)