In [1]:
workDir = '/home/nick/notebook/SIPSim/dev/fullCyc_trim/'
physeqDir = '/var/seq_data/fullCyc/MiSeq_16SrRNA/515f-806r/lib1-7/phyloseq/'
physeq_bulkCore = 'bulk-core'
physeq_SIP_core = 'SIP-core_unk'
ampFrag_KDE_info = '/home/nick/notebook/SIPSim/dev/bac_genome1147/validation/ampFrags_kde_info.txt'
In [2]:
import os
import sys
%load_ext rpy2.ipython
%load_ext pushnote
In [3]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)
library(phyloseq)
library(vegan)
library(robCompositions)
In [4]:
if not os.path.isdir(workDir):
os.makedirs(workDir)
%cd $workDir
In [5]:
%%R
## min G+C cutoff
min_GC = 13.5
## max G+C cutoff
max_GC = 80
## max G+C shift
max_13C_shift_in_BD = 0.036
min_BD = min_GC/100.0 * 0.098 + 1.66
max_BD = max_GC/100.0 * 0.098 + 1.66
max_BD = max_BD + max_13C_shift_in_BD
cat('Min BD:', min_BD, '\n')
cat('Max BD:', max_BD, '\n')
In [6]:
%%R -i ampFrag_KDE_info
ntaxa = read.delim(ampFrag_KDE_info, sep='\t') %>%
filter(! is.nan(KDE_ID)) %>%
distinct(taxon_ID) %>% nrow
cat('Number of taxa that have amplicon-fragments: ', ntaxa, '\n')
In [8]:
%%R -i workDir
F = file.path(workDir, 'ampFrags_kde_amplified.txt')
taxa = read.delim(ampFrag_KDE_info, sep='\t') %>%
filter(! is.nan(KDE_ID)) %>%
select(taxon_ID) %>%
distinct(taxon_ID) %>%
rename('genomeID' = taxon_ID)
write.table(taxa, F, quote=FALSE, row.names=FALSE, col.names=TRUE)
cat('File written:', F, '\n')
In [17]:
%%R -i physeqDir -i physeq_bulkCore
physeq.file = file.path(physeqDir, physeq_bulkCore)
physeq.bulk = readRDS(physeq.file)
physeq.bulk.m = physeq.bulk %>% sample_data
physeq.bulk = prune_samples(physeq.bulk.m$Exp_type == 'microcosm_bulk' &
physeq.bulk.m$Substrate == '12C-Con', physeq.bulk)
physeq.bulk.m = physeq.bulk %>% sample_data
physeq.bulk
In [18]:
%%R -i physeqDir -i physeq_SIP_core
# bulk core samples
F = file.path(physeqDir, physeq_SIP_core)
physeq.SIP.core = readRDS(F)
physeq.SIP.core.m = physeq.SIP.core %>% sample_data
physeq.SIP.core = prune_samples(physeq.SIP.core.m$Substrate == '12C-Con',
physeq.SIP.core) %>%
filter_taxa(function(x) sum(x) > 0, TRUE)
physeq.SIP.core.m = physeq.SIP.core %>% sample_data
physeq.SIP.core
In [19]:
%%R
df.bulk.otu = physeq.bulk %>%
otu_table %>%
as.data.frame
df.bulk.otu$OTU = rownames(df.bulk.otu)
df.bulk.otu = df.bulk.otu %>%
gather(sample, count, ends_with('_bulk')) %>%
group_by(sample) %>%
mutate(rank_abund = row_number(-count)) %>%
ungroup() %>%
filter(rank_abund <= ntaxa) %>%
arrange(rank_abund)
df.bulk.otu %>% nrow %>% print
df.bulk.otu$OTU %>% unique %>% length %>% print
df.bulk.otu %>% head(n=3)
In [20]:
%%R
# filtering taxa from fraction samples & performing total-sum scaling (closure operation)
bulk.samples = df.bulk.otu$sample %>% unique
physeq.SIP.l = list()
for (bs in bulk.samples){
df.otus = df.bulk.otu %>%
filter(sample == bs)
day = gsub('.+\\.D([0-9]+)\\.R.+', '\\1', bs) %>% as.numeric
physeq.SIP.l[[bs]] = prune_samples(physeq.SIP.core.m$Day == day, physeq.SIP.core)
physeq.SIP.l[[bs]] = prune_taxa(df.otus$OTU, physeq.SIP.l[[bs]]) %>%
transform_sample_counts(function(x) x/sum(x))
}
physeq.SIP.l
In [12]:
%%R -i workDir
outFile = 'SIP-core_unk_trm'
outFile = file.path(workDir, outFile)
saveRDS(physeq.SIP.l, outFile)
cat('File written:', outFile, '\n')
In [21]:
%%R
# filtering taxa from fraction samples & performing total-sum scaling (closure operation)
bulk.samples = df.bulk.otu$sample %>% unique
physeq.bulk.l = list()
for (bs in bulk.samples){
df.otus = df.bulk.otu %>%
filter(sample == bs)
day = gsub('.+\\.D([0-9]+)\\.R.+', '\\1', bs) %>% as.numeric
physeq.bulk.l[[bs]] = prune_samples(physeq.bulk.m$Day == day, physeq.bulk)
physeq.bulk.l[[bs]] = prune_taxa(df.otus$OTU, physeq.bulk.l[[bs]]) %>%
transform_sample_counts(function(x) x/sum(x))
}
physeq.bulk.l
In [14]:
%%R -i workDir
outFile = 'bulk-core_trm'
outFile = file.path(workDir, outFile)
saveRDS(physeq.bulk.l, outFile)
cat('File written:', outFile, '\n')
In [22]:
%%R -i workDir
F = file.path(workDir, 'SIP-core_unk_trm')
physeq.SIP.l = readRDS(F)
physeq.SIP.l %>% names
In [23]:
%%R
otu2df = function(x){
x %>% otu_table %>% as.data.frame
}
otu.l = lapply(physeq.SIP.l, otu2df)
samps = names(otu.l)
df.SIP.otu = otu.l[[samps[1]]]
df.SIP.otu$OTU = rownames(df.SIP.otu)
for (x in samps[2:length(samps)]){
y = otu.l[[x]]
y$OTU = rownames(y)
df.SIP.otu = left_join(df.SIP.otu, y, c('OTU' = 'OTU'))
}
otu.l = NULL
df.SIP.otu[is.na(df.SIP.otu)] = 0
df.SIP.otu[1:5, 1:5]
In [24]:
%%R
# convert physeq object to a dataframe
df.EMP = df.SIP.otu %>%
gather(sample, abundance, starts_with('12C-Con'))
df.EMP = inner_join(df.EMP, physeq.SIP.core %>% sample_data, c('sample' = 'X.Sample')) %>%
mutate(Day = Day %>% as.character)
df.EMP$Day = reorder(df.EMP$Day, df.EMP$Day %>% as.numeric)
df.EMP$Day %>% unique %>% print
df.EMP %>% head(n=3)
In [25]:
%%R -i workDir
F = file.path(workDir, 'bulk-core_trm')
physeq.bulk.l = readRDS(F)
physeq.bulk.l %>% names
In [26]:
%%R
otu2df = function(x){
x %>% otu_table %>% as.data.frame
}
otu.l = lapply(physeq.bulk.l, otu2df)
samps = names(otu.l)
df.bulk.otu = otu.l[[samps[1]]]
df.bulk.otu$OTU = rownames(df.bulk.otu)
for (x in samps[2:length(samps)]){
y = otu.l[[x]]
y$OTU = rownames(y)
df.bulk.otu = left_join(df.bulk.otu, y, c('OTU' = 'OTU'))
}
otu.l = NULL
df.bulk.otu[is.na(df.bulk.otu)] = 0
df.bulk.otu = df.bulk.otu %>%
gather(sample, abundance, starts_with('12C-Con')) %>%
mutate(Day = gsub('.+\\.D([0-9]+)\\.R.+', '\\1', sample)) %>%
rename('preFrac_abund' = abundance,
'preFrac_sample' = sample)
df.bulk.otu %>% head(n=3)
In [27]:
%%R
# joining SIP & bulk
df.EMP.j = inner_join(df.EMP, df.bulk.otu, c('OTU' = 'OTU',
'Day' = 'Day'))
df.EMP.j %>% head(n=3) %>% as.data.frame
In [28]:
%%R
#max_BD_range = max(df.EMP.j$Buoyant_density) - min(df.EMP.j$Buoyant_density)
df.EMP.j.f = df.EMP.j %>%
filter(abundance > 0) %>%
group_by(Day) %>%
mutate(max_BD_range = max(Buoyant_density) - min(Buoyant_density)) %>%
ungroup() %>%
group_by(OTU, Day) %>%
summarize(mean_preFrac_abund = mean(preFrac_abund),
min_BD = min(Buoyant_density),
max_BD = max(Buoyant_density),
BD_range = max_BD - min_BD,
BD_range_perc = BD_range / first(max_BD_range) * 100) %>%
ungroup()
df.EMP.j.f %>% head
In [29]:
%%R -h 400 -w 800
df.EMP.j.f$Day = reorder(df.EMP.j.f$Day, df.EMP.j.f$Day %>% as.numeric)
## plotting
ggplot(df.EMP.j.f, aes(mean_preFrac_abund, BD_range_perc, color=Day)) +
geom_point(alpha=0.5, shape='O') +
scale_x_log10() +
scale_y_continuous() +
facet_wrap(~ Day) +
labs(x='Pre-fractionation abundance', y='% of total BD range') +
theme_bw() +
theme(
text = element_text(size=16),
panel.grid = element_blank()#,
#legend.position = 'none'
)
In [55]:
%%R -h 400 -w 750
df.EMP.j.f$Day = reorder(df.EMP.j.f$Day, df.EMP.j.f$Day %>% as.numeric)
## plotting
p.span.hex = ggplot(df.EMP.j.f, aes(mean_preFrac_abund, BD_range_perc)) +
geom_hex() +
scale_x_log10() +
scale_y_continuous(limits=c(0,105)) +
scale_fill_gradient(low='grey95', high='black') +
facet_wrap(~ Day, ncol=3) +
labs(x='Pre-fractionation abundance', y='% of total BD range') +
theme_bw() +
theme(
text = element_text(size=16),
panel.grid = element_blank(),
legend.position = 'none'
)
p.span.hex
In [32]:
%%R -h 350 -w 550
## plotting
ggplot(df.EMP.j.f, aes(mean_preFrac_abund, BD_range_perc)) +
geom_hex() +
scale_x_log10() +
scale_y_continuous(limits=c(0,105)) +
scale_fill_gradient(low='grey95', high='black') +
labs(x='Pre-fractionation abundance', y='% of total BD range') +
theme_bw() +
theme(
text = element_text(size=16),
panel.grid = element_blank(),
legend.position = 'none'
)
In [33]:
%%R
# giving value to missing abundances
min.pos.val = df.EMP %>%
filter(abundance > 0) %>%
group_by() %>%
mutate(min_abund = min(abundance)) %>%
ungroup() %>%
filter(abundance == min_abund)
min.pos.val = min.pos.val[1,'abundance'] %>% as.numeric
imp.val = min.pos.val / 10
# convert numbers
#df.EMP[df.EMP$abundance == 0, 'abundance'] = imp.val
# another closure operation
df.EMP = df.EMP %>%
group_by(sample) %>%
mutate(abundance = abundance / sum(abundance),
Day = Day %>% as.character)
df.EMP$Day = reorder(df.EMP$Day, df.EMP$Day %>% as.numeric)
# status
cat('Below detection level abundances converted to: ', imp.val, '\n')
In [34]:
%%R
# shannon index
shannon_index_long = function(df, abundance_col, ...){
# calculating shannon diversity index from a 'long' formated table
## community_col = name of column defining communities
## abundance_col = name of column defining taxon abundances
## ... = group_by columns
df = df %>% as.data.frame
cmd = paste0(abundance_col, '/sum(', abundance_col, ')')
df.s = df %>%
group_by_(...) %>%
mutate_(REL_abundance = cmd) %>%
mutate(pi__ln_pi = REL_abundance * log(REL_abundance),
shannon = -sum(pi__ln_pi, na.rm=TRUE)) %>%
ungroup() %>%
dplyr::select(-REL_abundance, -pi__ln_pi) %>%
distinct_(...)
return(df.s)
}
In [35]:
%%R
# calulcating
df.EMP.shan = shannon_index_long(df.EMP, 'abundance', 'sample') %>%
filter(Buoyant_density >= min_BD,
Buoyant_density <= max_BD)
df.EMP.shan %>% head(n=3) %>% as.data.frame
In [39]:
%%R -w 800 -h 300
x.lab = expression(paste('Buoyant density (g ml' ^ '-1', ')'))
# plotting
p.shan = ggplot(df.EMP.shan, aes(Buoyant_density, shannon, color=Day, group=Day)) +
geom_point() +
geom_line(alpha=0.3) +
labs(x=x.lab, y='Shannon index') +
theme_bw() +
theme(
text = element_text(size=16)
)
p.shan
In [40]:
%%R -w 800 -h 300
# summary plot
df.EMP.shan.s = df.EMP.shan %>%
group_by(BD_bin = ntile(Buoyant_density, 24)) %>%
summarize(mean_BD = mean(Buoyant_density),
mean_shannon = mean(shannon),
sd_shannon = sd(shannon))
# plotting
p = ggplot(df.EMP.shan.s, aes(mean_BD, mean_shannon,
ymin=mean_shannon-sd_shannon,
ymax=mean_shannon+sd_shannon)) +
geom_pointrange() +
labs(x='Buoyant density (binned; 24 bins)', y='Mean Shannon index (+/- stdev)') +
theme_bw() +
theme(
text = element_text(size=16)
)
p
In [41]:
%%R -w 800 -h 350
df.EMP.var = df.EMP %>%
group_by(Day, sample) %>%
mutate(variance = var(abundance)) %>%
ungroup() %>%
distinct(Day, sample) %>%
select(Day, sample, variance, Buoyant_density)
ggplot(df.EMP.var, aes(Buoyant_density, variance, color=Day)) +
geom_point() +
geom_line() +
theme_bw() +
theme(
text = element_text(size=16)
)
In [42]:
%%R -w 800 -h 350
df.EMP.var.s = df.EMP.var %>%
group_by(BD_bin = ntile(Buoyant_density, 24)) %>%
summarize(mean_BD = mean(Buoyant_density),
mean_var = mean(variance),
sd_var = sd(variance))
ggplot(df.EMP.var.s, aes(mean_BD, mean_var,
ymin=mean_var-sd_var,
ymax=mean_var+sd_var)) +
geom_pointrange() +
labs(x='Buoyant density (binned; 24 bins)', y='Mean variance (+/- stdev)') +
theme_bw() +
theme(
text = element_text(size=16)
)
In [43]:
%%R
BD.diffs = function(BDs){
df.BD = expand.grid(BDs, BDs)
df.BD$diff = df.BD %>% apply(1, diff) %>% abs %>% as.vector
df.BD = df.BD %>%
spread(Var1, diff)
rownames(df.BD) = df.BD$Var2
df.BD$Var2 = NULL
dist.BD = df.BD %>% as.matrix
dist.BD[upper.tri(dist.BD, diag=TRUE)] = 0
dist.BD %>% as.dist
}
days = df.EMP$Day %>% unique
BDs.l = list()
for(d in days){
tmp = df.EMP %>% filter(Day == d)
BDs.l[[d]] = tmp$Buoyant_density %>% unique
}
BD.dist.l = lapply(BDs.l, BD.diffs)
BD.dist.l %>% names
In [44]:
%%R
vegdist.by = function(day, df.EMP, ...){
df.EMP.w = df.EMP %>%
filter(Day == day) %>%
select(OTU, sample, abundance) %>%
spread(OTU, abundance) %>%
as.data.frame()
rownames(df.EMP.w) = df.EMP.w$sample
df.EMP.w$sample = NULL
vegan::vegdist(df.EMP.w, ...)
}
days = df.EMP$Day %>% unique %>% sort
bray.l = list()
for (d in days){
bray.l[[d]] = vegdist.by(d, df.EMP, method='bray')
}
bray.l %>% names
In [45]:
%%R -w 800 -h 650
cal.corr = function(d, BDs, nclass=24, ...){
#d = vegdist(df, ...)
d[is.nan(d)] = 0
stopifnot((d %>% length) == (BDs %>% length))
mantel.correlog(d, BDs, n.class=nclass)
}
days = df.EMP$Day %>% unique %>% sort
corr.l = list()
for(d in days){
corr.l[[d]] = cal.corr(bray.l[[d]], BD.dist.l[[d]])
}
par(mfrow=c(3,3))
for (n in names(corr.l)){
plot(corr.l[[n]])
title(n)
}
In [46]:
%%R
# making df of mantel values
corr.l.mantel = list()
for (n in corr.l %>% names){
corr.l.mantel[[n]] = corr.l[[n]]['mantel.res'][[1]] %>% as.data.frame
corr.l.mantel[[n]]$Day = n
colnames(corr.l.mantel[[n]]) = c('class.index', 'n.dist', 'Mantel.corr', 'Pr', 'Pr.corr', 'Day')
}
df.corr = do.call(rbind, corr.l.mantel) %>% as.data.frame
df.corr %>% head
In [49]:
%%R -w 700 -h 250
df.corr.e = df.corr %>%
filter(! is.na(Mantel.corr)) #%>%
# mutate(Pr.corr = ifelse(is.na(Pr.corr), 1, Pr.corr) %>% as.numeric,
# Pr.corr.bool = ifelse(Pr.corr < 0.05, '<0.05', '>0.05'))
df.corr.e$Day = reorder(df.corr.e$Day, df.corr.e$Day %>% as.numeric)
p.corr = ggplot(df.corr.e, aes(class.index, Mantel.corr, color=Day)) +
geom_point(size=2) +
geom_line(alpha=0.5) +
labs(x='Class index', y='Mantel correlation') +
theme_bw() +
theme(
text = element_text(size=16)
)
p.corr
In [48]:
%%R -w 650 -h 300
# summarizing
df.corr.s = df.corr %>%
filter(! is.na(Mantel.corr)) %>%
group_by(bin = ntile(class.index, 12)) %>%
summarize(n = n(),
mean.class.index = mean(class.index),
mean.Mantel.corr = mean(Mantel.corr, na.rm=TRUE),
sd.Mantel.corr = sd(Mantel.corr, na.rm=TRUE),
max.Pr.corr = max(Pr.corr, na.rm=TRUE)) %>%
ungroup() %>%
mutate(significant = ifelse(max.Pr.corr <= 0.05, TRUE, FALSE))
# plotting
ggplot(df.corr.s, aes(mean.class.index, mean.Mantel.corr, color=significant,
ymin=mean.Mantel.corr-sd.Mantel.corr,
ymax=mean.Mantel.corr+sd.Mantel.corr)) +
geom_pointrange() +
scale_color_discrete('P_corr < 0.05') +
labs(x='Class index (binned; 12 bins)', y='Mean Mantel correlation\n(+/- stdev)') +
theme_bw() +
theme(
text = element_text(size=16)
)
In [68]:
%%R -w 600 -h 700
p.comb = cowplot::ggdraw() +
geom_rect(aes(xmin=0, ymin=0, xmax=1, ymax=1), fill='white') +
cowplot::draw_plot(p.shan, 0.04, 0.75, 0.96, 0.25) +
cowplot::draw_plot(p.corr, 0.04, 0.50, 0.96, 0.25) +
cowplot::draw_plot(p.span.hex, 0.04, 0.0, 0.93, 0.50) +
cowplot::draw_plot_label(c('A)', 'B)', 'C)'), c(0, 0, 0), c(1, 0.75, 0.5))
p.comb
In [69]:
%%R -i workDir
# saving plot
F = file.path(workDir, 'emp-data_summary.pdf')
ggsave(F, p.comb, width=8.5, height=10)
cat('File written:', F, '\n')
In [116]:
%%R
binary.l = list()
for (d in days){
binary.l[[d]] = vegdist.by(d, df.EMP, binary=TRUE)
}
binary.l %>% names
In [117]:
%%R -w 800 -h 650
days = df.EMP$Day %>% unique %>% sort
corr.l = list()
for(d in days){
corr.l[[d]] = cal.corr(binary.l[[d]], BD.dist.l[[d]])
}
par(mfrow=c(3,3))
for (n in names(corr.l)){
plot(corr.l[[n]])
title(n)
}
In [118]:
%%R
# making df of mantel values
corr.l.mantel = list()
for (n in corr.l %>% names){
corr.l.mantel[[n]] = corr.l[[n]]['mantel.res'][[1]] %>% as.data.frame
corr.l.mantel[[n]]$Day = n
colnames(corr.l.mantel[[n]]) = c('class.index', 'n.dist', 'Mantel.corr', 'Pr', 'Pr.corr', 'Day')
}
df.corr = do.call(rbind, corr.l.mantel) %>% as.data.frame
df.corr %>% head
In [119]:
%%R -w 650 -h 300
# summarizing
df.corr.s = df.corr %>%
filter(! is.na(Mantel.corr)) %>%
group_by(bin = ntile(class.index, 12)) %>%
summarize(n = n(),
mean.class.index = mean(class.index),
mean.Mantel.corr = mean(Mantel.corr, na.rm=TRUE),
sd.Mantel.corr = sd(Mantel.corr, na.rm=TRUE),
max.Pr.corr = max(Pr.corr, na.rm=TRUE)) %>%
ungroup() %>%
mutate(significant = ifelse(max.Pr.corr <= 0.05, TRUE, FALSE))
# plotting
ggplot(df.corr.s, aes(mean.class.index, mean.Mantel.corr, color=significant,
ymin=mean.Mantel.corr-sd.Mantel.corr,
ymax=mean.Mantel.corr+sd.Mantel.corr)) +
geom_pointrange() +
scale_color_discrete('P_corr < 0.05') +
labs(x='Class index (binned; 12 bins)', y='Mean Mantel correlation\n(+/- stdev)') +
theme_bw() +
theme(
text = element_text(size=16)
)
In [161]:
%%R -w 800
df.EMP.f = df.EMP %>%
filter(Buoyant_density > 1.72) %>%
group_by(Day, sample) %>%
mutate(rank = row_number(-abundance)) %>%
ungroup() %>%
filter(rank <= 5)
ggplot(df.EMP.f, aes(Buoyant_density, abundance, fill=OTU, color=OTU)) +
geom_line(alpha=0.5, size=1.2) +
labs(x='Buoyant density', y='Subsampled community\n(absolute abundance)',
title='Abundant taxa in the heavy fractions') +
facet_grid(Day ~ .) +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none',
axis.title.y = element_text(vjust=1),
axis.title.x = element_blank()
)
In [162]:
%%R -w 800
df.EMP.f2 = df.EMP %>%
filter(OTU %in% df.EMP.f$OTU)
ggplot(df.EMP.f2, aes(Buoyant_density, abundance, fill=OTU, color=OTU)) +
geom_line(alpha=0.5, size=1.2) +
labs(x='Buoyant density', y='Subsampled community\n(absolute abundance)',
title='Abundant taxa in the heavy fractions') +
facet_grid(Day ~ .) +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none',
axis.title.y = element_text(vjust=1),
axis.title.x = element_blank()
)
In [163]:
%%R -w 800
ggplot(df.EMP, aes(Buoyant_density, abundance, fill=OTU, color=OTU)) +
geom_line(alpha=0.5, size=1.2) +
labs(x='Buoyant density', y='Subsampled community\n(absolute abundance)',
title='All taxa in the trimmed 12C-Con fullCyc dataset') +
facet_grid(Day ~ .) +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none',
axis.title.y = element_text(vjust=1),
axis.title.x = element_blank()
)