In [39]:
workDir = '/ebio/abt3_projects/methanogen_host_evo/SIPSim_pt2/data/bac_genome1147/atomIncorp_fracSize/'
In [40]:
library(dplyr)
library(tidyr)
library(ggplot2)
as.Num = function(x) x %>% as.character %>% as.numeric
In [41]:
# files on simulation accuracy
files = list.files(path=workDir, pattern='fracs.txt', full.names=TRUE, recursive=TRUE)
files %>% length
In [42]:
df_fracs = list()
for (f in files){
ff = strsplit(f, '/') %>% unlist
df = read.delim(f, sep='\t')
df$rep = ff[length(ff)-1]
df$frac_mu = ff[length(ff)-2]
df$atomIncorp = ff[length(ff)-3]
df_fracs[[f]] = df
}
df_fracs = do.call(rbind, df_fracs)
rownames(df_fracs) = 1:nrow(df_fracs)
# status
df_fracs %>% dim %>% print
df_fracs %>% head(n=3)
In [43]:
## min G+C cutoff
min_GC = 13.5
## max G+C cutoff
max_GC = 80
## max G+C shift
max_13C_shift_in_BD = 0.036
min_BD = min_GC/100.0 * 0.098 + 1.66
max_BD = max_GC/100.0 * 0.098 + 1.66
max_BD = max_BD + max_13C_shift_in_BD
cat('Min BD:', min_BD, '\n')
cat('Max BD:', max_BD, '\n')
In [44]:
# filtering to just within min-max BD
df_fracs %>% nrow %>% print
df_fracs = df_fracs %>%
filter(BD_min >= min_BD,
BD_max <= max_BD)
df_fracs %>% nrow %>% print
In [58]:
# plotting
options(repr.plot.width=4, repr.plot.height=3)
p_size = ggplot(df_fracs, aes(frac_mu, fraction_size)) +
geom_boxplot() +
labs(x='Mean fraction size (g ml^-1)', y='Fraction size (g ml^-1)') +
theme_bw()
plot(p_size)
In [59]:
df_fracs_s = df_fracs %>%
mutate(frac_mu = frac_mu %>% as.character) %>%
group_by(library, rep, frac_mu, atomIncorp) %>%
summarize(n_fracs = fraction %>% unique %>% length) %>%
ungroup() %>%
mutate(frac_mu = frac_mu %>% as.character)
# plotting
options(repr.plot.width=4, repr.plot.height=3)
p_fracs = ggplot(df_fracs_s, aes(frac_mu, n_fracs)) +
geom_boxplot() +
labs(x='Mean fraction size (g ml^-1)', y='Number of fractions\nper gradient') +
theme_bw()
plot(p_fracs)
In [72]:
options(repr.plot.width=8, repr.plot.height=3)
p_size_fracs = cowplot::plot_grid(p_size, p_fracs, labels = c("A)", "B)"))
plot(p_size_fracs)
In [73]:
outF = file.path(workDir, 'atomIncorp_fracSize_nFracs.pdf')
ggsave(outF, p_size_fracs, width=8, height=3)
cat('File written:', outF, '\n')
In [74]:
outF = file.path(workDir, 'atomIncorp_fracSize_nFracs.jpeg')
ggsave(outF, p_size_fracs, width=8, height=3)
cat('File written:', outF, '\n')
In [63]:
# files on simulation accuracy
files = list.files(path=workDir, pattern='*-cMtx_byClass.txt', full.names=TRUE)
files
In [64]:
# combining files
df_byClass = list()
for (f in files){
ff = strsplit(f, '/') %>% unlist
fff = ff[length(ff)]
df_byClass[[fff]] = read.delim(f, sep='\t')
}
df_byClass = do.call(rbind, df_byClass)
df_byClass$file = gsub('\\.[0-9]+$', '', rownames(df_byClass))
df_byClass$method = gsub('-.+', '', df_byClass$file)
rownames(df_byClass) = 1:nrow(df_byClass)
df_byClass %>% head(n=3)
In [65]:
# renaming methods
rename = data.frame(method = c('DESeq2', 'DESeq2_multi', 'heavy', 'qSIP'),
method_new = c('HR-SIP', 'MW-HR-SIP', 'Heavy-SIP', 'q-SIP'))
df_byClass = inner_join(df_byClass, rename, c('method'='method')) %>%
select(-method) %>%
rename('method' = method_new)
df_byClass %>% dim %>% print
df_byClass %>% head(n=3)
In [66]:
# adding number of fractions per gradient
## mean n-fracs
df_fracs_s = df_fracs %>%
mutate(frac_mu = frac_mu %>% as.character) %>%
group_by(library, rep, frac_mu, atomIncorp) %>%
summarize(n_fracs = fraction %>% unique %>% length) %>%
group_by(frac_mu) %>%
summarize(n_fracs = mean(n_fracs)) %>%
ungroup() %>%
mutate(frac_mu = frac_mu %>% as.character)
## joining
df_byClass = df_byClass %>%
inner_join(df_fracs_s %>%
dplyr::select(frac_mu, n_fracs) %>%
mutate(frac_mu = frac_mu %>% as.Num),
c('frac_mu' = 'frac_mu'))
df_byClass %>% dim %>% print
df_byClass %>% head(n=3)
In [67]:
# summarize by SIPSim rep & library rep
df_byClass.s = df_byClass %>%
group_by(method, percIncorp, frac_mu, n_fracs, variables) %>%
summarize(mean_value = mean(values, na.rm=TRUE),
sd_value = sd(values, na.rm=TRUE))
# plotting
options(repr.plot.width=8, repr.plot.height=5)
p = ggplot(df_byClass.s, aes(variables, mean_value, color=method,
ymin=mean_value-sd_value,
ymax=mean_value+sd_value)) +
geom_pointrange(alpha=0.8, size=0.2) +
labs(y='Value') +
facet_grid(frac_mu ~ percIncorp) +
theme_bw() +
theme(
axis.title.x = element_blank(),
axis.text.x = element_text(angle=65, hjust=1)
)
plot(p)
In [68]:
# summarize by SIPSim rep & library rep
vars = c('Balanced Accuracy', 'Sensitivity', 'Specificity')
df_byClass.s.f = df_byClass.s %>%
filter(variables %in% vars) %>%
ungroup() %>%
mutate(frac_mu = frac_mu %>% as.character,
frac_mu = frac_mu %>% reorder(frac_mu %>% as.Num),
n_fracs = round(n_fracs, 0) %>% as.character)
# plotting
options(repr.plot.width=9, repr.plot.height=5)
p.pnt1 = ggplot(df_byClass.s.f, aes(percIncorp, mean_value,
color=frac_mu,
group=frac_mu,
ymin=mean_value-sd_value,
ymax=mean_value+sd_value)) +
geom_point(alpha=0.8) +
geom_linerange(alpha=0.8, size=0.5) +
geom_line() +
scale_color_discrete('Mean fraction\nsize (g ml^-1)') +
labs(x='atom % excess 13C') +
facet_grid(variables ~ method) +
theme_bw() +
theme(
axis.title.y = element_blank()
)
plot(p.pnt1)
In [69]:
# summarize by SIPSim rep & library rep
vars = c('Balanced Accuracy', 'Sensitivity', 'Specificity')
df_byClass.s.f = df_byClass.s %>%
filter(variables %in% vars) %>%
ungroup() %>%
mutate(frac_mu = frac_mu %>% as.character,
frac_mu = frac_mu %>% reorder(frac_mu %>% as.Num),
n_fracs = round(n_fracs, 0) %>% as.character,
n_fracs = n_fracs %>% reorder(n_fracs %>% as.Num))
# plotting
options(repr.plot.width=9, repr.plot.height=5)
p.pnt2 = ggplot(df_byClass.s.f, aes(percIncorp, mean_value,
color=n_fracs,
group=n_fracs,
ymin=mean_value-sd_value,
ymax=mean_value+sd_value)) +
geom_point(alpha=0.8) +
geom_linerange(alpha=0.8, size=0.5) +
geom_line() +
scale_color_discrete('Number of\nfractions\nper gradient') +
labs(x='atom % excess 13C') +
facet_grid(variables ~ method) +
theme_bw() +
theme(
axis.title.y = element_blank()
)
plot(p.pnt2)
In [70]:
outF = file.path(workDir, 'atomIncorp_fracSize_acc.pdf')
ggsave(outF, p.pnt1, width=10, height=6)
cat('File written:', outF, '\n')
In [71]:
outF = file.path(workDir, 'atomIncorp_fracSize_acc.jpeg')
ggsave(outF, p.pnt1, width=10, height=6)
cat('File written:', outF, '\n')
In [18]:
BDshift_files = list.files(path=workDir, pattern='BD-shift_stats.txt', full.names=TRUE, recursive=TRUE)
BDshift_files %>% length %>% print
In [19]:
df_shift = list()
for(F in BDshift_files){
tmp = read.delim(F, sep='\t')
FF = strsplit(F, '/') %>% unlist
FFl = length(FF)
tmp$percIncorp = FF[FFl-3]
tmp$frac_mu = FF[FFl-2]
tmp$rep = FF[FFl-1]
tmp$file = FF[FFl]
df_shift[[F]] = tmp
}
df_shift = do.call(rbind, df_shift)
rownames(df_shift) = 1:nrow(df_shift)
df_shift = df_shift %>%
filter(library %in% c(2,4,6)) %>%
group_by(taxon, percIncorp, frac_mu, rep) %>%
summarize(median = median(median)) %>%
ungroup() %>%
rename('median_true_BD_shift' = median)
# status
df_shift %>% nrow %>% print
df_shift %>% head(n=3)
In [20]:
incorp_files = list.files(path=workDir, pattern='OTU_abs1e9_PCR_sub_filt_MW_DESeq2_incorp.txt', full.names=TRUE, recursive=TRUE)
incorp_files %>% length %>% print
In [21]:
df_incorp = list()
for(F in incorp_files){
tmp = read.delim(F, sep='\t')
FF = strsplit(F, '/') %>% unlist
FFl = length(FF)
tmp$percIncorp = FF[FFl-3]
tmp$frac_mu = FF[FFl-2]
tmp$rep = FF[FFl-1]
tmp$file = FF[FFl]
df_incorp[[F]] = tmp
}
df_incorp = do.call(rbind, df_incorp)
rownames(df_incorp) = 1:nrow(df_incorp)
df_incorp %>% head(n=3) %>% print
In [22]:
# just incorporators
df_incorp = df_incorp %>%
filter(incorp == TRUE) %>%
dplyr::distinct(taxon, incorp, percIncorp, frac_mu, rep) %>%
rename('HWHRSIP_incorp' = incorp)
df_incorp %>% nrow %>% print
df_incorp %>% head(n=3) %>% print
In [23]:
atomX_files = list.files(path=workDir, pattern='*_qSIP_atom.txt', full.names=TRUE, recursive=TRUE)
atomX_files %>% length %>% print
In [24]:
df_atomX = list()
for(F in atomX_files){
tmp = read.delim(F, sep='\t')
FF = strsplit(F, '/') %>% unlist
FFl = length(FF)
tmp$percIncorp = FF[FFl-3]
tmp$frac_mu = FF[FFl-2]
tmp$rep = FF[FFl-1]
tmp$file = FF[FFl]
df_atomX[[F]] = tmp
}
df_atomX = do.call(rbind, df_atomX)
rownames(df_atomX) = 1:nrow(df_atomX)
df_atomX %>% head(n=3) %>% print
In [25]:
# table join
df_atomX %>% nrow %>% print
df.j = left_join(df_atomX, df_shift, c('taxon' = 'taxon',
'percIncorp'='percIncorp',
'frac_mu'='frac_mu',
'rep'='rep')) %>%
filter(!is.na(BD_diff)) %>%
mutate(true_incorporator = ifelse(median_true_BD_shift > 0.002, TRUE, FALSE),
true_atom_fraction_excess = median_true_BD_shift / 0.036,
atom_fraction_excess = ifelse(is.na(atom_CI_low), 0, atom_fraction_excess))
df.j %>% nrow %>% print
df.j %>% head(n=3) %>% print
In [26]:
df.j$true_incorporator %>% summary
In [27]:
df.j = left_join(df.j, df_incorp, c('taxon' = 'taxon',
'percIncorp'='percIncorp',
'frac_mu'='frac_mu',
'rep'='rep')) %>%
mutate(HWHRSIP_incorp = ifelse(is.na(HWHRSIP_incorp), FALSE, TRUE))
df.j %>% nrow %>% print
df.j %>% head(n=3) %>% print
In [28]:
# difference between true and estimated
## q-SIP incorporators
df.j.dis.qSIP = df.j %>%
filter(atom_CI_low > 0) %>% # just incorporators identified by q-SIP
mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100) %>%
group_by(percIncorp, frac_mu) %>%
summarize(mean_delta_excess = mean(delta_excess),
sd_delta_excess = sd(delta_excess)) %>%
ungroup() %>%
mutate(percIncorp = percIncorp %>% reorder(percIncorp %>% as.numeric),
frac_mu = frac_mu %>% reorder(frac_mu %>% as.numeric))
# plotting
options(repr.plot.width=6, repr.plot.height=3)
p_qSIP = ggplot(df.j.dis.qSIP, aes(percIncorp, mean_delta_excess,
color=frac_mu, group=frac_mu,
ymin=mean_delta_excess-sd_delta_excess,
ymax=mean_delta_excess+sd_delta_excess)) +
geom_linerange(alpha=0.4, size=1) +
geom_point() +
geom_line() +
#facet_grid(true_incorporator ~ .) +
scale_color_discrete('Mean fraction\nsize (g ml^-1)') +
labs(x='13C atom % excess (truth)',
y='13C atom % excess\n(truth - estimate)') +
theme_bw()
p_qSIP
In [29]:
# difference between true and estimated
## q-SIP incorporators
df.j.dis.qSIP = df.j %>%
filter(atom_CI_low > 0) %>% # just incorporators identified by q-SIP
filter(HWHRSIP_incorp == TRUE) %>% # just MW-HR-SIP incorporators
mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100) %>%
group_by(percIncorp, frac_mu) %>%
summarize(mean_delta_excess = mean(delta_excess),
sd_delta_excess = sd(delta_excess)) %>%
ungroup() %>%
mutate(percIncorp = percIncorp %>% reorder(percIncorp %>% as.numeric),
frac_mu = frac_mu %>% reorder(frac_mu %>% as.numeric))
# plotting
options(repr.plot.width=6, repr.plot.height=2.5)
p_qSIP = ggplot(df.j.dis.qSIP, aes(percIncorp, mean_delta_excess,
color=frac_mu, group=frac_mu,
ymin=mean_delta_excess-sd_delta_excess,
ymax=mean_delta_excess+sd_delta_excess)) +
geom_linerange(alpha=0.4, size=1) +
geom_point() +
geom_line() +
#facet_grid(true_incorporator ~ .) +
scale_color_discrete('Mean fraction\nsize (g ml^-1)') +
labs(x='13C atom % excess (truth)',
y='13C atom % excess\n(truth - estimate)') +
theme_bw()
p_qSIP
In [30]:
# difference between true and estimated
## q-SIP incorporators
tmp1 = df.j %>%
filter(atom_CI_low > 0) %>% # just incorporators identified by q-SIP
mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100) %>%
group_by(percIncorp, frac_mu) %>%
summarize(mean_delta_excess = mean(delta_excess),
sd_delta_excess = sd(delta_excess)) %>%
ungroup() %>%
mutate(percIncorp = percIncorp %>% reorder(percIncorp %>% as.numeric),
frac_mu = frac_mu %>% reorder(frac_mu %>% as.numeric),
incorp_called = 'No filter')
## MW-HR-SIP incorporators
tmp2 = df.j %>%
filter(atom_CI_low > 0) %>% # just incorporators identified by q-SIP
filter(HWHRSIP_incorp == TRUE) %>% # just MW-HR-SIP incorporators
mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100) %>%
group_by(percIncorp, frac_mu) %>%
summarize(mean_delta_excess = mean(delta_excess),
sd_delta_excess = sd(delta_excess)) %>%
ungroup() %>%
mutate(percIncorp = percIncorp %>% reorder(percIncorp %>% as.numeric),
frac_mu = frac_mu %>% reorder(frac_mu %>% as.numeric),
incorp_called = 'MW-HR-SIP filter')
# combining tables
df.j.dis.qSIP = rbind(tmp1, tmp2) %>%
mutate(incorp_called = factor(incorp_called, levels=c('No filter', 'MW-HR-SIP filter')))
# plotting
options(repr.plot.width=6, repr.plot.height=4)
p_qSIP = ggplot(df.j.dis.qSIP, aes(percIncorp, mean_delta_excess,
color=frac_mu, group=frac_mu,
ymin=mean_delta_excess-sd_delta_excess,
ymax=mean_delta_excess+sd_delta_excess)) +
geom_line() +
geom_linerange(alpha=0.4, size=1) +
geom_point() +
facet_grid(incorp_called ~ .) +
scale_color_discrete('Mean fraction\nsize (g ml^-1)') +
labs(x='13C atom % excess (truth)',
y='13C atom % excess\n(truth - estimate)') +
theme_bw()
p_qSIP
In [31]:
dBD_files = list.files(path=workDir, pattern='*_dBD.txt', full.names=TRUE, recursive=TRUE)
dBD_files %>% length %>% print
In [32]:
df_dBD = list()
for(F in dBD_files){
tmp = read.delim(F, sep='\t')
FF = strsplit(F, '/') %>% unlist
FFl = length(FF)
tmp$percIncorp = FF[FFl-3]
tmp$frac_mu = FF[FFl-2]
tmp$rep = FF[FFl-1]
tmp$file = FF[FFl]
df_dBD[[F]] = tmp
}
df_dBD = do.call(rbind, df_dBD)
rownames(df_dBD) = 1:nrow(df_dBD)
df_dBD %>% head(n=3) %>% print
In [33]:
df.j = inner_join(df_dBD, df_shift, c('taxon' = 'taxon',
'percIncorp'='percIncorp',
'frac_mu'='frac_mu',
'rep'='rep')) %>%
filter(!is.na(delta_BD)) %>%
mutate(true_incorporator = ifelse(median_true_BD_shift > 0.002, TRUE, FALSE),
true_atom_fraction_excess = median_true_BD_shift / 0.036,
atom_fraction_excess = delta_BD / 0.036)
df.j %>% head(n=3)
In [34]:
df.j = left_join(df.j, df_incorp, c('taxon' = 'taxon',
'percIncorp'='percIncorp',
'frac_mu'='frac_mu',
'rep'='rep')) %>%
mutate(HWHRSIP_incorp = ifelse(is.na(HWHRSIP_incorp), FALSE, TRUE))
df.j %>% nrow %>% print
df.j %>% head(n=3)
In [35]:
# difference between true and estimated
tmp1 = df.j %>%
mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100) %>%
group_by(percIncorp, frac_mu) %>%
summarize(mean_delta_excess = mean(delta_excess),
sd_delta_excess = sd(delta_excess)) %>%
ungroup() %>%
mutate(percIncorp = percIncorp %>% reorder(percIncorp %>% as.numeric),
frac_mu = frac_mu %>% reorder(frac_mu %>% as.numeric),
incorp_called = 'No filter')
tmp2 = df.j %>%
filter(HWHRSIP_incorp == TRUE) %>% # just MW-HR-SIP incorporators
mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100) %>%
group_by(percIncorp, frac_mu) %>%
summarize(mean_delta_excess = mean(delta_excess),
sd_delta_excess = sd(delta_excess)) %>%
ungroup() %>%
mutate(percIncorp = percIncorp %>% reorder(percIncorp %>% as.numeric),
frac_mu = frac_mu %>% reorder(frac_mu %>% as.numeric),
incorp_called = 'MW-HR-SIP filter')
# combining tables
df.j.dis.dBD = rbind(tmp1, tmp2) %>%
mutate(incorp_called = factor(incorp_called, levels=c('No filter', 'MW-HR-SIP filter')))
# plotting
options(repr.plot.width=8, repr.plot.height=4)
p_dBD = ggplot(df.j.dis.dBD, aes(percIncorp, mean_delta_excess,
color=frac_mu, group=frac_mu,
ymin=mean_delta_excess-sd_delta_excess,
ymax=mean_delta_excess+sd_delta_excess)) +
geom_line() +
geom_linerange(alpha=0.4, size=1) +
geom_point() +
facet_grid(incorp_called ~ .) +
scale_color_discrete('Mean fraction\nsize (g ml^-1)') +
labs(x='13C atom % excess (truth)',
y='13C atom % excess\n(truth - estimate)') +
theme_bw()
p_dBD
In [36]:
df.jj = rbind(df.j.dis.qSIP %>% mutate(method='qSIP'),
df.j.dis.dBD %>% mutate(method='Delta BD')) %>%
mutate(method = gsub('qSIP', 'q-SIP', method))
p.comb = ggplot(df.jj, aes(percIncorp, mean_delta_excess,
color=frac_mu, group=frac_mu,
ymin=mean_delta_excess-sd_delta_excess,
ymax=mean_delta_excess+sd_delta_excess)) +
geom_line() +
geom_linerange(alpha=0.5, size=1) +
geom_point() +
scale_color_discrete('Mean fraction\nsize (g ml^-1)') +
labs(x='atom % excess 13C (truth)',
y='atom % excess 13C\n(truth - estimate)') +
facet_grid(incorp_called ~ method) +
theme_bw()
p.comb
In [37]:
outF = file.path(workDir, 'atomIncorp_fracSize_BDshift.pdf')
ggsave(outF, p.comb, width=8, height=4)
cat('File written:', outF, '\n')
In [38]:
outF = file.path(workDir, 'atomIncorp_fracSize_BDshift.jpeg')
ggsave(outF, p.comb, width=8, height=4)
cat('File written:', outF, '\n')
In [ ]: