In [1]:
workDir = '/ebio/abt3_projects/methanogen_host_evo/SIPSim_pt2/data/bac_genome1147/microBetaDiv/'
In [2]:
library(dplyr)
library(tidyr)
library(ggplot2)
as.Num = function(x) x %>% as.character %>% as.numeric
In [3]:
# classifying true positives, neg, ...
clsfy = function(guess,known){
if(is.na(guess) | is.na(known)){
return(NA)
}
if(guess == TRUE){
if(guess == known){
return('True positive')
} else {
return('False positive')
}
} else
if(guess == FALSE){
if(guess == known){
return('True negative')
} else {
return('False negative')
}
} else {
stop('Error: true or false needed')
}
}
In [8]:
BDshift_files = list.files(path=workDir, pattern='BD-shift_stats.txt', full.names=TRUE, recursive=TRUE)
BDshift_files %>% length %>% print
In [9]:
df_shift = list()
for(F in BDshift_files){
tmp = read.delim(F, sep='\t')
FF = strsplit(F, '/') %>% unlist
FFl = length(FF)
tmp$shared_perc = FF[FFl-3]
tmp$perm_perc = FF[FFl-2]
tmp$rep = FF[FFl-1]
tmp$file = FF[FFl]
df_shift[[F]] = tmp
}
df_shift = do.call(rbind, df_shift)
rownames(df_shift) = 1:nrow(df_shift)
df_shift = df_shift %>%
filter(library %in% c(2,4,6)) %>%
group_by(taxon, shared_perc, perm_perc, rep) %>%
summarize(median = median(median)) %>%
ungroup() %>%
rename('median_true_BD_shift' = median)
# status
df_shift %>% nrow %>% print
df_shift %>% head(n=3)
In [10]:
incorp_files = list.files(path=workDir, pattern='OTU_abs1e9_PCR_sub_filt_MW_DESeq2_incorp.txt', full.names=TRUE, recursive=TRUE)
incorp_files %>% length %>% print
In [11]:
df_incorp = list()
for(F in incorp_files){
tmp = read.delim(F, sep='\t')
FF = strsplit(F, '/') %>% unlist
FFl = length(FF)
tmp$shared_perc = FF[FFl-3]
tmp$perm_perc = FF[FFl-2]
tmp$rep = FF[FFl-1]
tmp$file = FF[FFl]
df_incorp[[F]] = tmp
}
df_incorp = do.call(rbind, df_incorp)
rownames(df_incorp) = 1:nrow(df_incorp)
df_incorp %>% head(n=3) %>% print
In [12]:
# just incorporators
df_incorp = df_incorp %>%
filter(incorp == TRUE) %>%
dplyr::distinct(taxon, incorp, shared_perc, perm_perc, rep) %>%
rename('HWHRSIP_incorp' = incorp)
df_incorp %>% nrow %>% print
df_incorp %>% head(n=3) %>% print
In [23]:
atomX_files = list.files(path=workDir, pattern='*_qSIP_atom.txt', full.names=TRUE, recursive=TRUE)
atomX_files %>% length %>% print
In [24]:
df_atomX = list()
for(F in atomX_files){
tmp = read.delim(F, sep='\t')
FF = strsplit(F, '/') %>% unlist
FFl = length(FF)
tmp$shared_perc = FF[FFl-3]
tmp$perm_perc = FF[FFl-2]
tmp$rep = FF[FFl-1]
tmp$file = FF[FFl]
df_atomX[[F]] = tmp
}
df_atomX = do.call(rbind, df_atomX)
rownames(df_atomX) = 1:nrow(df_atomX)
df_atomX %>% head(n=3) %>% print
In [25]:
# table join
df_atomX %>% nrow %>% print
df_atomX_j = left_join(df_atomX, df_shift, c('taxon' = 'taxon',
'shared_perc'='shared_perc',
'perm_perc'='perm_perc',
'rep'='rep')) %>%
filter(!is.na(BD_diff)) %>%
mutate(true_incorporator = ifelse(median_true_BD_shift > 0.002, TRUE, FALSE),
true_atom_fraction_excess = median_true_BD_shift / 0.036,
atom_fraction_excess = ifelse(is.na(atom_CI_low), 0, atom_fraction_excess))
df_atomX_j %>% nrow %>% print
df_atomX_j %>% head(n=3) %>% print
In [27]:
df_atomX_j$true_incorporator %>% summary
In [28]:
df_atomX_j = left_join(df_atomX_j, df_incorp, c('taxon' = 'taxon',
'shared_perc'='shared_perc',
'perm_perc'='perm_perc',
'rep'='rep')) %>%
mutate(HWHRSIP_incorp = ifelse(is.na(HWHRSIP_incorp), FALSE, TRUE))
df_atomX_j %>% nrow %>% print
df_atomX_j %>% head(n=3) %>% print
In [29]:
dBD_files = list.files(path=workDir, pattern='*_dBD.txt', full.names=TRUE, recursive=TRUE)
dBD_files %>% length %>% print
In [30]:
df_dBD = list()
for(F in dBD_files){
tmp = read.delim(F, sep='\t')
FF = strsplit(F, '/') %>% unlist
FFl = length(FF)
tmp$shared_perc = FF[FFl-3]
tmp$perm_perc = FF[FFl-2]
tmp$rep = FF[FFl-1]
tmp$file = FF[FFl]
df_dBD[[F]] = tmp
}
df_dBD = do.call(rbind, df_dBD)
rownames(df_dBD) = 1:nrow(df_dBD)
df_dBD %>% head(n=3) %>% print
In [31]:
df_dBD_j = inner_join(df_dBD, df_shift, c('taxon' = 'taxon',
'shared_perc'='shared_perc',
'perm_perc'='perm_perc',
'rep'='rep')) %>%
filter(!is.na(delta_BD)) %>%
mutate(true_incorporator = ifelse(median_true_BD_shift > 0.002, TRUE, FALSE),
true_atom_fraction_excess = median_true_BD_shift / 0.036,
atom_fraction_excess = delta_BD / 0.036)
df_dBD_j %>% head(n=3)
In [32]:
df_dBD_j = left_join(df_dBD_j, df_incorp, c('taxon' = 'taxon',
'shared_perc'='shared_perc',
'perm_perc'='perm_perc',
'rep'='rep')) %>%
mutate(HWHRSIP_incorp = ifelse(is.na(HWHRSIP_incorp), FALSE, TRUE))
df_dBD_j %>% nrow %>% print
df_dBD_j %>% head(n=3)
In [33]:
# q-SIP
df_atomX_j_true = df_atomX_j %>%
filter(true_incorporator == TRUE) %>% # just incorporators identified by q-SIP
mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100) %>%
group_by(shared_perc, perm_perc, true_incorporator) %>%
summarize(mean_delta_excess = mean(delta_excess),
sd_delta_excess = sd(delta_excess)) %>%
ungroup() %>%
mutate(shared_perc = shared_perc %>% reorder(shared_perc %>% as.numeric),
perm_perc = perm_perc %>% reorder(perm_perc %>% as.numeric),
method = 'q-SIP')
df_atomX_j_true %>% head(n=3)
In [34]:
# delta-BD
df_dBD_j_true = df_dBD_j %>%
mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100) %>%
filter(true_incorporator == TRUE) %>%
group_by(shared_perc, perm_perc, true_incorporator) %>%
summarize(mean_delta_excess = mean(delta_excess),
sd_delta_excess = sd(delta_excess)) %>%
ungroup() %>%
mutate(shared_perc = shared_perc %>% reorder(shared_perc %>% as.numeric),
perm_perc = perm_perc %>% reorder(perm_perc %>% as.numeric),
method = 'delta BD')
df_dBD_j_true %>% head(n=3)
In [35]:
# plotting
tmp = rbind(df_atomX_j_true, df_dBD_j_true)
options(repr.plot.width=8, repr.plot.height=2.5)
p_BDshift_true = ggplot(tmp, aes(shared_perc, mean_delta_excess,
color=perm_perc, group=perm_perc,
ymin=mean_delta_excess-sd_delta_excess,
ymax=mean_delta_excess+sd_delta_excess)) +
geom_line() +
geom_linerange(alpha=0.4, size=1) +
geom_point() +
scale_color_discrete('% of rank\nabundances\npermuted') +
facet_grid(. ~ method) +
labs(x='% taxa shared among pre-fractionation communities',
y='13C atom % excess\n(truth - estimate)') +
theme_bw()
plot(p_BDshift_true)
In [38]:
# q-SIP
tmp_atomX = df_atomX_j %>%
filter(perm_perc == '10',
true_incorporator == TRUE,
atom_fraction_excess > 0) %>%
mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100,
shared_perc = shared_perc %>% reorder(shared_perc %>% as.numeric),
perm_perc = perm_perc %>% reorder(perm_perc %>% as.numeric),
atom_perc_excess = atom_fraction_excess * 100,
true_atom_perc_excess = true_atom_fraction_excess * 100) %>%
dplyr::select(atom_perc_excess, true_atom_perc_excess, shared_perc, perm_perc) %>%
mutate(method = 'q-SIP')
# status
tmp_atomX %>% head(n=3)
In [40]:
# delta BD
tmp_dBD = df_dBD_j %>%
filter(perm_perc == '10',
true_incorporator == TRUE,
atom_fraction_excess > 0) %>%
mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100,
shared_perc = shared_perc %>% reorder(shared_perc %>% as.numeric),
perm_perc = perm_perc %>% reorder(perm_perc %>% as.numeric),
atom_perc_excess = atom_fraction_excess * 100,
true_atom_perc_excess = true_atom_fraction_excess * 100) %>%
dplyr::select(atom_perc_excess, true_atom_perc_excess, shared_perc, perm_perc) %>%
mutate(method = 'delta BD')
# status
tmp_dBD %>% head(n=3)
In [45]:
# for vline
df_j = rbind(tmp_atomX, tmp_dBD)
tmp = df_j %>%
distinct(shared_perc, method, .keep_all=TRUE) %>%
mutate(true_atom_perc_excess = true_atom_perc_excess %>% as.character %>% as.numeric)
# plotting
options(repr.plot.width=10, repr.plot.height=3)
p_dens_true = ggplot(df_j, aes(atom_perc_excess)) +
geom_density(alpha=0.5, fill='grey70') +
geom_vline(data=tmp, aes(xintercept=true_atom_perc_excess), linetype='dashed', alpha=0.4) +
scale_x_continuous(limits=c(-20, 120)) +
labs(x='atom % excess 13C (estimate)', y='Probability density') +
facet_grid(method ~ shared_perc) +
theme_bw()
plot(p_dens_true)
In [ ]:
In [ ]:
In [94]:
# difference between true and estimated
## q-SIP incorporators
df.j.dis.qSIP = df.j %>%
filter(atom_CI_low > 0) %>% # just incorporators identified by q-SIP
mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100) %>%
group_by(shared_perc, perm_perc) %>%
summarize(mean_delta_excess = mean(delta_excess),
sd_delta_excess = sd(delta_excess)) %>%
ungroup() %>%
mutate(shared_perc = shared_perc %>% reorder(shared_perc %>% as.numeric),
perm_perc = perm_perc %>% reorder(perm_perc %>% as.numeric))
# plotting
options(repr.plot.width=6, repr.plot.height=3)
p_qSIP = ggplot(df.j.dis.qSIP, aes(shared_perc, mean_delta_excess,
color=perm_perc, group=perm_perc,
ymin=mean_delta_excess-sd_delta_excess,
ymax=mean_delta_excess+sd_delta_excess)) +
geom_linerange(alpha=0.4, size=1) +
geom_point() +
geom_line() +
#facet_grid(true_incorporator ~ .) +
scale_color_discrete('% incorp-\norators') +
labs(x='% taxa shared among pre-fractionation communities',
y='13C atom % excess\n(truth - estimate)') +
theme_bw()
p_qSIP
In [95]:
# difference between true and estimated
## q-SIP incorporators
df.j.dis.qSIP = df.j %>%
filter(atom_CI_low > 0) %>% # just incorporators identified by q-SIP
filter(HWHRSIP_incorp == TRUE) %>% # just MW-HR-SIP incorporators
mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100) %>%
group_by(shared_perc, perm_perc) %>%
summarize(mean_delta_excess = mean(delta_excess),
sd_delta_excess = sd(delta_excess)) %>%
ungroup() %>%
mutate(shared_perc = shared_perc %>% reorder(shared_perc %>% as.numeric),
perm_perc = perm_perc %>% reorder(perm_perc %>% as.numeric))
# plotting
options(repr.plot.width=6, repr.plot.height=2.5)
p_qSIP = ggplot(df.j.dis.qSIP, aes(shared_perc, mean_delta_excess,
color=perm_perc, group=perm_perc,
ymin=mean_delta_excess-sd_delta_excess,
ymax=mean_delta_excess+sd_delta_excess)) +
geom_linerange(alpha=0.4, size=1) +
geom_point() +
geom_line() +
#facet_grid(true_incorporator ~ .) +
scale_color_discrete('% of rank\nabundances\npermuted') +
labs(x='% taxa shared among pre-fractionation communities',
y='13C atom % excess\n(truth - estimate)') +
theme_bw()
p_qSIP
In [96]:
# difference between true and estimated
## q-SIP incorporators
tmp1 = df.j %>%
filter(atom_CI_low > 0) %>% # just incorporators identified by q-SIP
mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100) %>%
group_by(shared_perc, perm_perc) %>%
summarize(mean_delta_excess = mean(delta_excess),
sd_delta_excess = sd(delta_excess)) %>%
ungroup() %>%
mutate(shared_perc = shared_perc %>% reorder(shared_perc %>% as.numeric),
perm_perc = perm_perc %>% reorder(perm_perc %>% as.numeric),
incorp_called = 'No filter')
## MW-HR-SIP incorporators
tmp2 = df.j %>%
filter(atom_CI_low > 0) %>% # just incorporators identified by q-SIP
filter(HWHRSIP_incorp == TRUE) %>% # just MW-HR-SIP incorporators
mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100) %>%
group_by(shared_perc, perm_perc) %>%
summarize(mean_delta_excess = mean(delta_excess),
sd_delta_excess = sd(delta_excess)) %>%
ungroup() %>%
mutate(shared_perc = shared_perc %>% reorder(shared_perc %>% as.numeric),
perm_perc = perm_perc %>% reorder(perm_perc %>% as.numeric),
incorp_called = 'MW-HR-SIP filter')
# combining tables
df.j.dis.qSIP = rbind(tmp1, tmp2) %>%
mutate(incorp_called = factor(incorp_called, levels=c('No filter', 'MW-HR-SIP filter')))
# plotting
options(repr.plot.width=6, repr.plot.height=4)
p_qSIP = ggplot(df.j.dis.qSIP, aes(shared_perc, mean_delta_excess,
color=perm_perc, group=perm_perc,
ymin=mean_delta_excess-sd_delta_excess,
ymax=mean_delta_excess+sd_delta_excess)) +
geom_line() +
geom_linerange(alpha=0.4, size=1) +
geom_point() +
facet_grid(incorp_called ~ .) +
scale_color_discrete('% of rank\nabundances\npermuted') +
labs(x='% taxa shared among pre-fractionation communities',
y='13C atom % excess\n(truth - estimate)') +
theme_bw()
p_qSIP
In [97]:
dBD_files = list.files(path=workDir, pattern='*_dBD.txt', full.names=TRUE, recursive=TRUE)
dBD_files %>% length %>% print
In [98]:
df_dBD = list()
for(F in dBD_files){
tmp = read.delim(F, sep='\t')
FF = strsplit(F, '/') %>% unlist
FFl = length(FF)
tmp$shared_perc = FF[FFl-3]
tmp$perm_perc = FF[FFl-2]
tmp$rep = FF[FFl-1]
tmp$file = FF[FFl]
df_dBD[[F]] = tmp
}
df_dBD = do.call(rbind, df_dBD)
rownames(df_dBD) = 1:nrow(df_dBD)
df_dBD %>% head(n=3) %>% print
In [99]:
df.j = inner_join(df_dBD, df_shift, c('taxon' = 'taxon',
'shared_perc'='shared_perc',
'perm_perc'='perm_perc',
'rep'='rep')) %>%
filter(!is.na(delta_BD)) %>%
mutate(true_incorporator = ifelse(median_true_BD_shift > 0.002, TRUE, FALSE),
true_atom_fraction_excess = median_true_BD_shift / 0.036,
atom_fraction_excess = delta_BD / 0.036)
df.j %>% head(n=3)
In [100]:
df.j = left_join(df.j, df_incorp, c('taxon' = 'taxon',
'shared_perc'='shared_perc',
'perm_perc'='perm_perc',
'rep'='rep')) %>%
mutate(HWHRSIP_incorp = ifelse(is.na(HWHRSIP_incorp), FALSE, TRUE))
df.j %>% nrow %>% print
df.j %>% head(n=3)
In [101]:
# difference between true and estimated
tmp1 = df.j %>%
mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100) %>%
group_by(shared_perc, perm_perc) %>%
summarize(mean_delta_excess = mean(delta_excess),
sd_delta_excess = sd(delta_excess)) %>%
ungroup() %>%
mutate(shared_perc = shared_perc %>% reorder(shared_perc %>% as.numeric),
perm_perc = perm_perc %>% reorder(perm_perc %>% as.numeric),
incorp_called = 'No filter')
tmp2 = df.j %>%
filter(HWHRSIP_incorp == TRUE) %>% # just MW-HR-SIP incorporators
mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100) %>%
group_by(shared_perc, perm_perc) %>%
summarize(mean_delta_excess = mean(delta_excess),
sd_delta_excess = sd(delta_excess)) %>%
ungroup() %>%
mutate(shared_perc = shared_perc %>% reorder(shared_perc %>% as.numeric),
perm_perc = perm_perc %>% reorder(perm_perc %>% as.numeric),
incorp_called = 'MW-HR-SIP filter')
# combining tables
df.j.dis.dBD = rbind(tmp1, tmp2) %>%
mutate(incorp_called = factor(incorp_called, levels=c('No filter', 'MW-HR-SIP filter')))
# plotting
options(repr.plot.width=8, repr.plot.height=4)
p_dBD = ggplot(df.j.dis.dBD, aes(shared_perc, mean_delta_excess,
color=perm_perc, group=perm_perc,
ymin=mean_delta_excess-sd_delta_excess,
ymax=mean_delta_excess+sd_delta_excess)) +
geom_line() +
geom_linerange(alpha=0.4, size=1) +
geom_point() +
facet_grid(incorp_called ~ .) +
scale_color_discrete('% of rank\nabundances\npermuted') +
labs(x='% taxa shared among pre-fractionation communities',
y='13C atom % excess\n(truth - estimate)') +
theme_bw()
p_dBD
In [102]:
df.jj = rbind(df.j.dis.qSIP %>% mutate(method='qSIP'),
df.j.dis.dBD %>% mutate(method='Delta BD')) %>%
mutate(method = gsub('qSIP', 'q-SIP', method))
p.comb = ggplot(df.jj, aes(shared_perc, mean_delta_excess,
color=perm_perc, group=perm_perc,
ymin=mean_delta_excess-sd_delta_excess,
ymax=mean_delta_excess+sd_delta_excess)) +
geom_line() +
geom_linerange(alpha=0.5, size=1) +
geom_point() +
scale_color_discrete('% of rank\nabundances\npermuted') +
labs(x='% taxa shared among pre-fractionation communities',
y='atom % excess 13C\n(truth - estimate)') +
facet_grid(incorp_called ~ method) +
theme_bw()
p.comb
In [103]:
outF = file.path(workDir, 'microBetaDiv_BDshift.pdf')
ggsave(outF, p.comb, width=8, height=4)
cat('File written:', outF, '\n')
In [104]:
outF = file.path(workDir, 'microBetaDiv_BDshift.jpeg')
ggsave(outF, p.comb, width=8, height=4)
cat('File written:', outF, '\n')
In [ ]: