In [65]:
workDir = '/home/nick/notebook/SIPSim/dev/bac_genome3/validation/'
R_dir = '/home/nick/notebook/SIPSim/lib/R/'
figDir = '/home/nick/notebook/SIPSim/figures/'
nprocs = 3
In [66]:
import os
import numpy as np
import dill
import pandas as pd
%load_ext rpy2.ipython
In [67]:
%%R
library(ggplot2)
library(plyr)
library(dplyr)
library(tidyr)
library(gridExtra)
In [68]:
if not os.path.isdir(workDir):
os.makedirs(workDir)
In [69]:
# max 13C shift
max_13C_shift_in_BD = 0.036
# min BD (that we care about)
min_GC = 13.5
min_BD = min_GC/100.0 * 0.098 + 1.66
# max BD (that we care about)
max_GC = 80
max_BD = max_GC / 100.0 * 0.098 + 1.66 # 80.0% G+C
max_BD = max_BD + max_13C_shift_in_BD
## BD range of values
BD_vals = np.arange(min_BD, max_BD, 0.001)
In [70]:
F = os.path.join(workDir, 'ampFrags_real_kde_dif.pkl')
with open(F, 'rb') as inFH:
kde = dill.load(inFH)
kde
Out[70]:
In [71]:
# probability at each location in gradient
pdf = {}
for k,v in kde.items():
pdf[k] = v.evaluate(BD_vals)
pdf.keys()
Out[71]:
In [72]:
df = pd.DataFrame(pdf)
df['BD'] = BD_vals
df.head(n=3)
Out[72]:
In [73]:
%%R -i df -w 800 -h 350
df.g = apply(df, 2, as.numeric) %>% as.data.frame %>%
gather(taxon_name, P, 1:3) %>%
mutate(BD = as.numeric(BD),
P = as.numeric(P),
taxon_name = as.character(taxon_name)) %>%
filter(P > 1e-15)
x.lab = expression(paste('Buoyant density (g ml' ^ '-1', ')'))
p1.skn = ggplot(df.g, aes(BD, P, color=taxon_name)) +
geom_point() +
geom_line() +
labs(x=x.lab, y='Probability density') +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
p2.skn = p1.skn + scale_y_log10()
grid.arrange(p1.skn, p2.skn, ncol=2)
In [74]:
F = os.path.join(workDir, 'ampFrags_sm_kde_dif.pkl')
with open(F, 'rb') as inFH:
kde = dill.load(inFH)
kde
Out[74]:
In [75]:
# probability at each location in gradient
pdf = {}
for k,v in kde.items():
pdf[k] = v.evaluate(BD_vals)
pdf.keys()
Out[75]:
In [76]:
df = pd.DataFrame(pdf)
df['BD'] = BD_vals
df.head(n=3)
Out[76]:
In [77]:
%%R -i df -w 800 -h 350
df.g = apply(df, 2, as.numeric) %>% as.data.frame %>%
gather(taxon_name, P, 1:3) %>%
mutate(BD = as.numeric(BD),
P = as.numeric(P),
taxon_name = as.character(taxon_name)) %>%
filter(P > 1e-9)
p1 = ggplot(df.g, aes(BD, P, color=taxon_name)) +
geom_point() +
geom_line() +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
p2 = p1 + scale_y_log10()
grid.arrange(p1, p2, ncol=2)
In [78]:
BD_vals = np.arange(min_BD, max_BD, 0.001)
In [79]:
F = os.path.join(workDir, 'ampFrags_real_kde_dif_DBL.pkl')
with open(F, 'rb') as inFH:
kde = dill.load(inFH)
kde
Out[79]:
In [80]:
# probability at each location in gradient
pdf = {}
for k,v in kde.items():
for kk,vv in v.items():
pdf[kk] = vv.evaluate(BD_vals)
pdf.keys()
Out[80]:
In [81]:
df = pd.DataFrame(pdf)
df['BD'] = BD_vals
df.head(n=3)
Out[81]:
In [82]:
%%R -i df -w 800 -h 350
df.g = apply(df, 2, as.numeric) %>% as.data.frame %>%
gather(taxon_name, P, 1:3) %>%
mutate(BD = as.numeric(BD),
P = as.numeric(P),
taxon_name = as.character(taxon_name)) %>%
filter(P > 1e-15)
x.lab = expression(paste('Buoyant density (g ml' ^ '-1', ')'))
p1.skn.dbl = ggplot(df.g, aes(BD, P, color=taxon_name)) +
geom_point() +
geom_line() +
labs(x=x.lab, y='Probability density') +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
p2.skn.dbl = p1.skn.dbl + scale_y_log10()
grid.arrange(p1.skn.dbl, p2.skn.dbl, ncol=2)
In [84]:
%%R -w 800 -h 300
# plot formatting
title.size=16
p2.skn.f = p2.skn +
ggtitle('Gaussian BD') +
theme(
plot.title = element_text(size=title.size)
)
p2.skn.dbl.f = p2.skn.dbl +
ggtitle('Gaussian BD + DBL') +
theme(
plot.title = element_text(size=title.size)
)
# combined plot
#p.comb = cowplot::plot_grid(p2.skn.f, p2.skn.dbl.f, labels=c('A)', 'B)'), align='h')
p.comb = cowplot::ggdraw() +
geom_rect(aes(xmin=0, ymin=0, xmax=1, ymax=1), fill='white') +
cowplot::draw_plot(p2.skn.f, 0.01, 0.01, 0.49, 0.99) +
cowplot::draw_plot(p2.skn.dbl.f, 0.5, 0.01, 0.49, 0.99) +
cowplot::draw_plot_label(c('A)', 'B)'), c(0, 0.5), c(0.99, 0.99))
p.comb
In [85]:
%%R -i workDir
# writting plot
outFile = file.path(workDir, 'DBL_example_log10.pdf')
ggsave(outFile, p.comb, width=10, height=3.75)
cat('File written:', outFile, '\n')
In [102]:
%%R -w 800 -h 300
p1.skn.e = p1.skn +
scale_x_continuous(limits=c(1.675, 1.775))
p2.skn.e = p2.skn +
scale_x_continuous(limits=c(1.675, 1.775)) +
scale_y_log10(limits=c(1e-12, 150))
p1.skn.dbl.e = p1.skn.dbl +
scale_x_continuous(limits=c(1.675, 1.775))
p2.skn.dbl.e = p2.skn.dbl +
scale_x_continuous(limits=c(1.675, 1.775)) +
scale_y_log10(limits=c(1e-12, 150))
p.comb = cowplot::ggdraw() +
geom_rect(aes(xmin=0, ymin=0, xmax=1, ymax=1), fill='white') +
cowplot::draw_plot(p2.skn.e, 0.01, 0.01, 0.49, 0.99) +
cowplot::draw_plot(p2.skn.dbl.e, 0.5, 0.01, 0.49, 0.99) +
cowplot::draw_plot_label(c('A)', 'B)'), c(0, 0.5), c(0.99, 0.99))
p.comb
In [103]:
%%R -i workDir
# writting plot
outFile = file.path(workDir, 'DBL_example_log10.pdf')
ggsave(outFile, p.comb, width=10, height=3.75)
cat('File written:', outFile, '\n')
In [54]:
BD_vals = np.arange(min_BD, max_BD, 0.001)
In [55]:
F = os.path.join(workDir, 'ampFrags_sm_kde_dif_DBL.pkl')
with open(F, 'rb') as inFH:
kde = dill.load(inFH)
kde
Out[55]:
In [56]:
# probability at each location in gradient
pdf = {}
for k,v in kde.items():
pdf[k] = v.evaluate(BD_vals)
pdf.keys()
Out[56]:
In [57]:
df = pd.DataFrame(pdf)
df['BD'] = BD_vals
df.head(n=3)
Out[57]:
In [58]:
%%R -i df -w 800 -h 350
df.g = apply(df, 2, as.numeric) %>% as.data.frame %>%
gather(taxon_name, P, 1:3) %>%
mutate(BD = as.numeric(BD),
P = as.numeric(P),
taxon_name = as.character(taxon_name)) %>%
filter(P > 1e-9)
p1 = ggplot(df.g, aes(BD, P, color=taxon_name)) +
geom_point() +
geom_line() +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
p2 = p1 + scale_y_log10()
grid.arrange(p1, p2, ncol=2)
In [59]:
BD_vals = np.arange(min_BD, max_BD, 0.001)
In [60]:
F = os.path.join(workDir, 'ampFrags_real_kde_dif_DBL_fa1e-4.pkl')
with open(F, 'rb') as inFH:
kde = dill.load(inFH)
kde
Out[60]:
In [61]:
# probability at each location in gradient
pdf = {}
for k,v in kde.items():
pdf[k] = v.evaluate(BD_vals)
pdf.keys()
Out[61]:
In [62]:
df = pd.DataFrame(pdf)
df['BD'] = BD_vals
df.head(n=3)
Out[62]:
In [63]:
%%R -i df -w 800 -h 350
df.g = apply(df, 2, as.numeric) %>% as.data.frame %>%
gather(taxon_name, P, 1:3) %>%
mutate(BD = as.numeric(BD),
P = as.numeric(P),
taxon_name = as.character(taxon_name)) %>%
filter(P > 1e-9)
p1 = ggplot(df.g, aes(BD, P, color=taxon_name)) +
geom_point() +
geom_line() +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
p2 = p1 + scale_y_log10()
grid.arrange(p1, p2, ncol=2)
In [131]:
BD_vals = np.arange(min_BD, max_BD, 0.001)
F = os.path.join(workDir, 'ampFrags_real_kde_dif_DBL-comm.pkl')
with open(F, 'rb') as inFH:
kde = dill.load(inFH)
kde
Out[131]:
In [132]:
# probability at each location in gradient
pdf = {}
for libID,v in kde.items():
for taxon,k in v.items():
pdf[taxon] = k.evaluate(BD_vals)
pdf.keys()
Out[132]:
In [133]:
df = pd.DataFrame(pdf)
df['BD'] = BD_vals
df.head(n=3)
Out[133]:
In [134]:
%%R -i df -w 800 -h 350
df.g = apply(df, 2, as.numeric) %>% as.data.frame %>%
gather(taxon_name, P, 1:3) %>%
mutate(BD = as.numeric(BD),
P = as.numeric(P),
taxon_name = as.character(taxon_name)) %>%
filter(P > 1e-9)
p1 = ggplot(df.g, aes(BD, P, color=taxon_name)) +
geom_point() +
geom_line() +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
p2 = p1 + scale_y_log10()
grid.arrange(p1, p2, ncol=2)
In [135]:
%%R
df.g %>%
group_by(taxon_name) %>%
summarize(max_P = max(P),
min_P = min(P)) %>% print
In [136]:
%%R -i workDir
F = file.path(workDir, 'comm.txt')
df.comm = read.delim(F, sep='\t') %>%
mutate(rel_abund = rel_abund_perc / 100)
df.comm %>% print
df.g.s = df.g %>%
filter(BD > 1.75) %>%
group_by(BD) %>%
mutate(P_rel_abund = P / sum(P)) %>%
group_by(taxon_name) %>%
summarize(mean_P = mean(P))
df.g.s = inner_join(df.g.s, df.comm, c('taxon_name' = 'taxon_name'))
df.g.s %>% print
ggplot(df.g.s, aes(rel_abund, mean_P)) +
geom_point() +
geom_line()
In [ ]:
In [ ]:
In [ ]: