In [138]:
workDir = '/home/nick/notebook/SIPSim/dev/bac_genome3/validation/'
R_dir = '/home/nick/notebook/SIPSim/lib/R/'
figDir = '/home/nick/notebook/SIPSim/figures/'
nprocs = 3
In [139]:
import os
import numpy as np
import dill
import pandas as pd
%load_ext rpy2.ipython
In [140]:
%%R
library(ggplot2)
library(plyr)
library(dplyr)
library(tidyr)
library(gridExtra)
In [141]:
if not os.path.isdir(workDir):
os.makedirs(workDir)
In [142]:
# max 13C shift
max_13C_shift_in_BD = 0.036
# min BD (that we care about)
min_GC = 13.5
min_BD = min_GC/100.0 * 0.098 + 1.66
# max BD (that we care about)
max_GC = 80
max_BD = max_GC / 100.0 * 0.098 + 1.66 # 80.0% G+C
max_BD = max_BD + max_13C_shift_in_BD
## BD range of values
BD_vals = np.arange(min_BD, max_BD, 0.001)
In [ ]:
F = os.path.join(workDir, 'ampFrags_real_kde_dif.pkl')
with open(F, 'rb') as inFH:
kde = dill.load(inFH)
kde
In [ ]:
# probability at each location in gradient
pdf = {}
for k,v in kde.items():
pdf[k] = v.evaluate(BD_vals)
pdf.keys()
In [ ]:
df = pd.DataFrame(pdf)
df['BD'] = BD_vals
df.head(n=3)
In [ ]:
%%R -i df -w 800 -h 350
df.g = apply(df, 2, as.numeric) %>% as.data.frame %>%
gather(taxon_name, P, 1:3) %>%
mutate(BD = as.numeric(BD),
P = as.numeric(P),
taxon_name = as.character(taxon_name)) %>%
filter(P > 1e-9)
p1 = ggplot(df.g, aes(BD, P, color=taxon_name)) +
geom_point() +
geom_line() +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
p2 = p1 + scale_y_log10()
grid.arrange(p1, p2, ncol=2)
In [45]:
F = os.path.join(workDir, 'ampFrags_sm_kde_dif.pkl')
with open(F, 'rb') as inFH:
kde = dill.load(inFH)
kde
Out[45]:
In [46]:
# probability at each location in gradient
pdf = {}
for k,v in kde.items():
pdf[k] = v.evaluate(BD_vals)
pdf.keys()
Out[46]:
In [47]:
df = pd.DataFrame(pdf)
df['BD'] = BD_vals
df.head(n=3)
Out[47]:
In [48]:
%%R -i df -w 800 -h 350
df.g = apply(df, 2, as.numeric) %>% as.data.frame %>%
gather(taxon_name, P, 1:3) %>%
mutate(BD = as.numeric(BD),
P = as.numeric(P),
taxon_name = as.character(taxon_name)) %>%
filter(P > 1e-9)
p1 = ggplot(df.g, aes(BD, P, color=taxon_name)) +
geom_point() +
geom_line() +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
p2 = p1 + scale_y_log10()
grid.arrange(p1, p2, ncol=2)
In [49]:
BD_vals = np.arange(min_BD, max_BD, 0.001)
In [50]:
F = os.path.join(workDir, 'ampFrags_real_kde_dif_DBL.pkl')
with open(F, 'rb') as inFH:
kde = dill.load(inFH)
kde
Out[50]:
In [51]:
# probability at each location in gradient
pdf = {}
for k,v in kde.items():
pdf[k] = v.evaluate(BD_vals)
pdf.keys()
Out[51]:
In [52]:
df = pd.DataFrame(pdf)
df['BD'] = BD_vals
df.head(n=3)
Out[52]:
In [53]:
%%R -i df -w 800 -h 350
df.g = apply(df, 2, as.numeric) %>% as.data.frame %>%
gather(taxon_name, P, 1:3) %>%
mutate(BD = as.numeric(BD),
P = as.numeric(P),
taxon_name = as.character(taxon_name)) %>%
filter(P > 1e-9)
p1 = ggplot(df.g, aes(BD, P, color=taxon_name)) +
geom_point() +
geom_line() +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
p2 = p1 + scale_y_log10()
grid.arrange(p1, p2, ncol=2)
In [54]:
BD_vals = np.arange(min_BD, max_BD, 0.001)
In [55]:
F = os.path.join(workDir, 'ampFrags_sm_kde_dif_DBL.pkl')
with open(F, 'rb') as inFH:
kde = dill.load(inFH)
kde
Out[55]:
In [56]:
# probability at each location in gradient
pdf = {}
for k,v in kde.items():
pdf[k] = v.evaluate(BD_vals)
pdf.keys()
Out[56]:
In [57]:
df = pd.DataFrame(pdf)
df['BD'] = BD_vals
df.head(n=3)
Out[57]:
In [58]:
%%R -i df -w 800 -h 350
df.g = apply(df, 2, as.numeric) %>% as.data.frame %>%
gather(taxon_name, P, 1:3) %>%
mutate(BD = as.numeric(BD),
P = as.numeric(P),
taxon_name = as.character(taxon_name)) %>%
filter(P > 1e-9)
p1 = ggplot(df.g, aes(BD, P, color=taxon_name)) +
geom_point() +
geom_line() +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
p2 = p1 + scale_y_log10()
grid.arrange(p1, p2, ncol=2)
In [59]:
BD_vals = np.arange(min_BD, max_BD, 0.001)
In [60]:
F = os.path.join(workDir, 'ampFrags_real_kde_dif_DBL_fa1e-4.pkl')
with open(F, 'rb') as inFH:
kde = dill.load(inFH)
kde
Out[60]:
In [61]:
# probability at each location in gradient
pdf = {}
for k,v in kde.items():
pdf[k] = v.evaluate(BD_vals)
pdf.keys()
Out[61]:
In [62]:
df = pd.DataFrame(pdf)
df['BD'] = BD_vals
df.head(n=3)
Out[62]:
In [63]:
%%R -i df -w 800 -h 350
df.g = apply(df, 2, as.numeric) %>% as.data.frame %>%
gather(taxon_name, P, 1:3) %>%
mutate(BD = as.numeric(BD),
P = as.numeric(P),
taxon_name = as.character(taxon_name)) %>%
filter(P > 1e-9)
p1 = ggplot(df.g, aes(BD, P, color=taxon_name)) +
geom_point() +
geom_line() +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
p2 = p1 + scale_y_log10()
grid.arrange(p1, p2, ncol=2)
In [131]:
BD_vals = np.arange(min_BD, max_BD, 0.001)
F = os.path.join(workDir, 'ampFrags_real_kde_dif_DBL-comm.pkl')
with open(F, 'rb') as inFH:
kde = dill.load(inFH)
kde
Out[131]:
In [132]:
# probability at each location in gradient
pdf = {}
for libID,v in kde.items():
for taxon,k in v.items():
pdf[taxon] = k.evaluate(BD_vals)
pdf.keys()
Out[132]:
In [133]:
df = pd.DataFrame(pdf)
df['BD'] = BD_vals
df.head(n=3)
Out[133]:
In [134]:
%%R -i df -w 800 -h 350
df.g = apply(df, 2, as.numeric) %>% as.data.frame %>%
gather(taxon_name, P, 1:3) %>%
mutate(BD = as.numeric(BD),
P = as.numeric(P),
taxon_name = as.character(taxon_name)) %>%
filter(P > 1e-9)
p1 = ggplot(df.g, aes(BD, P, color=taxon_name)) +
geom_point() +
geom_line() +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
p2 = p1 + scale_y_log10()
grid.arrange(p1, p2, ncol=2)
In [135]:
%%R
df.g %>%
group_by(taxon_name) %>%
summarize(max_P = max(P),
min_P = min(P)) %>% print
In [136]:
%%R -i workDir
F = file.path(workDir, 'comm.txt')
df.comm = read.delim(F, sep='\t') %>%
mutate(rel_abund = rel_abund_perc / 100)
df.comm %>% print
df.g.s = df.g %>%
filter(BD > 1.75) %>%
group_by(BD) %>%
mutate(P_rel_abund = P / sum(P)) %>%
group_by(taxon_name) %>%
summarize(mean_P = mean(P))
df.g.s = inner_join(df.g.s, df.comm, c('taxon_name' = 'taxon_name'))
df.g.s %>% print
ggplot(df.g.s, aes(rel_abund, mean_P)) +
geom_point() +
geom_line()
In [ ]:
In [ ]:
In [ ]: