In [14]:
workDir = '/home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp/'
frag_info_file = '/home/nick/notebook/SIPSim/dev/bac_genome1147/validation/ampFrags_kde_info.txt'
In [2]:
import os
import glob
import itertools
import nestly
In [3]:
%load_ext rpy2.ipython
%load_ext pushnote
In [4]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)
In [5]:
## min G+C cutoff
min_GC = 13.5
## max G+C cutoff
max_GC = 80
## max G+C shift
max_13C_shift_in_BD = 0.036
min_BD = min_GC/100.0 * 0.098 + 1.66
max_BD = max_GC/100.0 * 0.098 + 1.66
max_BD = max_BD + max_13C_shift_in_BD
print 'Min BD: {}'.format(min_BD)
print 'Max BD: {}'.format(max_BD)
In [6]:
F = os.path.join(workDir, '*', '*', '*', 'comm.txt')
files = glob.glob(F)
print len(files)
In [7]:
%%R -i files
df_comm = list()
for (f in files){
df.tmp = read.delim(f, sep='\t')
ff = strsplit(f, '/') %>% unlist
df.tmp$percIncorp = ff[9]
df.tmp$percTaxa = ff[10]
df.tmp$sim_rep = ff[11]
f_name = ff[12]
df_comm[[f]] = df.tmp
}
df_comm = do.call(rbind, df_comm)
rownames(df_comm) = 1:nrow(df_comm)
df_comm %>% head(n=3)
In [96]:
F = os.path.join(workDir, '*', '*', '*', '*_data.txt')
files = glob.glob(F)
print len(files)
In [97]:
%%R -i files
cols = c('library', 'taxon', 'min', 'q25', 'mean', 'median', 'q75', 'max', 'incorp.known', 'incorp.pred')
df_data = list()
for (f in files){
df.tmp = read.delim(f, sep='\t')
df.tmp = df.tmp[,cols]
ff = strsplit(f, '/') %>% unlist
df.tmp$percIncorp = ff[9]
df.tmp$percTaxa = ff[10]
df.tmp$sim_rep = ff[11]
df.tmp$method = gsub('-cMtx_data.txt', '', ff[12])
f_name = ff[12]
df_data[[f]] = df.tmp
}
df_data = do.call(rbind, df_data)
rownames(df_data) = 1:nrow(df_data)
df_data %>% head(n=3)
In [98]:
%%R -i frag_info_file
df_info = read.delim(frag_info_file, sep='\t')
df_info %>% head(n=3)
In [99]:
%%R
clsfy = function(guess,known){
if(is.na(guess) | is.na(known)){
return(NA)
}
if(guess == TRUE){
if(guess == known){
return('True positive')
} else {
return('False positive')
}
} else
if(guess == FALSE){
if(guess == known){
return('True negative')
} else {
return('False negative')
}
} else {
stop('Error: true or false needed')
}
}
In [100]:
%%R
# comm & classificatino
join.on = c(
'library' = 'library',
'taxon_name' = 'taxon',
'percIncorp' = 'percIncorp',
'percTaxa' = 'percTaxa',
'sim_rep' = 'sim_rep')
df.j = inner_join(df_comm, df_data, join.on) %>%
filter(library %in% c(2,4,6)) %>%
mutate(cls = mapply(clsfy, incorp.pred, incorp.known))
# frag info
df.j = inner_join(df.j, df_info, c('taxon_name'='taxon_ID'))
df.j %>% head(n=3)
In [101]:
%%R
# renaming method
rename = data.frame(method = c('DESeq2', 'DESeq2_multi', 'heavy', 'qSIP'),
method_new = c('HR-SIP', 'HR-SIP_multi', 'Heavy-SIP', 'qSIP'))
df.j = inner_join(df.j, rename, c('method'='method')) %>%
select(-method) %>%
rename('method' = method_new)
# reorder
as.Num = function(x) x %>% as.character %>% as.numeric
df.j$percTaxa = reorder(df.j$percTaxa, df.j$percTaxa %>% as.Num)
df.j$percIncorp = reorder(df.j$percIncorp, df.j$percIncorp %>% as.Num)
df.j %>% head(n=3)
In [102]:
%%R -w 800 -h 600
df.j.f = df.j %>%
filter(KDE_ID == 1,
cls != 'True negative')
ggplot(df.j.f, aes(cls, rel_abund_perc, fill=method)) +
geom_boxplot() +
facet_grid(percTaxa ~ percIncorp) +
scale_y_log10() +
labs(y='Pre-fractionation\nrelative abundance (%)') +
theme_bw() +
theme(
text = element_text(size=16),
axis.text.x = element_text(angle=45, hjust=1),
axis.title.x = element_blank()
)
In [103]:
%%R -w 800 -h 600
ggplot(df.j.f, aes(cls, median.y, fill=method)) +
geom_boxplot() +
facet_grid(percTaxa ~ percIncorp) +
labs(y='Median fragment BD (g ml^-1)') +
theme_bw() +
theme(
text = element_text(size=16),
axis.text.x = element_text(angle=45, hjust=1),
axis.title.x = element_blank()
)
In [104]:
%%R -w 800 -h 600
BD2GC = function(p){
(p - 1.66) / 0.098 * 100
}
df.j.f = df.j.f %>%
mutate(median_GC = sapply(median.y, BD2GC))
ggplot(df.j.f, aes(cls, median_GC, fill=method)) +
geom_boxplot() +
facet_grid(percTaxa ~ percIncorp) +
labs(y='Median DNA fragment G+C') +
theme_bw() +
theme(
text = element_text(size=16),
axis.text.x = element_text(angle=45, hjust=1),
axis.title.x = element_blank()
)
In [105]:
%pushnote atomIncorp_taxaIncorp_acc-factors complete
In [ ]: