In [1]:
workDir = '/home/nick/notebook/SIPSim/dev/bac_genome1210/'
inFileDir = os.path.join(workDir, 'atomIncorp_taxaIncorp')
buildDir = os.path.join(workDir, 'atomIncorp_taxaIncorp_NMDS')
R_dir = '/home/nick/notebook/SIPSim/lib/R/'
In [2]:
import glob
from os.path import abspath
import nestly
from IPython.display import Image, display
In [3]:
%load_ext rpy2.ipython
In [4]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)
library(phyloseq)
library(vegan)
In [112]:
if not os.path.isdir(buildDir):
os.makedirs(buildDir)
In [113]:
%%R -i inFileDir
files = list.files(inFileDir, pattern='OTU_n2_abs1e9_sub-norm_filt.physeq', recursive=T)
dirs = gsub('/[0-9]+/[^/]+$','', files) %>% unique %>% as.data.frame
colnames(dirs) = c('dirs')
percTaxa.to.keep = c(1,10,25,50)
tbl = dirs %>%
separate(dirs, c('percIncorp','percTaxa'), sep='/', remove=F) %>%
filter(percTaxa %in% percTaxa.to.keep)
tbl %>% nrow %>% print
tbl %>% head
In [165]:
%%R -i inFileDir
get.files = function(dir, base.path, pattern){
dir = as.character(dir)
p = paste(c(base.path, dir), collapse='/')
files = list.files(p, pattern=pattern, recursive=T)
p = gsub('$', '/', p)
files = gsub('^', p, files)
return(files)
}
get.otu.tbls = function(files){
tbl.l = list()
for(f in files){
physeq = readRDS(f)
id = gsub('/[^/]+$', '', f)
id = gsub('.+atomIncorp_taxaIncorp/', '', id)
id = paste(c(id,'__'), collapse='')
otu.tbl = physeq %>% otu_table %>% as.data.frame
colnames(otu.tbl) = gsub('^', id, colnames(otu.tbl))
otu.tbl$taxon = rownames(otu.tbl)
tbl.l[[f]] = otu.tbl
}
return(tbl.l)
}
join.otu.tbls = function(tbl.l){
n = names(tbl.l)
tbl = tbl.l[[n[1]]]
for (i in seq(2:length(n))){
n.i = n[i]
tbl = left_join(tbl, tbl.l[[n.i]], c('taxon'='taxon'))
}
# edit table
tbl[is.na(tbl)] = 0
rownames(tbl) = tbl$taxon
tbl$taxon = NULL
# return
return(tbl)
}
make.meta = function(tbl, BD.range = seq(1.6, 1.9, 0.004)){
meta = tbl %>% colnames %>% as.data.frame
colnames(meta) = c('X')
meta = filter(meta, X != 'taxon')
# midpoint function
mid = function(x, y){ (x + y)/2 }
# making fixed BD-range
#BD.range = seq(1.6, 1.9, 0.004)
#print(meta); break
# edit metadata
meta = meta %>%
#separate(X, c('percIncorp','percTaxa','rep','library','BD_range'), sep='__', remove=F) %>%
separate(X, c('tmp1', 'library', 'tmp2'), sep='__', remove=F) %>%
separate(tmp1, c('percIncorp','percTaxa', 'rep'), sep='/') %>%
mutate(tmp2 = gsub('.[xy]$', '', tmp2)) %>%
separate(tmp2, c('BD.min', 'BD.max'), sep='-') %>%
mutate(BD.min = as.numeric(BD.min),
BD.max = as.numeric(BD.max),
BD.mid = mapply(mid, BD.min, BD.max),
BD.bin = cut(BD.mid, breaks=BD.range)) %>%
unite(groups, library, BD.bin, sep='__')
return(meta)
}
edit.ellipse = function(elps){
# midpoint function
mid = function(x, y){ (x + y)/2 }
elps.tbl = elps %>% summary %>% t %>% as.data.frame
elps.tbl$meta = rownames(elps.tbl)
elps.tbl = elps.tbl %>%
separate(meta, c('library','BD.range'), sep='__') %>%
mutate(BD.range = gsub('^.', '', BD.range),
BD.range = gsub('.$', '', BD.range)) %>%
separate(BD.range, c('BD.min','BD.max'), sep=',', remove=F) %>%
mutate(
BD.min = as.numeric(BD.min),
BD.max = as.numeric(BD.max),
BD.mid = mapply(mid, BD.min, BD.max))
}
In [166]:
%%R
# wrapper function
get.ellipses = function(x){
dir = x['dirs']
# test run of functions
files = get.files(dir, base.path=inFileDir, pattern='OTU_n2_abs1e9_sub-norm_filt.physeq')
otu.tbls = get.otu.tbls(files)
otu.tbl = join.otu.tbls(otu.tbls)
# making metadata
meta = make.meta(otu.tbl, BD.range=seq(1.6, 2, 0.002))
ord = metaMDS(otu.tbl %>% t)
# getting ellipse
plot(ord, type = "p", display='sites')
elps = ordiellipse(ord, groups=meta$groups, kind="se", conf=0.95, lwd=2, col="blue")
# edit ellipse file
elps.tbl = edit.ellipse(elps)
elps.tbl$percIncorp = x['percIncorp'] %>% as.vector
elps.tbl$percTaxa = x['percTaxa'] %>% as.vector
return(elps.tbl)
}
In [167]:
%%R
elps.tbls = apply(tbl, 1, get.ellipses)
In [168]:
%%R
tbl.c = do.call(rbind, elps.tbls)
tbl.c$percIncorp = factor(tbl.c$percIncorp, levels=sort(unique(as.numeric(tbl.c$percIncorp))))
tbl.c$percTaxa = factor(tbl.c$percTaxa, levels=sort(unique(as.numeric(tbl.c$percTaxa))))
tbl.c %>% head
In [169]:
%%R -w 1000 -h 800
ggplot(tbl.c, aes(NMDS1, NMDS2, color=library, size=BD.mid)) +
geom_point(alpha=0.5) +
geom_point(color='black', shape='O', alpha=0.6) +
scale_size_continuous(range=c(2,6)) +
facet_grid(percTaxa ~ percIncorp, scales='free') +
theme_bw() +
theme(
text = element_text(size=16)
)
In [170]:
%%R -w 1000 -h 300
tbl.c.f = tbl.c %>%
filter(percIncorp == 100)
ggplot(tbl.c.f, aes(NMDS1, NMDS2, color=library, size=BD.mid)) +
geom_point(alpha=0.5) +
geom_point(color='black', shape='O', alpha=0.6) +
scale_size_continuous(range=c(2,6)) +
facet_grid(. ~ percTaxa, scales='free') +
theme_bw() +
theme(
text = element_text(size=16)
)
In [164]:
%%R -w 1000 -h 250
tbl.c.f = tbl.c %>%
filter(percIncorp == 100)
percTaxa.u = tbl.c.f$percTaxa %>% unique %>% sort
make_plot = function(tbl, percTaxa.cut, title=''){
tbl.f = filter(tbl, percTaxa == percTaxa.cut)
p = ggplot(tbl.f, aes(NMDS1, NMDS2, color=library, size=BD.mid)) +
geom_point(alpha=0.5) +
geom_point(color='black', shape='O', alpha=0.6) +
scale_size_continuous(range=c(2,6)) +
labs(title=title) +
theme_bw() +
theme(
text = element_text(size=16),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
legend.position = 'none'
)
return(p)
}
p1 = make_plot(tbl.c.f, percTaxa.u[1], title=percTaxa.u[1])
p2 = make_plot(tbl.c.f, percTaxa.u[2], title=percTaxa.u[2])
p3 = make_plot(tbl.c.f, percTaxa.u[3], title=percTaxa.u[3])
p4 = make_plot(tbl.c.f, percTaxa.u[4], title=percTaxa.u[4])
grid.arrange(p1,p2,p3,p4,nrow=1)
In [ ]:
In [52]:
%%R
# test run of functions
files = get.files(tbl[5,'dirs'], base.path=inFileDir, pattern='OTU_n2_abs1e9_sub-norm_filt.physeq')
otu.tbls = get.otu.tbls(files)
otu.tbl = join.otu.tbls(otu.tbls)
# making metadata
meta = make.meta(otu.tbl)
ord = metaMDS(otu.tbl %>% t)
# getting ellipse
plot(ord, type = "p", display='sites')
elps = ordiellipse(ord, groups=meta$groups, kind="se", conf=0.95, lwd=2, col="blue")
# edit ellipse file
elps.tbl = edit.ellipse(elps)
In [44]:
%%R
# ordination of combined OTU table
ord = metaMDS(otu.tbl %>% t)
In [45]:
%%R
plot(ord, type = "p", display='sites')
elps = ordiellipse(ord, groups=meta$groups, kind="se", conf=0.95, lwd=2, col="blue")
In [46]:
%%R
# midpoint function
mid = function(x, y){ (x + y)/2 }
elps.tbl = elps %>% summary %>% t %>% as.data.frame
elps.tbl$meta = rownames(elps.tbl)
elps.tbl = elps.tbl %>%
separate(meta, c('library','BD.range'), sep='__') %>%
mutate(BD.range = gsub('^.', '', BD.range),
BD.range = gsub('.$', '', BD.range)) %>%
separate(BD.range, c('BD.min','BD.max'), sep=',', remove=F) %>%
mutate(
BD.min = as.numeric(BD.min),
BD.max = as.numeric(BD.max),
BD.mid = mapply(mid, BD.min, BD.max))
elps.tbl %>% head
In [47]:
%%R -h 400
ggplot(elps.tbl, aes(NMDS1, NMDS2, color=library, size=BD.mid)) +
geom_point() +
scale_size_continuous(range=c(2,7)) +
theme_bw() +
theme(
text = element_text(size=16)
)
In [ ]:
In [22]:
%%R -i inFileDir
files = list.files(inFileDir, pattern='OTU_n2_abs1e9_sub-norm_filt.physeq', recursive=T) %>% as.data.frame
colnames(files) = c('files')
files %>% head
In [24]:
%%R
# file metadata
percTaxa.to.keep = c(1,10,25,50)
tbl = files %>%
separate(files, c('percIncorp','percTaxa','rep','file'), sep='/', remove=F) %>%
filter(percIncorp == 100, percTaxa %in% percTaxa.to.keep)
tbl %>% nrow %>% print
tbl %>% head
In [25]:
%%R -i workDir
tbl.l = list()
for(i in tbl$files){
p = paste(c(workDir, 'atomIncorp_taxaIncorp', i), collapse='/')
ord = readRDS(p)
id = gsub('/[^/]+$', '', i)
id = gsub('/', '__', id)
id = paste(c(id,'__'), collapse='')
otu.tbl = ord %>% otu_table %>% as.data.frame
colnames(otu.tbl) = gsub('^', id, colnames(otu.tbl))
otu.tbl$taxon = rownames(otu.tbl)
tbl.l[[i]] = otu.tbl
# otu.tbl %>% head %>% print; break
#samp.data = tbl.l[[i]] %>% sample_data %>% as.data.frame
#samp.data$file = i
#sample_data(tbl.l[[i]]) = samp.data
#ord = ordinate(physeq, method='NMDS', distance='bray')
#tbl.l[[i]] = ord
}
exmp = tbl.l[[names(tbl.l)[1]]]
In [10]:
%%R
# joining all OTU tables
n = names(tbl.l)
tbl = tbl.l[[n[1]]]
for (i in seq(2:length(n))){
n.i = n[i]
tbl = left_join(tbl, tbl.l[[n.i]], c('taxon'='taxon'))
}
# edit table
tbl[is.na(tbl)] = 0
rownames(tbl) = tbl$taxon
tbl$taxon = NULL
In [11]:
%%R
tbl %>% ncol %>% print
tbl %>% nrow %>% print
tbl[1:5,1:5]
In [12]:
%%R
# creating metadata table
meta = tbl %>% colnames %>% as.data.frame
colnames(meta) = c('X')
meta = filter(meta, X != 'taxon')
# midpoint function
mid = function(x, y){ (x + y)/2 }
# making fixed BD-range
BD.range = seq(1.6, 1.9, 0.004)
# edit metadata
meta = meta %>%
separate(X, c('percIncorp','percTaxa','rep','library','BD_range'), sep='__', remove=F) %>%
mutate(BD_range = gsub('.x$', '', BD_range)) %>%
separate(BD_range, c('BD.min', 'BD.max'), sep='-') %>%
mutate(BD.min = as.numeric(BD.min),
BD.max = as.numeric(BD.max),
BD.mid = mapply(mid, BD.min, BD.max),
BD.bin = cut(BD.mid, breaks=BD.range))
meta %>% head
In [14]:
%%R
# test table
tbl.test = tbl[,1:100]
In [15]:
%%R
# ordination of combined OTU table
ord = metaMDS(tbl.test %>% t)
## Start from previous best solution
ord = metaMDS(tbl.test %>% t, previous.best = ord)
In [16]:
%%R
plot(ord, type = "p", display='sites')
elps = ordiellipse(ord, groups=meta$BD.bin, kind="se", conf=0.95, lwd=2, col="blue")
In [17]:
%%R
elps = elps %>% summary %>% t %>% as.data.frame
elps
In [18]:
%%R
ggplot(elps, aes(NMDS1, NMDS2)) +
geom_point()
In [40]:
%%R
# getting list of BD-value tables
# making fixed BD-range
BD.range = seq(1.6, 1.9, 0.004)
# midpoint function
mid = function(x, y){ (x + y)/2 }
# defining BD range & binning
get.BD.range = function(ord, BD.range){
tbl = ord %>% scores %>% as.data.frame
# metadata from rowname
tbl$lib = gsub('__.+', '', rownames(tbl)) %>% as.character
tbl$BD.start = gsub('.+__([0-9.]+)-.+', '\\1', rownames(tbl)) %>% as.numeric
tbl$BD.end = gsub('.+-', '', rownames(tbl)) %>% as.numeric
tbl$BD.mid = mapply(mid, tbl$BD.start, tbl$BD.end)
# binning BD.mid
tbl$BD.bin = cut(tbl$BD.mid, breaks=BD.range)
tbl = unite(tbl, group, lib, BD.bin, sep='__')
return(tbl)
}
tbl.l.BD = lapply(tbl.l, get.BD.range, BD.range=BD.range)
In [41]:
%%R
# combining lists
tbl.l.c = list()
for(f in names(tbl.l)){
tbl.l.c[[f]][['ord']] = tbl.l[[f]]
tbl.l.c[[f]][['meta']] = tbl.l.BD[[f]]
}
In [43]:
%%R
get.ellipse = function(x){
meta = x$meta
ord = x$ord
plot(ord, type = "p", display='sites')
elps = ordiellipse(ord, meta$group, kind="se", conf=0.95, lwd=2, col="blue")
elps = elps %>% summary %>% t %>% as.data.frame
return(elps)
}
tbl.elps = lapply(tbl.l.c, get.ellipse)
In [ ]:
In [39]:
%%R
tbl = do.call(rbind, tbl.elps)
tbl %>% head
In [ ]: