In [248]:
%load_ext rpy2.ipython
In [249]:
%%R
workDir = '/home/nick/notebook/SIPSim/dev/fullCyc/'
physeqDir = '/home/nick/notebook/SIPSim/dev/fullCyc_trim/'
physeqBulkCore = 'bulk-core_trm'
physeqSIP = 'SIP-core_unk_trm'
In [286]:
ampFragFile = '/home/nick/notebook/SIPSim/dev/bac_genome1147/validation/ampFrags_kde.pkl'
In [174]:
import os
In [250]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(phyloseq)
library(fitdistrplus)
library(sads)
In [251]:
%%R
dir.create(workDir, showWarnings=FALSE)
In [252]:
%%R
# bulk core samples
F = file.path(physeqDir, physeqBulkCore)
physeq.bulk = readRDS(F)
#physeq.bulk.m = physeq.bulk %>% sample_data
physeq.bulk %>% names
In [253]:
%%R
# SIP core samples
F = file.path(physeqDir, physeqSIP)
physeq.SIP = readRDS(F)
#physeq.SIP.m = physeq.SIP %>% sample_data
physeq.SIP %>% names
In [29]:
%%R
physeq2otu.long = function(physeq){
df.OTU = physeq %>%
transform_sample_counts(function(x) x/sum(x)) %>%
otu_table %>%
as.matrix %>%
as.data.frame
df.OTU$OTU = rownames(df.OTU)
df.OTU = df.OTU %>%
gather('sample', 'abundance', 1:(ncol(df.OTU)-1))
return(df.OTU)
}
df.OTU.l = lapply(physeq.bulk, physeq2otu.long)
df.OTU.l %>% names
#df.OTU = do.call(rbind, lapply(physeq.bulk, physeq2otu.long))
#df.OTU$Day = gsub('.+\\.D([0-9]+)\\.R.+', '\\1', df.OTU$sample)
#df.OTU %>% head(n=3)
In [31]:
%%R -w 450 -h 400
lapply(df.OTU.l, function(x) descdist(x$abundance, boot=1000))
In [48]:
%%R
fitdists = function(x){
fit.l = list()
#fit.l[['norm']] = fitdist(x$abundance, 'norm')
fit.l[['exp']] = fitdist(x$abundance, 'exp')
fit.l[['logn']] = fitdist(x$abundance, 'lnorm')
fit.l[['gamma']] = fitdist(x$abundance, 'gamma')
fit.l[['beta']] = fitdist(x$abundance, 'beta')
# plotting
plot.legend = c('exponential', 'lognormal', 'gamma', 'beta')
par(mfrow = c(2,1))
denscomp(fit.l, legendtext=plot.legend)
qqcomp(fit.l, legendtext=plot.legend)
# fit summary
gofstat(fit.l, fitnames=plot.legend) %>% print
return(fit.l)
}
fits.l = lapply(df.OTU.l, fitdists)
fits.l %>% names
In [50]:
%%R
# getting summaries for lognormal fits
get.summary = function(x, id='logn'){
summary(x[[id]])
}
fits.s = lapply(fits.l, get.summary)
fits.s %>% names
In [74]:
%%R
# listing estimates for fits
df.fits = do.call(rbind, lapply(fits.s, function(x) x$estimate)) %>% as.data.frame
df.fits$Sample = rownames(df.fits)
df.fits$Day = gsub('.+D([0-9]+)\\.R.+', '\\1', df.fits$Sample) %>% as.numeric
df.fits
In [78]:
%%R -w 650 -h 300
ggplot(df.fits, aes(Day, meanlog,
ymin=meanlog-sdlog,
ymax=meanlog+sdlog)) +
geom_pointrange() +
geom_line() +
theme_bw() +
theme(
text = element_text(size=16)
)
In [60]:
%%R
# mean of estimaates
apply(df.fits, 2, mean)
In [102]:
%%R -w 800
df.OTU = do.call(rbind, df.OTU.l) %>%
mutate(abundance = abundance * 100) %>%
group_by(sample) %>%
mutate(rank = row_number(desc(abundance))) %>%
ungroup() %>%
filter(rank < 10)
ggplot(df.OTU, aes(rank, abundance, color=sample, group=sample)) +
geom_point() +
geom_line() +
labs(y = '% rel abund')
In [272]:
%%R -w 800 -h 300
df.OTU = do.call(rbind, df.OTU.l) %>%
mutate(abundance = abundance * 100) %>%
group_by(sample) %>%
mutate(rank = row_number(desc(abundance))) %>%
group_by(rank) %>%
summarize(mean_abundance = mean(abundance)) %>%
ungroup() %>%
mutate(library = 1,
mean_abundance = mean_abundance / sum(mean_abundance) * 100) %>%
rename('rel_abund_perc' = mean_abundance) %>%
dplyr::select(library, rel_abund_perc, rank) %>%
as.data.frame
df.OTU %>% nrow %>% print
ggplot(df.OTU, aes(rank, rel_abund_perc)) +
geom_point() +
geom_line() +
labs(y = 'mean % rel abund')
In [273]:
ret = !SIPSim KDE_info -t /home/nick/notebook/SIPSim/dev/bac_genome1147/validation/ampFrags_kde.pkl
ret = ret[1:]
ret[:5]
Out[273]:
In [274]:
%%R
F = '/home/nick/notebook/SIPSim/dev/fullCyc_trim//ampFrags_kde_amplified.txt'
ret = read.delim(F, sep='\t')
ret = ret$genomeID
ret %>% length %>% print
ret %>% head
In [275]:
%%R
ret %>% length %>% print
df.OTU %>% nrow
In [276]:
%%R -i ret
# randomize
ret = ret %>% sample %>% sample %>% sample
# adding to table
df.OTU$taxon_name = ret[1:nrow(df.OTU)]
df.OTU = df.OTU %>%
dplyr::select(library, taxon_name, rel_abund_perc, rank)
df.OTU %>% head
In [245]:
%%R
#-- debug -- #
df.gc = read.delim('~/notebook/SIPSim/dev/bac_genome1147/validation/ampFrags_parsed_kde_info.txt',
sep='\t', row.names=)
top.taxa = df.gc %>%
filter(KDE_ID == 1, median > 1.709, median < 1.711) %>%
dplyr::select(taxon_ID) %>%
mutate(taxon_ID = taxon_ID %>% sample) %>%
head
top.taxa = top.taxa$taxon_ID %>% as.vector
top.taxa
In [246]:
%%R
#-- debug -- #
p1 = df.OTU %>%
filter(taxon_name %in% top.taxa)
p2 = df.OTU %>%
head(n=length(top.taxa))
p3 = anti_join(df.OTU, rbind(p1, p2), c('taxon_name' = 'taxon_name'))
df.OTU %>% nrow %>% print
p1 %>% nrow %>% print
p2 %>% nrow %>% print
p3 %>% nrow %>% print
p1 = p2$taxon_name
p2$taxon_name = top.taxa
df.OTU = rbind(p2, p1, p3)
df.OTU %>% nrow %>% print
df.OTU %>% head
In [278]:
%%R
F = file.path(workDir, 'fullCyc_12C-Con_trm_comm.txt')
write.table(df.OTU, F, sep='\t', quote=FALSE, row.names=FALSE)
cat('File written:', F, '\n')
In [280]:
ampFragFile
Out[280]:
In [283]:
!ls -thlc
In [285]:
!tail -n +2 /home/nick/notebook/SIPSim/dev/fullCyc/fullCyc_12C-Con_trm_comm.txt | \
cut -f 2 > /home/nick/notebook/SIPSim/dev/fullCyc/fullCyc_12C-Con_trm_comm_taxa.txt
In [289]:
outFile = os.path.splitext(ampFragFile)[0] + '_parsed.pkl'
!SIPSim KDE_parse \
$ampFragFile \
/home/nick/notebook/SIPSim/dev/fullCyc/fullCyc_12C-Con_trm_comm_taxa.txt \
> $outFile
print 'File written {}'.format(outFile)
!SIPSim KDE_info -n $outFile
In [ ]: