In [2]:
baseDir = '/home/nick/notebook/SIPSim/dev/priming_exp/'
workDir = os.path.join(baseDir, 'exp_info')
otuTableFile = '/var/seq_data/priming_exp/data/otu_table.txt'
otuTableSumFile = '/var/seq_data/priming_exp/data/otu_table_summary.txt'
metaDataFile = '/var/seq_data/priming_exp/data/allsample_metadata_nomock.txt'
#otuRepFile = '/var/seq_data/priming_exp/otusn.pick.fasta'
#otuTaxFile = '/var/seq_data/priming_exp/otusn_tax/otusn_tax_assignments.txt'
#genomeDir = '/home/nick/notebook/SIPSim/dev/bac_genome1210/genomes/'
In [3]:
import glob
In [4]:
%load_ext rpy2.ipython
In [5]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)
library(fitdistrplus)
In [6]:
if not os.path.isdir(workDir):
os.makedirs(workDir)
In [46]:
%%R -i otuTableFile
tbl = read.delim(otuTableFile, sep='\t')
# filter
tbl = tbl %>%
select(ends_with('.NA'))
tbl %>% ncol %>% print
tbl[1:4,1:4]
In [47]:
%%R
tbl.h = tbl %>%
gather('sample', 'count', 1:ncol(tbl)) %>%
separate(sample, c('isotope','treatment','day','rep','fraction'), sep='\\.', remove=F)
tbl.h %>% head
In [51]:
%%R -w 900 -h 400
tbl.h.s = tbl.h %>%
group_by(sample) %>%
summarize(total_count = sum(count)) %>%
separate(sample, c('isotope','treatment','day','rep','fraction'), sep='\\.', remove=F)
ggplot(tbl.h.s, aes(day, total_count, color=rep %>% as.character)) +
geom_point() +
facet_grid(isotope ~ treatment) +
theme(
text = element_text(size=16)
)
In [69]:
%%R
tbl.h.s$sample[grepl('700', tbl.h.s$sample)] %>% as.vector %>% sort
In [10]:
%%R
# bulk soil samples for gradients to simulate
samples.to.use = c(
"X12C.700.14.05.NA",
"X12C.700.28.03.NA",
"X12C.700.45.01.NA",
"X13C.700.14.08.NA",
"X13C.700.28.06.NA",
"X13C.700.45.01.NA"
)
In [29]:
%%R -i otuTableFile
tbl = read.delim(otuTableFile, sep='\t')
# filter
tbl = tbl %>%
select(ends_with('.NA'))
tbl$OTUId = rownames(tbl)
tbl %>% ncol %>% print
tbl[1:4,1:4]
In [30]:
%%R
tbl.h = tbl %>%
gather('sample', 'count', 1:(ncol(tbl)-1)) %>%
separate(sample, c('isotope','treatment','day','rep','fraction'), sep='\\.', remove=F)
tbl.h %>% head
In [31]:
%%R -w 800
tbl.s = tbl.h %>%
filter(count > 0) %>%
group_by(sample, isotope, treatment, day, rep, fraction) %>%
summarize(n_taxa = n())
ggplot(tbl.s, aes(day, n_taxa, color=rep %>% as.character)) +
geom_point() +
facet_grid(isotope ~ treatment) +
theme_bw() +
theme(
text = element_text(size=16),
axis.text.x = element_blank()
)
In [33]:
%%R -w 800 -h 350
# filter to just target samples
tbl.s.f = tbl.s %>% filter(sample %in% samples.to.use)
ggplot(tbl.s.f, aes(day, n_taxa, fill=rep %>% as.character)) +
geom_bar(stat='identity') +
facet_grid(. ~ isotope) +
labs(y = 'Number of taxa') +
theme_bw() +
theme(
text = element_text(size=16),
axis.text.x = element_blank()
)
In [82]:
%%R
message('Bulk soil total observed richness: ')
tbl.s.f %>% select(-fraction) %>% as.data.frame %>% print
In [49]:
%%R -i otuTableFile
# loading OTU table
tbl = read.delim(otuTableFile, sep='\t') %>%
select(-ends_with('.NA'))
tbl.h = tbl %>%
gather('sample', 'count', 2:ncol(tbl)) %>%
separate(sample, c('isotope','treatment','day','rep','fraction'), sep='\\.', remove=F)
tbl.h %>% head
In [56]:
%%R
# basename of fractions
samples.to.use.base = gsub('\\.[0-9]+\\.NA', '', samples.to.use)
samps = tbl.h$sample %>% unique
fracs = sapply(samples.to.use.base, function(x) grep(x, samps, value=TRUE))
for (n in names(fracs)){
n.frac = length(fracs[[n]])
cat(n, '-->', 'Number of fraction samples: ', n.frac, '\n')
}
In [79]:
%%R
# function for getting all OTUs in a sample
n.OTUs = function(samples, otu.long){
otu.long.f = otu.long %>%
filter(sample %in% samples,
count > 0)
n.OTUs = otu.long.f$OTUId %>% unique %>% length
return(n.OTUs)
}
num.OTUs = lapply(fracs, n.OTUs, otu.long=tbl.h)
num.OTUs = do.call(rbind, num.OTUs) %>% as.data.frame
colnames(num.OTUs) = c('n_taxa')
num.OTUs$sample = rownames(num.OTUs)
num.OTUs
In [75]:
%%R
tbl.s.f %>% as.data.frame
In [91]:
%%R
# joining with bulk soil sample summary table
num.OTUs$data = 'fractions'
tbl.s.f$data = 'bulk_soil'
tbl.j = rbind(num.OTUs,
tbl.s.f %>% ungroup %>% select(sample, n_taxa, data)) %>%
mutate(isotope = gsub('X|\\..+', '', sample),
sample = gsub('\\.[0-9]+\\.NA', '', sample))
tbl.j
In [95]:
%%R -h 300 -w 800
ggplot(tbl.j, aes(sample, n_taxa, fill=data)) +
geom_bar(stat='identity', position='dodge') +
facet_grid(. ~ isotope, scales='free_x') +
labs(y = 'Number of OTUs') +
theme(
text = element_text(size=16)
# axis.text.x = element_text(angle=90)
)
In [145]:
%%R -i otuTableFile
tbl = read.delim(otuTableFile, sep='\t')
# filter
tbl = tbl %>%
select(-ends_with('.NA'))
tbl %>% ncol %>% print
tbl[1:4,1:4]
In [146]:
%%R
tbl.h = tbl %>%
gather('sample', 'count', 2:ncol(tbl)) %>%
separate(sample, c('isotope','treatment','day','rep','fraction'), sep='\\.', remove=F)
tbl.h %>% head
In [147]:
%%R -h 400
tbl.h.s = tbl.h %>%
group_by(sample) %>%
summarize(total_seqs = sum(count))
p = ggplot(tbl.h.s, aes(total_seqs)) +
theme_bw() +
theme(
text = element_text(size=16)
)
p1 = p + geom_histogram(binwidth=200)
p2 = p + geom_density()
grid.arrange(p1,p2,ncol=1)
In [152]:
%%R -w 700 -h 350
plotdist(tbl.h.s$total_seqs)
In [153]:
%%R -w 450 -h 400
descdist(tbl.h.s$total_seqs, boot=1000)
In [154]:
%%R
f.n = fitdist(tbl.h.s$total_seqs, 'norm')
f.ln = fitdist(tbl.h.s$total_seqs, 'lnorm')
f.ll = fitdist(tbl.h.s$total_seqs, 'logis')
#f.c = fitdist(tbl.s$count, 'cauchy')
f.list = list(f.n, f.ln, f.ll)
plot.legend = c('normal', 'log-normal', 'logistic')
par(mfrow = c(2,1))
denscomp(f.list, legendtext=plot.legend)
qqcomp(f.list, legendtext=plot.legend)
In [155]:
%%R
gofstat(list(f.n, f.ln, f.ll), fitnames=plot.legend)
In [156]:
%%R
summary(f.ln)
In [201]:
%%R -i otuTableFile
tbl = read.delim(otuTableFile, sep='\t')
# filter
tbl = tbl %>%
select(-ends_with('.NA')) %>%
select(-starts_with('X0MC'))
tbl = tbl %>%
gather('sample', 'count', 2:ncol(tbl)) %>%
mutate(sample = gsub('^X', '', sample))
tbl %>% head
In [202]:
%%R
# summarize
tbl.s = tbl %>%
group_by(sample) %>%
summarize(total_count = sum(count))
tbl.s %>% head(n=3)
In [203]:
%%R -i metaDataFile
tbl.meta = read.delim(metaDataFile, sep='\t')
tbl.meta %>% head(n=3)
In [186]:
%%R -w 700
tbl.j = inner_join(tbl.s, tbl.meta, c('sample' = 'Sample'))
ggplot(tbl.j, aes(Density, total_count, color=rep)) +
geom_point() +
facet_grid(Treatment ~ Day)
In [184]:
%%R -w 600 -h 350
ggplot(tbl.j, aes(Density, total_count)) +
geom_point(aes(color=Treatment)) +
geom_smooth(method='lm') +
labs(x='Buoyant density', y='Total sequences') +
theme_bw() +
theme(
text = element_text(size=16)
)
In [208]:
%%R
tbl.s = tbl %>%
filter(count > 0) %>%
group_by(sample) %>%
summarize(n_taxa = sum(count > 0))
tbl.j = inner_join(tbl.s, tbl.meta, c('sample' = 'Sample'))
tbl.j %>% head(n=3)
In [217]:
%%R -w 900 -h 600
ggplot(tbl.j, aes(Density, n_taxa, fill=rep, color=rep)) +
#geom_area(stat='identity', alpha=0.5, position='dodge') +
geom_point() +
geom_line() +
labs(x='Buoyant density', y='Number of taxa') +
facet_grid(Treatment ~ Day) +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
In [134]:
%%R -i otuTableFile
# loading OTU table
tbl = read.delim(otuTableFile, sep='\t')
# filter
tbl = tbl %>%
select(matches('OTUId'), ends_with('.NA'))
tbl %>% ncol %>% print
tbl[1:4,1:4]
In [135]:
%%R
# long table format w/ selecting samples of interest
tbl.h = tbl %>%
gather('sample', 'count', 2:ncol(tbl)) %>%
separate(sample, c('isotope','treatment','day','rep','fraction'), sep='\\.', remove=F) %>%
filter(sample %in% samples.to.use,
count > 0)
tbl.h %>% head
In [136]:
%%R
message('Number of samples: ', tbl.h$sample %>% unique %>% length)
message('Number of OTUs: ', tbl.h$OTUId %>% unique %>% length)
In [137]:
%%R
tbl.hs = tbl.h %>%
group_by(OTUId) %>%
summarize(
total_count = sum(count),
mean_count = mean(count),
median_count = median(count),
sd_count = sd(count)
) %>%
filter(total_count > 0)
tbl.hs %>% head
In [140]:
%%R -i workDir
setwd(workDir)
samps = tbl.h$sample %>% unique %>% as.vector
for(samp in samps){
outFile = paste(c(samp, 'OTU.txt'), collapse='_')
tbl.p = tbl.h %>%
filter(sample == samp, count > 0)
write.table(tbl.p, outFile, sep='\t', quote=F, row.names=F)
message('Table written: ', outFile)
message(' Number of OTUs: ', tbl.p %>% nrow)
}
In [133]:
p = os.path.join(workDir, '*_OTU.txt')
files = glob.glob(p)
baseDir = os.path.split(workDir)[0]
newDirs = [os.path.split(x)[1].rstrip('.NA_OTU.txt') for x in files]
newDirs = [os.path.join(baseDir, x) for x in newDirs]
for newDir,f in zip(newDirs, files):
if not os.path.isdir(newDir):
print 'Making new directory: {}'.format(newDir)
os.makedirs(newDir)
else:
print 'Directory exists: {}'.format(newDir)
# symlinking file
linkPath = os.path.join(newDir, os.path.split(f)[1])
if not os.path.islink(linkPath):
os.symlink(f, linkPath)
In [7]:
%%R -i otuTableFile
tbl = read.delim(otuTableFile, sep='\t')
# filter
tbl = tbl %>%
select(matches('OTUId'), ends_with('.NA'))
tbl %>% ncol %>% print
tbl[1:4,1:4]
In [9]:
%%R
# long table format w/ selecting samples of interest
tbl.h = tbl %>%
gather('sample', 'count', 2:ncol(tbl)) %>%
separate(sample, c('isotope','treatment','day','rep','fraction'), sep='\\.', remove=F) %>%
filter(sample %in% samples.to.use,
count > 0)
tbl.h %>% head
In [25]:
%%R
# ranks of relative abundances
tbl.r = tbl.h %>%
group_by(sample) %>%
mutate(perc_rel_abund = count / sum(count) * 100,
rank = row_number(-perc_rel_abund)) %>%
unite(day_rep, day, rep, sep='-')
tbl.r %>% as.data.frame %>% head(n=3)
In [27]:
%%R -w 900 -h 350
ggplot(tbl.r, aes(rank, perc_rel_abund)) +
geom_point() +
# labs(x='Buoyant density', y='Number of taxa') +
facet_wrap(~ day_rep) +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
In [22]:
%%R -i otuTableFile
tbl = read.delim(otuTableFile, sep='\t')
# filter
tbl = tbl %>%
select(-ends_with('.NA')) %>%
select(-starts_with('X0MC'))
tbl = tbl %>%
gather('sample', 'count', 2:ncol(tbl)) %>%
mutate(sample = gsub('^X', '', sample))
tbl %>% head
In [43]:
%%R
tbl.ar = tbl %>%
#mutate(fraction = gsub('.+\\.', '', sample) %>% as.numeric) %>%
#mutate(treatment = gsub('(.+)\\..+', '\\1', sample)) %>%
group_by(sample) %>%
mutate(rel_abund = count / sum(count)) %>%
summarize(abund_range = max(rel_abund) - min(rel_abund)) %>%
ungroup() %>%
separate(sample, c('isotope','treatment','day','rep','fraction'), sep='\\.', remove=F)
tbl.ar %>% head(n=3)
In [47]:
%%R -w 800
tbl.ar = tbl.ar %>%
mutate(fraction = as.numeric(fraction))
ggplot(tbl.ar, aes(fraction, abund_range, fill=rep, color=rep)) +
geom_point() +
geom_line() +
labs(x='Buoyant density', y='relative abundanc range') +
facet_grid(treatment ~ day) +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
In [96]:
%%R -i otuTableFile
# loading OTU table
tbl = read.delim(otuTableFile, sep='\t') %>%
select(-ends_with('.NA'))
tbl.h = tbl %>%
gather('sample', 'count', 2:ncol(tbl)) %>%
separate(sample, c('isotope','treatment','day','rep','fraction'), sep='\\.', remove=F)
tbl.h %>% head
In [100]:
%%R
# basename of fractions
samples.to.use.base = gsub('\\.[0-9]+\\.NA', '', samples.to.use)
samps = tbl.h$sample %>% unique
fracs = sapply(samples.to.use.base, function(x) grep(x, samps, value=TRUE))
for (n in names(fracs)){
n.frac = length(fracs[[n]])
cat(n, '-->', 'Number of fraction samples: ', n.frac, '\n')
}
In [103]:
%%R
# function for getting mean OTU abundance from all fractions
OTU.abund = function(samples, otu.long){
otu.rel.abund = otu.long %>%
filter(sample %in% samples,
count > 0) %>%
ungroup() %>%
group_by(sample) %>%
mutate(total_count = sum(count)) %>%
ungroup() %>%
mutate(perc_abund = count / total_count * 100) %>%
group_by(OTUId) %>%
summarize(mean_perc_abund = mean(perc_abund),
median_perc_abund = median(perc_abund),
max_perc_abund = max(perc_abund))
return(otu.rel.abund)
}
## calling function
otu.rel.abund = lapply(fracs, OTU.abund, otu.long=tbl.h)
otu.rel.abund = do.call(rbind, otu.rel.abund) %>% as.data.frame
otu.rel.abund$sample = gsub('\\.[0-9]+$', '', rownames(otu.rel.abund))
otu.rel.abund %>% head
In [118]:
%%R -h 600 -w 900
# plotting
otu.rel.abund.l = otu.rel.abund %>%
gather('abund_stat', 'value', mean_perc_abund, median_perc_abund, max_perc_abund)
otu.rel.abund.l$OTUId = reorder(otu.rel.abund.l$OTUId, -otu.rel.abund.l$value)
ggplot(otu.rel.abund.l, aes(OTUId, value, color=abund_stat)) +
geom_point(shape='O', alpha=0.7) +
scale_y_log10() +
facet_grid(abund_stat ~ sample) +
theme_bw() +
theme(
text = element_text(size=16),
axis.text.x = element_blank(),
legend.position = 'none'
)
In [123]:
%%R -i workDir
setwd(workDir)
# each sample is a file
samps = otu.rel.abund.l$sample %>% unique %>% as.vector
for(samp in samps){
outFile = paste(c(samp, 'frac_OTU.txt'), collapse='_')
tbl.p = otu.rel.abund %>%
filter(sample == samp, mean_perc_abund > 0)
write.table(tbl.p, outFile, sep='\t', quote=F, row.names=F)
cat('Table written: ', outFile, '\n')
cat(' Number of OTUs: ', tbl.p %>% nrow, '\n')
}
In [ ]: