In [29]:
workDir = '/home/nick/notebook/SIPSim/dev/priming_exp/validation_sample/X12C.700.14.05_default/'
genomeDir = '/home/nick/notebook/SIPSim/dev/priming_exp/genomes/'
allAmpFrags = '/home/nick/notebook/SIPSim/dev/bac_genome1210/validation/ampFrags.pkl'
otuTableFile = '/var/seq_data/priming_exp/data/otu_table.txt'
metaDataFile = '/var/seq_data/priming_exp/data/allsample_metadata_nomock.txt'
primerFile = '/home/nick/notebook/SIPSim/dev/515F-806R.fna'
cdhit_dir = '/home/nick/notebook/SIPSim/dev/priming_exp/CD-HIT/'
R_dir = '/home/nick/notebook/SIPSim/lib/R/'
figureDir = '/home/nick/notebook/SIPSim/figures/'
# simulation params
comm_richness = 2170
seq_per_fraction = ['lognormal', 10.096, 1.116]
# for making genome_map file for genome fragment simulation
taxonMapFile = os.path.join(cdhit_dir, 'target_taxa.txt')
genomeFilterFile = os.path.join(cdhit_dir, 'genomeFile_seqID_filt.txt')
abundFile = os.path.join(workDir, 'X12C.700.14.05.NA_OTU.txt')
# misc
nprocs = 20
In [7]:
import glob
import cPickle as pickle
import copy
from IPython.display import Image
In [8]:
%load_ext rpy2.ipython
In [9]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)
In [10]:
if not os.path.isdir(workDir):
os.makedirs(workDir)
In [11]:
%%R -i otuTableFile
tbl.otu = read.delim(otuTableFile, sep='\t')
tbl.otu[1:3,1:5]
In [12]:
%%R
tbl.otu.w = tbl.otu %>%
gather('sample', 'count', 2:ncol(tbl.otu)) %>%
group_by(sample) %>%
mutate(rel_abund_perc = count / sum(count) * 100) %>%
ungroup %>%
filter(count > 0, sample == 'X12C.700.14.05.NA') %>%
mutate(library = '1', rank = row_number(-rel_abund_perc)) %>%
select(library, 'taxon_name' = OTUId, rel_abund_perc, rank) %>%
arrange(rank)
tbl.otu.w %>% head(n=5)
In [14]:
%%R -i comm_richness
# number of OTUs
n.OTUs = tbl.otu.w$taxon_name %>% unique %>% length
cat('Number of OTUs: ', n.OTUs, '\n')
# assertion
cat('Community richness = number of OTUs? ', comm_richness == n.OTUs, '\n')
In [15]:
%%R -i workDir
commFile = paste(c(workDir, 'comm.txt'), collapse='/')
write.table(tbl.otu.w, commFile, sep='\t', quote=F, row.names=F)
In [16]:
%%R -i workDir
commFile = paste(c(workDir, 'comm.txt'), collapse='/')
comm = read.delim(commFile, sep='\t')
comm %>% head
In [17]:
%%R -w 900 -h 350
ggplot(comm, aes(rank, rel_abund_perc)) +
geom_point() +
labs(x='Rank', y='% relative abundance', title='Priming experiment community abundance distribution') +
theme_bw() +
theme(
text = element_text(size=16)
)
In [19]:
%%R -i taxonMapFile -i genomeFilterFile
taxonMap = read.delim(taxonMapFile, sep='\t') %>%
select(target_genome, OTU) %>%
distinct()
taxonMap %>% nrow %>% print
taxonMap %>% head(n=3) %>% print
breaker = '----------------\n'
cat(breaker)
genomeFilter = read.delim(genomeFilterFile, sep='\t', header=F)
genomeFilter %>% nrow %>% print
genomeFilter %>% head(n=3) %>% print
cat(breaker)
comm = read.delim(commFile, sep='\t')
comm %>% nrow %>% print
comm %>% head(n=3) %>% print
In [20]:
%%R
taxonMap$OTU %>% table %>% sort(decreasing=T) %>% head
In [21]:
%%R
tbl.j = inner_join(taxonMap, genomeFilter, c('target_genome' = 'V1')) %>%
rename('fasta_file' = V2) %>%
select(OTU, fasta_file, target_genome)
tbl.j %>% head(n=3)
In [22]:
%%R
tbl.j$OTU %>% table %>% sort(decreasing=T) %>% head
In [25]:
%%R
tbl.j2 = inner_join(tbl.j, comm, c('OTU' = 'taxon_name'))
n.target.genomes = tbl.j2$OTU %>% unique %>% length
cat('Number of target OTUs: ', n.target.genomes, '\n')
cat('--------', '\n')
tbl.j2 %>% head(n=3)
In [26]:
%%R -i workDir
outFile = paste(c(workDir, 'target_genome_index.txt'), collapse='/')
write.table(tbl.j2, outFile, sep='\t', quote=F, row.names=F, col.names=F)
In [27]:
%%R -w 900 -h 350
ggplot(tbl.j2, aes(rank, rel_abund_perc)) +
geom_point(size=3, shape='O', color='red') +
labs(x='Rank', y='% relative abundance', title='Priming experiment community abundance distribution') +
theme_bw() +
theme(
text = element_text(size=16)
)
In [30]:
!cd $workDir; \
SIPSim fragments \
target_genome_index.txt \
--fp $genomeDir \
--fr $primerFile \
--fld skewed-normal,9000,2500,-5 \
--flr None,None \
--nf 10000 \
--np $nprocs \
2> ampFrags.log \
> ampFrags.pkl
In [284]:
%%R -i workDir
# loading files
## target genome index (just OTUs with associated genome)
inFile = paste(c(workDir, 'target_genome_index.txt'), collapse='/')
tbl.target = read.delim(inFile, sep='\t', header=F)
colnames(tbl.target) = c('OTUId', 'fasta_file', 'genome_name')
## comm file of total community OTUs
commFile = paste(c(workDir, 'comm.txt'), collapse='/')
tbl.comm = read.delim(commFile, sep='\t')
In [285]:
%%R
# just OTUs w/out an associated genome
tbl.j = anti_join(tbl.comm, tbl.target, c('taxon_name' = 'OTUId'))
n.nontarget.genomes = tbl.j$taxon_name %>% length
message('Number of non-target genomes: ', n.nontarget.genomes)
message('---------')
tbl.j %>% head(n=5)
In [286]:
%%R -i comm_richness
# checking assumptions
message('Target + nonTarget richness = total community richness?: ',
n.target.genomes + n.nontarget.genomes == comm_richness)
In [287]:
%%R -i workDir
# writing out non-target OTU file
outFile = paste(c(workDir, 'comm_nonTarget.txt'), collapse='/')
write.table(tbl.j, outFile, sep='\t', quote=F, row.names=F)
In [288]:
# List of non-target OTUs
inFile = os.path.join(workDir, 'comm_nonTarget.txt')
nonTarget = pd.read_csv(inFile, sep='\t')['taxon_name'].tolist()
print 'Number of non-target OTUs: {}'.format(len(nonTarget))
nonTarget[:4]
Out[288]:
In [289]:
# loading amplicon fragments from full genome KDE dataset
inFile = os.path.join(workDir, 'ampFrags.pkl')
ampFrag_target = []
with open(inFile, 'rb') as iFH:
ampFrag_target = pickle.load(iFH)
print 'Target OTU richness: {}'.format(len(ampFrag_target))
In [290]:
# loading amplicon fragments from full genome KDE dataset
ampFrag_all = []
with open(allAmpFrags, 'rb') as iFH:
ampFrag_all = pickle.load(iFH)
print 'Count of frag-GC KDEs for all genomes: {}'.format(len(ampFrag_all))
In [291]:
# random selection from list
#target_richness = len(ampFrag_target)
target_richness = len(ampFrag_target)
richness_needed = comm_richness - target_richness
print 'Number of random taxa needed to reach richness: {}'.format(richness_needed)
if richness_needed > 0:
index = range(target_richness)
index = np.random.choice(index, richness_needed)
ampFrag_rand = []
for i in index:
sys.stderr.write('{},'.format(i))
ampFrag_rand.append(copy.deepcopy(ampFrag_all[i]))
else:
ampFrag_rand = []
In [292]:
# renaming randomly selected KDEs by non-target OTU-ID
for i in range(len(ampFrag_rand)):
ampFrag_rand[i][0] = nonTarget[i]
In [293]:
# appending random taxa to target taxa and writing
outFile = os.path.join(workDir, 'ampFrags_wRand.pkl')
with open(outFile, 'wb') as oFH:
x = ampFrag_target + ampFrag_rand
print 'Number of taxa in output: {}'.format(len(x))
pickle.dump(x, oFH)
In [ ]:
!cd $workDir; \
SIPSim fragment_kde \
ampFrags_wRand.pkl \
> ampFrags_wRand_kde.pkl
In [ ]:
!cd $workDir; \
SIPSim diffusion \
ampFrags_wRand_kde.pkl \
--np $nprocs \
> ampFrags_wRand_kde_dif.pkl
In [ ]:
!cd $workDir; \
SIPSim incorpConfigExample \
--percTaxa 0 \
--percIncorpUnif 100 \
> PT0_PI100.config
In [ ]:
!cd $workDir; \
SIPSim isotope_incorp \
ampFrags_wRand_kde_dif.pkl \
PT0_PI100.config \
--comm comm.txt \
--np $nprocs \
> ampFrags_wRand_kde_dif_incorp.pkl
In [ ]:
!cd $workDir; \
SIPSim BD_shift \
ampFrags_wRand_kde_dif.pkl \
ampFrags_wRand_kde_dif_incorp.pkl \
--np $nprocs \
> ampFrags_wRand_kde_dif_incorp_BD-shift.txt
In [ ]:
!cd $workDir; \
SIPSim gradient_fractions \
comm.txt \
> fracs.txt
In [ ]:
!cd $workDir; \
SIPSim OTU_table \
ampFrags_wRand_kde_dif_incorp.pkl \
comm.txt \
fracs.txt \
--abs 1e9 \
--np $nprocs \
> OTU_abs1e9.txt
In [311]:
%%R -i workDir
setwd(workDir)
# loading file
tbl = read.delim('OTU_abs1e9.txt', sep='\t')
In [45]:
%%R
## BD for G+C of 0 or 100
BD.GCp0 = 0 * 0.098 + 1.66
BD.GCp100 = 1 * 0.098 + 1.66
In [313]:
%%R -w 800 -h 300
# plotting absolute abundances
tbl.s = tbl %>%
group_by(library, BD_mid) %>%
summarize(total_count = sum(count))
## plot
p = ggplot(tbl.s, aes(BD_mid, total_count)) +
geom_area(stat='identity', alpha=0.3, position='dodge') +
geom_histogram(stat='identity') +
geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
labs(x='Buoyant density') +
theme_bw() +
theme(
text = element_text(size=16)
)
p
In [314]:
%%R -w 800 -h 300
# plotting number of taxa at each BD
tbl.nt = tbl %>%
filter(count > 0) %>%
group_by(library, BD_mid) %>%
summarize(n_taxa = n())
## plot
p = ggplot(tbl.nt, aes(BD_mid, n_taxa)) +
geom_area(stat='identity', alpha=0.3, position='dodge') +
geom_histogram(stat='identity') +
geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
labs(x='Buoyant density', y='Number of taxa') +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
p
In [315]:
%%R -w 800 -h 250
# plotting relative abundances
## plot
p = ggplot(tbl, aes(BD_mid, count, fill=taxon)) +
geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
labs(x='Buoyant density') +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
p + geom_area(stat='identity', position='dodge', alpha=0.5)
In [316]:
%%R -w 800 -h 250
p + geom_area(stat='identity', position='fill')
In [37]:
dist,loc,scale = seq_per_fraction
!cd $workDir; \
SIPSim OTU_subsample \
--dist $dist \
--dist_params mean:$loc,sigma:$scale \
--walk 1 \
OTU_abs1e9.txt \
> OTU_abs1e9_sub.txt
In [38]:
%%R -h 300
setwd(workDir)
tbl = read.csv('OTU_abs1e9_sub.txt', sep='\t')
tbl.s = tbl %>%
group_by(library, fraction) %>%
summarize(total_count = sum(count)) %>%
ungroup() %>%
mutate(library = as.character(library))
ggplot(tbl.s, aes(total_count)) +
geom_density(fill='blue')
In [39]:
%%R -h 300 -w 600
setwd(workDir)
tbl.s = tbl %>%
group_by(fraction, BD_min, BD_mid, BD_max) %>%
summarize(total_count = sum(count))
ggplot(tbl.s, aes(BD_mid, total_count)) +
geom_point() +
geom_line() +
labs(x='Buoyant density', y='Total sequences') +
theme_bw() +
theme(
text = element_text(size=16)
)
In [41]:
%%R -i workDir
inFile = paste(c(workDir, 'target_genome_index.txt'), collapse='/')
tbl.target = read.delim(inFile, sep='\t', header=F)
colnames(tbl.target) = c('OTUId', 'genome_file', 'genome_ID', 'X', 'Y', 'Z')
tbl.target = tbl.target %>% distinct(OTUId)
cat('Number of target OTUs: ', tbl.target$OTUId %>% unique %>% length, '\n')
cat('----------\n')
tbl.target %>% head(n=3)
In [46]:
%%R -w 800 -h 250
# plotting relative abundances
tbl = tbl %>%
group_by(fraction) %>%
mutate(rel_abund = count / sum(count))
## plot
p = ggplot(tbl, aes(BD_mid, count, fill=taxon)) +
geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
labs(x='Buoyant density') +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
p + geom_area(stat='identity', position='dodge', alpha=0.5)
In [47]:
%%R -w 800 -h 250
p = ggplot(tbl, aes(BD_mid, rel_abund, fill=taxon)) +
geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
labs(x='Buoyant density') +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
p + geom_area(stat='identity')
In [48]:
%%R
targets = tbl.target$OTUId %>% as.vector %>% unique
tbl.f = tbl %>%
filter(taxon %in% targets)
tbl.f %>% head
In [49]:
%%R -w 800 -h 250
# plotting absolute abundances
## plot
p = ggplot(tbl.f, aes(BD_mid, count, fill=taxon)) +
geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
labs(x='Buoyant density') +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
p + geom_area(stat='identity', position='dodge', alpha=0.5)
In [50]:
%%R -w 800 -h 250
# plotting relative abundances
p = ggplot(tbl.f, aes(BD_mid, rel_abund, fill=taxon)) +
geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
labs(x='Buoyant density') +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
p + geom_area(stat='identity')
In [51]:
%%R -i metaDataFile
# loading priming_exp metadata file
meta = read.delim(metaDataFile, sep='\t')
meta %>% head(n=4)
In [52]:
%%R -i otuTableFile
# loading priming_exp OTU table
tbl.otu.true = read.delim(otuTableFile, sep='\t') %>%
select(OTUId, starts_with('X12C.700.14'))
tbl.otu.true %>% head(n=3)
In [53]:
%%R
# editing table
tbl.otu.true.w = tbl.otu.true %>%
gather('sample', 'count', 2:ncol(tbl.otu.true)) %>%
mutate(sample = gsub('^X', '', sample)) %>%
group_by(sample) %>%
mutate(rel_abund = count / sum(count)) %>%
ungroup() %>%
filter(count > 0)
tbl.otu.true.w %>% head(n=5)
In [54]:
%%R
tbl.true.j = inner_join(tbl.otu.true.w, meta, c('sample' = 'Sample'))
tbl.true.j %>% as.data.frame %>% head(n=3)
In [55]:
%%R -w 800 -h 300
# plotting number of taxa at each BD
tbl.nt = tbl %>%
filter(count > 0) %>%
group_by(library, BD_mid) %>%
summarize(n_taxa = n())
## plot
p = ggplot(tbl.nt, aes(BD_mid, n_taxa)) +
geom_area(stat='identity', alpha=0.5) +
geom_point() +
geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
labs(x='Buoyant density', y='Number of taxa') +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
p
In [56]:
%%R -w 700 -h 350
tbl.true.j.s = tbl.true.j %>%
filter(count > 0) %>%
group_by(sample, Density) %>%
summarize(n_taxa = sum(count > 0))
ggplot(tbl.true.j.s, aes(Density, n_taxa)) +
geom_area(stat='identity', alpha=0.5) +
geom_point() +
geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
labs(x='Buoyant density', y='Number of taxa') +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
In [57]:
%%R -h 300 -w 600
tbl.true.j.s = tbl.true.j %>%
group_by(sample, Density) %>%
summarize(total_count = sum(count))
ggplot(tbl.true.j.s, aes(Density, total_count)) +
geom_point() +
geom_line() +
labs(x='Buoyant density', y='Total sequences') +
theme_bw() +
theme(
text = element_text(size=16)
)
In [58]:
%%R
tbl.true.j.f = tbl.true.j %>%
filter(OTUId %in% targets) %>%
arrange(OTUId, Density) %>%
group_by(sample)
tbl.true.j.f %>% head(n=3) %>% as.data.frame
In [59]:
%%R -w 800 -h 250
# plotting relative abundances
## plot
ggplot(tbl.true.j.f, aes(Density, rel_abund, fill=OTUId)) +
geom_area(stat='identity') +
geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
labs(x='Buoyant density') +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'none'
)
In [60]:
%%R
tbl.f.e = tbl.f %>%
mutate(library = 'simulation') %>%
rename('density' = BD_mid) %>%
select(-BD_min, -BD_max)
tbl.true.e = tbl.true.j.f %>%
select('taxon' = OTUId,
'fraction' = sample,
'density' = Density,
count, rel_abund) %>%
mutate(library = 'true')
tbl.sim.true = rbind(tbl.f.e, tbl.true.e)
tbl.f.e = data.frame()
tbl.true.e = data.frame()
tbl.sim.true %>% head(n=3)
In [62]:
%%R
# check
cat('Number of target taxa: ', tbl.sim.true$taxon %>% unique %>% length, '\n')
In [ ]:
%%R -w 900 -h 3000
tbl.sim.true = tbl.sim.true %>%
ungroup() %>%
group_by(taxon) %>%
mutate(mean_rel_abund = mean(rel_abund)) %>%
ungroup()
tbl.sim.true$taxon = reorder(tbl.sim.true$taxon, -tbl.sim.true$mean_rel_abund)
ggplot(tbl.sim.true, aes(density, rel_abund, color=library)) +
geom_point() +
geom_line() +
theme_bw() +
facet_wrap(~ taxon, ncol=4, scales='free_y')
In [ ]:
In [ ]:
In [ ]:
In [504]:
def partial_shuffle(x, n, frac=False):
# if frac is True, n assumed to be fraction of len(x)
if frac is True:
n = int(round(len(x) * frac,0))
else:
n = int(n)
# x = list
try:
x = list(x)
except TypeError:
msg = 'x must be a list-like object'
raise TypeError, msg
# making list of random indices
rand = lambda x: random.randrange(0,len(x),1)
randints = lambda x,n: [rand(x) for i in xrange(n)]
rand_i = zip(randints(x,n), randints(x,n))
# suffle
for i,j in rand_i:
tmp = x[i]
x[i] = x[j]
x[j] = tmp
return x
In [506]:
partial_shuffle(range(20), 2.0)
Out[506]:
In [507]:
import scipy.stats as ss
In [582]:
def shuffle_walk(x, max_walk):
# x = list
try:
x = list(x)
except TypeError:
msg = 'x must be a list-like object'
raise TypeError, msg
x_len = len(x)
# ranks of values
## (index, value, value_rank)
ranks = zip(range(x_len), x, ss.rankdata(x))
# starting range
start_i = random.randrange(0,x_len,1)
cur_rank = ranks[start_i]
# filtering cur_rank from ranks
ranks = [x for x in ranks if x[0] != cur_rank[0]]
# moving through ranks
x_new_order = [cur_rank[1]]
for i in xrange(x_len-1):
# select max walk distance
max_range = random.randrange(1,max_walk+1)
# filter to just ranks w/in rank distance
filt_ranks = [x for x in ranks
if abs(x[2] - cur_rank[2]) <= max_range]
# selecting randomly from filtered ranks
cur_rank = random.sample(filt_ranks, k=1)[0]
# adding value to new list
x_new_order.append(cur_rank[1])
# filtering cur_rank from ranks
ranks = [x for x in ranks if x[0] != cur_rank[0]]
# re-ranking remaining values
rank_vals = [x[2] for x in ranks]
new_ranks = ss.rankdata(rank_vals)
for i,v in enumerate(new_ranks):
ranks[i] = (ranks[i][0], ranks[i][1], v)
return x_new_order
In [637]:
vals = range(1, 100)
sw = shuffle_walk(vals, 3)
random.shuffle(vals)
In [638]:
%%R -w 700 -h 300 -i sw -i vals
tbl = data.frame('shuffle_walk' = sw,
'shuffle' = vals)
#ggplot(tbl, aes(shuffle_walk, shuffle)) +
# geom_point()
tbl = tbl %>%
mutate(order = 1:nrow(tbl)) %>%
gather('algorithim', 'value', 1:2)
ggplot(tbl, aes(order, value, color=algorithim)) +
geom_point() +
geom_line() +
theme_bw() +
theme(
text = element_text(size=16)
)
In [ ]: