Recreating the data in Figure 1 of Lueders et al., 2004
Parameters:
gradient
DNA quantification
In [7]:
workDir = '/home/nick/t/Lueders2004/'
# params
bandwidth = 0.8
In [8]:
import os
#import numpy as np
%load_ext rpy2.ipython
In [10]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)
#library(genomes)
In [11]:
if not os.path.isdir(workDir):
os.makedirs(workDir)
%cd $workDir
In [15]:
taxa="""Methylobacterium_extorquens_AM1 NC_012808
Methanosarcina_barkeri_MS NZ_CP009528
"""
taxa = taxa.replace(' ', '\t')
genome_file = os.path.join(workDir, 'genome_list.txt')
with open(genome_file, 'w') as oFH:
oFH.write(taxa)
print('File written: {}'.format(genome_file))
In [16]:
!head $genome_file
In [17]:
%%bash -s $genome_file
source activate py27
# downloading genomes
SIPSim genome_download -d genomes -n 2 $1
In [18]:
!ls -thlc genomes/*.fna
In [19]:
# %%R
# data(proks)
# summary(proks)
In [20]:
# %%R
# update(proks)
# summary(proks)
In [21]:
# %%R
# M.ext = subset(proks, name %like% 'Methylobacterium extorquens AM1*')
# M.bark = subset(proks, name %like% 'Methanosarcina barkeri MS*')
# orgs = rbind(M.ext, M.bark)
# orgFile = 'genome_info.txt'
# write.table(orgs, orgFile, sep='\t', quote=FALSE, row.names=FALSE)
# cat('File written:', orgFile, '\n')
In [22]:
# !seqDB_tools accession-GI2fasta \
# -a 11 -n 2 -f 30 -header \
# -o genomes \
# genome_info.txt
In [23]:
#!ls -thlc genomes/*.fna
In [24]:
%%bash
source activate py27
find ./genomes/ -name "*.fna" |\
xargs -P 2 -I % SIPSim genome_rename --prefix genomes_rn %
In [25]:
!grep ">" genomes_rn/*fna | perl -pe 's/.+:>/>/'
In [26]:
# making a genome index file
## taxonName<tab>genomeSeqFileName
!ls -1 genomes_rn/*fna | perl -pe 's/.+\/|\.fna//g; s/(.+)/$1\t$1.fna/' > genome_index.txt
!cat genome_index.txt
In [31]:
%%bash
source activate py27
SIPSim genome_index \
--fp ./genomes_rn/ \
--np 2 \
genome_index.txt \
> genome_index.log
In [32]:
!tail genome_index.log
In [33]:
comm = """library taxon_name rel_abund_perc rank
1 Methanosarcina_barkeri_MS 100.0 1
1 Methylobacterium_extorquens_AM1 0.0 2
2 Methylobacterium_extorquens_AM1 100.0 1
2 Methanosarcina_barkeri_MS 0.0 2
"""
comm_file = 'comm-n2-unif.txt'
with open(comm_file, 'w') as oFH:
oFH.write(comm)
!cat $comm_file
In [34]:
%%bash
source activate py27
SIPSim gradient_fractions \
--BD_min 1.66 \
--BD_max 1.8 \
--params mu:0.0075,sigma:0.001 \
comm-n2-unif.txt \
> fracs-n2-unif.txt
In [35]:
# checking output
!wc -l fracs-n2-unif.txt
!head -n 5 fracs-n2-unif.txt
!tail -n 5 fracs-n2-unif.txt
In [36]:
%%R -w 800 -h 300
# plotting
df = read.delim('fracs-n2-unif.txt', sep='\t')
df %>%
group_by(library) %>%
summarize(n=n()) %>%
print
p = ggplot(df, aes(BD_min, fraction_size, group=library, color=library %>% as.character)) +
geom_point() +
geom_line() +
scale_x_continuous(breaks=seq(1.66, 1.8, 0.01)) +
scale_color_discrete('gradient') +
labs(x='Buoyant density', y='Fraction size (buoyant density)') +
theme_bw() +
theme(
text = element_text(size=16)
)
plot(p)
In [37]:
%%bash
source activate py27
SIPSim fragments \
genome_index.txt \
--fp genomes \
--fld skewed-normal,9000,2500,-5 \
--flr None,None \
--nf 100X \
--np 2 \
2> shotFrag_skewN90-25-n5-nS.log \
> shotFrag_skewN90-25-n5-nS.pkl
In [38]:
!tail shotFrag_skewN90-25-n5-nS.log
In [41]:
%%bash -s $bandwidth
source activate py27
SIPSim fragment_KDE \
shotFrag_skewN90-25-n5-nS.pkl \
--bw $1 \
> shotFrag_skewN90-25-n5-nS_kde.pkl
In [42]:
!ls -thlc shotFrag_skewN90-25-n5-nS_kde.pkl
In [43]:
%%bash
source activate py27
# getting info on the KDEs
SIPSim KDE_info -s shotFrag_skewN90-25-n5-nS_kde.pkl
In [44]:
# number of samples to draw from KDEs
n=100000
In [45]:
%%bash -s $n
source activate py27
SIPSim KDE_sample \
-n $1 shotFrag_skewN90-25-n5-nS_kde.pkl \
> shotFrag_skewN90-25-n5-nS_kde.txt
In [49]:
%%R -w 600 -h 400
df = read.delim('shotFrag_skewN90-25-n5-nS_kde.txt', sep='\t') %>%
gather(taxon, Buoyant_density, Methanosarcina_barkeri_MS, Methylobacterium_extorquens_AM1)
p = ggplot(df, aes(Buoyant_density, fill=taxon)) +
geom_density(alpha=0.5) +
scale_x_continuous(limits=c(1.68, 1.78)) +
scale_fill_discrete('') +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'bottom'
)
pl = p + scale_y_log10()
grid.arrange(p, pl, ncol=1)
In [47]:
%%bash -s $bandwidth
source activate py27
SIPSim diffusion \
shotFrag_skewN90-25-n5-nS_kde.pkl \
--np 2 \
--bw $1 \
> shotFrag_skewN90-25-n5-nS_dif_kde.pkl
In [48]:
!ls -thlc shotFrag_skewN90-25-n5-nS_dif_kde.pkl
In [50]:
# number of samples to draw from KDEs
n=100000
In [52]:
%%bash -s $n
source activate py27
SIPSim KDE_sample \
-n $1 shotFrag_skewN90-25-n5-nS_dif_kde.pkl \
> shotFrag_skewN90-25-n5-nS_dif_kde.txt
In [53]:
%%R -w 600 -h 400
df = read.delim('shotFrag_skewN90-25-n5-nS_dif_kde.txt', sep='\t') %>%
gather(taxon, Buoyant_density, Methanosarcina_barkeri_MS, Methylobacterium_extorquens_AM1)
p = ggplot(df, aes(Buoyant_density, fill=taxon)) +
geom_density(alpha=0.5) +
scale_x_continuous(limits=c(1.68, 1.78)) +
scale_fill_discrete('') +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'bottom'
)
pl = p + scale_y_log10()
grid.arrange(p, pl, ncol=1)
In [57]:
%%bash -s $n
source activate py27
SIPSim DBL \
shotFrag_skewN90-25-n5-nS_dif_kde.pkl \
--comm comm-n2-unif.txt \
--commx 0 \
-D 1.725 \
-w 19807714 \
--tube_height 5.1 \
--r_min 7.47 \
--r_max 8.79 \
--vertical \
--np 2 \
> shotFrag_skewN90-25-n5-nS_dif_kde_DBL.pkl
In [58]:
%%R -h 300 -w 400
df = read.delim('DBL_index.txt', sep='\t')
p = ggplot(df, aes(DBL_BD, ymin=vert_gradient_BD_low, ymax=vert_gradient_BD_high)) +
geom_linerange() +
scale_y_reverse() +
labs(x='Diffusive boundary layer buoyant density',
y='CsCl gradient buoyant density',
title='Extent of DBL smearing') +
theme_bw() +
theme(
text = element_text(size=16)
)
plot(p)
In [59]:
n=100000
In [61]:
%%bash -s $n
source activate py27
SIPSim KDE_sample -n $1 \
shotFrag_skewN90-25-n5-nS_dif_kde_DBL.pkl \
> shotFrag_skewN90-25-n5-nS_dif_kde_DBL.txt
In [62]:
%%R -w 600 -h 550
df = read.delim('shotFrag_skewN90-25-n5-nS_dif_kde_DBL.txt', sep='\t') %>%
gather(taxon, Buoyant_density, Methanosarcina_barkeri_MS, Methylobacterium_extorquens_AM1)
p = ggplot(df, aes(Buoyant_density, fill=taxon)) +
geom_density(alpha=0.5) +
scale_x_continuous(limits=c(1.68, 1.78)) +
scale_fill_discrete('') +
facet_grid(libID ~ .) +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'bottom'
)
pl = p + scale_y_log10()
grid.arrange(p, pl, ncol=1)
In [64]:
# making config file
config = """
[1]
# baseline: no incorp
[[intraPopDist 1]]
distribution = uniform
[[[start]]]
[[[[interPopDist 1]]]]
distribution = uniform
start = 0
end = 0
[[[end]]]
[[[[interPopDist 1]]]]
distribution = uniform
start = 0
end = 0
[2]
# half of taxa incorporate
max_perc_taxa_incorp = 100
[[intraPopDist 1]]
distribution = uniform
[[[start]]]
[[[[interPopDist 1]]]]
distribution = uniform
start = 100
end = 100
[[[end]]]
[[[[interPopDist 1]]]]
distribution = uniform
start = 100
end = 100
"""
outfile = os.path.join(workDir, 'incorp.config')
outf = open(outfile, 'w')
outf.write(config)
outf.close()
In [65]:
%%bash
source activate py27
SIPSim isotope_incorp \
shotFrag_skewN90-25-n5-nS_dif_kde_DBL.pkl \
incorp.config \
--comm comm-n2-unif.txt \
--np 2 \
> shotFrag_skewN90-25-n5-nS_dif_kde_DBL_incorp.pkl
In [ ]:
n=100000
In [67]:
%%bash -s $n
source activate py27
SIPSim KDE_sample -n $1 \
shotFrag_skewN90-25-n5-nS_dif_kde_DBL_incorp.pkl \
> shotFrag_skewN90-25-n5-nS_dif_kde_DBL_incorp.txt
In [68]:
%%R -w 600 -h 550
df = read.delim('shotFrag_skewN90-25-n5-nS_dif_kde_DBL_incorp.txt', sep='\t') %>%
gather(taxon, Buoyant_density, Methanosarcina_barkeri_MS, Methylobacterium_extorquens_AM1)
p = ggplot(df, aes(Buoyant_density, fill=taxon)) +
geom_density(alpha=0.5) +
scale_x_continuous(limits=c(1.68, 1.78)) +
scale_fill_discrete('') +
facet_grid(libID ~ .) +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'bottom'
)
pl = p + scale_y_log10()
grid.arrange(p, pl, ncol=1)
In [91]:
%%bash -s
source activate py27
SIPSim OTU_table \
shotFrag_skewN90-25-n5-nS_dif_kde_DBL_incorp.pkl \
comm-n2-unif.txt \
fracs-n2-unif.txt \
--abs 1e9 \
--np 2 \
> shotFrag_OTU_n2_abs1e9.txt
In [93]:
!head shotFrag_OTU_n2_abs1e9.txt
!tail shotFrag_OTU_n2_abs1e9.txt
In [129]:
%%R -i workDir
# loading file
F = file.path(workDir, 'shotFrag_OTU_n2_abs1e9.txt')
df = read.delim(F)
df1 = df %>%
filter(library == 1,
taxon == 'Methanosarcina_barkeri_MS')
df2 = df %>%
filter(library == 2,
taxon == 'Methylobacterium_extorquens_AM1')
df = rbind(df1, df2)
df_all = df %>%
group_by(library) %>%
mutate(rel_abund = count / max(count)) %>%
ungroup()
df_all %>% head(n=3) %>% as.data.frame
In [130]:
%%R -w 500 -h 300
# max possible shift for M barkeri (using mean G+C)
max_shift = 1.727326 + 0.036
# plotting
p_all = ggplot(df_all, aes(BD_min, rel_abund, color=taxon)) +
geom_point(size=3.5) +
geom_line() +
scale_x_continuous(limits=c(1.68, 1.78), breaks=seq(1.68, 1.78, 0.02)) +
scale_color_discrete('') +
labs(x='CsCl buoyant density [g ml^-1]', y='Ratio of maximum quantities',
title='All DNA fragments') +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'bottom'
)
plot(p_all)
In [99]:
# primers
primers = """>Ar109f
ACKGCTCAGTAACACGT
>Ar915r
GTGCTCCCCCGCCAATTCCT
>Ba519f
CAGCMGCCGCGGTAANWC
>Ba907r
CCGTCAATTCMTTTRAGTT
"""
primer_file = 'Ba-Ar_primers.fna'
with open(primer_file, 'w') as outFH:
outFH.write(primers)
!cat $primer_file
In [100]:
%%bash
source activate py27
SIPSim fragments \
genome_index.txt \
--fp genomes_rn \
--fr Ba-Ar_primers.fna \
--fld skewed-normal,9000,2500,-5 \
--flr None,None \
--nf 10000 \
--np 2 \
2> ampFrag_skewN90-25-n5-nS.log \
> ampFrag_skewN90-25-n5-nS.pkl
In [101]:
!tail ampFrag_skewN90-25-n5-nS.log
In [102]:
%%bash -s $bandwidth
source activate py27
SIPSim fragment_KDE \
--bw $1 \
ampFrag_skewN90-25-n5-nS.pkl \
> ampFrag_skewN90-25-n5-nS_kde.pkl
In [103]:
!ls -thlc ampFrag_skewN90-25-n5-nS_kde.pkl
In [104]:
n=100000
In [105]:
%%bash -s $n
source activate py27
SIPSim KDE_sample -n $1 \
ampFrag_skewN90-25-n5-nS_kde.pkl \
> ampFrag_skewN90-25-n5-nS_kde.txt
In [106]:
%%R -w 600 -h 400
df = read.delim('ampFrag_skewN90-25-n5-nS_kde.txt', sep='\t') %>%
gather(taxon, Buoyant_density, Methanosarcina_barkeri_MS, Methylobacterium_extorquens_AM1)
p = ggplot(df, aes(Buoyant_density, fill=taxon)) +
geom_density(alpha=0.5) +
scale_x_continuous(limits=c(1.68, 1.78)) +
scale_fill_discrete('') +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'bottom'
)
pl = p + scale_y_log10()
grid.arrange(p, pl, ncol=1)
In [107]:
%%bash -s $bandwidth
source activate py27
SIPSim diffusion \
--np 2 \
--bw $1 \
ampFrag_skewN90-25-n5-nS_kde.pkl \
> ampFrag_skewN90-25-n5-nS_dif_kde.pkl
In [108]:
!ls -thlc ampFrag_skewN90-25-n5-nS_dif_kde.pkl
In [109]:
n=100000
In [110]:
%%bash -s $n
source activate py27
SIPSim KDE_sample -n $1 \
ampFrag_skewN90-25-n5-nS_kde.pkl \
> ampFrag_skewN90-25-n5-nS_kde.txt
In [111]:
%%R -w 600 -h 400
df = read.delim('ampFrag_skewN90-25-n5-nS_kde.txt', sep='\t') %>%
gather(taxon, Buoyant_density, Methanosarcina_barkeri_MS, Methylobacterium_extorquens_AM1)
p = ggplot(df, aes(Buoyant_density, fill=taxon)) +
geom_density(alpha=0.5) +
scale_x_continuous(limits=c(1.68, 1.78)) +
scale_fill_discrete('') +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'bottom'
)
pl = p + scale_y_log10()
grid.arrange(p, pl, ncol=1)
In [112]:
%%bash
source activate py27
SIPSim DBL \
ampFrag_skewN90-25-n5-nS_dif_kde.pkl \
--comm comm-n2-unif.txt \
--commx 0 \
-D 1.725 \
-w 19807714 \
--tube_height 5.1 \
--r_min 7.47 \
--r_max 8.79 \
--vertical \
--np 2 \
> ampFrag_skewN90-25-n5-nS_dif_kde_DBL.pkl
In [113]:
%%R -h 300 -w 400
df = read.delim('DBL_index.txt', sep='\t')
p = ggplot(df, aes(DBL_BD, ymin=vert_gradient_BD_low, ymax=vert_gradient_BD_high)) +
geom_linerange() +
scale_y_reverse() +
labs(x='Diffusive boundary layer buoyant density',
y='CsCl gradient buoyant density',
title='Extent of DBL smearing') +
theme_bw() +
theme(
text = element_text(size=16)
)
plot(p)
In [114]:
n=100000
In [115]:
%%bash -s $n
source activate py27
SIPSim KDE_sample -n $1 \
ampFrag_skewN90-25-n5-nS_kde.pkl \
> ampFrag_skewN90-25-n5-nS_kde.txt
In [116]:
%%R -w 600 -h 400
df = read.delim('ampFrag_skewN90-25-n5-nS_kde.txt', sep='\t') %>%
gather(taxon, Buoyant_density, Methanosarcina_barkeri_MS, Methylobacterium_extorquens_AM1)
p = ggplot(df, aes(Buoyant_density, fill=taxon)) +
geom_density(alpha=0.5) +
scale_x_continuous(limits=c(1.68, 1.78)) +
scale_fill_discrete('') +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'bottom'
)
pl = p + scale_y_log10()
grid.arrange(p, pl, ncol=1)
In [117]:
%%bash
source activate py27
SIPSim isotope_incorp \
ampFrag_skewN90-25-n5-nS_dif_kde_DBL.pkl \
incorp.config \
--comm comm-n2-unif.txt \
--np 2 \
> ampFrag_skewN90-25-n5-nS_dif_kde_DBL_incorp.pkl
In [118]:
n=100000
In [119]:
%%bash -s $n
source activate py27
SIPSim KDE_sample -n $1 \
ampFrag_skewN90-25-n5-nS_kde.pkl \
> ampFrag_skewN90-25-n5-nS_kde.txt
In [120]:
%%R -w 600 -h 400
df = read.delim('ampFrag_skewN90-25-n5-nS_kde.txt', sep='\t') %>%
gather(taxon, Buoyant_density, Methanosarcina_barkeri_MS, Methylobacterium_extorquens_AM1)
p = ggplot(df, aes(Buoyant_density, fill=taxon)) +
geom_density(alpha=0.5) +
scale_x_continuous(limits=c(1.68, 1.78)) +
scale_fill_discrete('') +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'bottom'
)
pl = p + scale_y_log10()
grid.arrange(p, pl, ncol=1)
In [121]:
%%bash
source activate py27
SIPSim OTU_table \
ampFrag_skewN90-25-n5-nS_dif_kde_DBL_incorp.pkl \
comm-n2-unif.txt \
fracs-n2-unif.txt \
--abs 1e9 \
--np 2 \
> ampFrag_OTU_n2_abs1e9.txt
In [128]:
!head ampFrag_OTU_n2_abs1e9.txt
In [132]:
%%R -i workDir
# loading file
F = file.path(workDir, 'ampFrag_OTU_n2_abs1e9.txt')
df = read.delim(F)
df1 = df %>%
filter(library == 1,
taxon == 'Methanosarcina_barkeri_MS')
df2 = df %>%
filter(library == 2,
taxon == 'Methylobacterium_extorquens_AM1')
df = rbind(df1, df2)
df_16S = df %>%
group_by(library) %>%
mutate(rel_abund = count / max(count)) %>%
ungroup()
df_16S %>% head(n=3) %>% as.data.frame
In [133]:
%%R -w 500 -h 300
# max possible shift for M barkeri (using mean G+C)
max_shift = 1.727326 + 0.036
# plotting
p_16S = ggplot(df.16S, aes(BD_min, rel_abund, color=taxon)) +
geom_point(size=3.5) +
geom_line() +
scale_x_continuous(limits=c(1.68, 1.78), breaks=seq(1.68, 1.78, 0.02)) +
scale_color_discrete('') +
labs(x='CsCl buoyant density [g ml^-1]', y='Ratio of maximum quantities',
title='DNA fragments containing 16S rRNA genes') +
theme_bw() +
theme(
text = element_text(size=16),
legend.position = 'bottom'
)
plot(p_16S)
In [135]:
%%R
# simulation data
tmp1 = df_all %>%
dplyr::select(taxon, BD_min, count) %>%
mutate(DNA = 'Total DNA')
tmp2 = df_16S %>%
dplyr::select(taxon, BD_min, count) %>%
mutate(DNA = '16S rRNA genes')
df_j = rbind(tmp1, tmp2) %>%
group_by(taxon, DNA) %>%
mutate(rel_abund = count / max(count)) %>%
ungroup() %>%
mutate(taxon = gsub('Methanosarcina_barkeri_MS', 'M. barkeri MS', taxon),
taxon = gsub('Methylobacterium_extorquens_AM1', 'M. extorquens AM1', taxon)) %>%
dplyr::select(-count) %>%
mutate(dataset='simulation')
colnames(df_j) = c('Taxon', 'BD', 'DNA', 'conc', 'dataset')
tmp1 = tmp2 = NULL
# status
df_j %>% tail(n=3)
In [137]:
%%R -w 550 -h 400
df_j$DNA = factor(df_j$DNA, levels=c('Total DNA', '16S rRNA genes'))
x.lab = expression(paste('Buoyant density (g ml' ^ '-1', ')'))
# plotting
p = ggplot(df_j, aes(BD, conc, shape=Taxon)) +
geom_point(size=3) +
geom_line() +
scale_color_discrete('Dataset') +
scale_x_continuous(limits=c(1.68, 1.78),
breaks=seq(1.68, 1.78, 0.02),
expand=c(0.001,0.001)) +
labs(x=x.lab, y='Ratio of maximum quantities') +
facet_grid(DNA ~ .) +
theme_bw() +
theme(
text = element_text(size=16)
)
p
In [ ]: