Goal

Simulating fullCyc Day1 control gradients
- Not simulating incorporation (all 0% isotope incorp.)
  - Don't know how much true incorporatation for emperical data
Simulating 10 replicate gradients to assess simulation stochasticity.
Added richness will be from taxa that overlap with reference genome pool (target taxa)

Init



In [10]:

    
import os
import glob
import re
import nestly



In [11]:

    
%load_ext rpy2.ipython
%load_ext pushnote









    



The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython



In [12]:

    
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)
library(phyloseq)

## BD for G+C of 0 or 100
BD.GCp0 = 0 * 0.098 + 1.66
BD.GCp100 = 1 * 0.098 + 1.66

Nestly

assuming fragments already simulated



In [52]:

    
workDir = '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/'
buildDir = os.path.join(workDir, 'Day1_richFromTarget_rep10')
R_dir = '/home/nick/notebook/SIPSim/lib/R/'

fragFile= '/home/nick/notebook/SIPSim/dev/bac_genome1147/validation/ampFrags.pkl'
targetFile = '/home/nick/notebook/SIPSim/dev/fullCyc/CD-HIT/target_taxa.txt'

physeqDir = '/var/seq_data/fullCyc/MiSeq_16SrRNA/515f-806r/lib1-7/phyloseq/'
physeq_bulkCore = 'bulk-core'
physeq_SIP_core = 'SIP-core_unk'

nreps = 10
prefrac_comm_abundance = '1e9'
richness = 2503   # chao1 estimate for bulk Day 1

seq_per_fraction = ['lognormal', 9.432, 0.5, 10000, 30000] # dist, mean, scale, min, max
bulk_days = [1]
nprocs = 24



In [6]:

    
# building tree structure
nest = nestly.Nest()

## varying params
nest.add('rep', [x + 1 for x in xrange(nreps)])

## set params
nest.add('bulk_day', bulk_days, create_dir=False)
nest.add('abs', [prefrac_comm_abundance], create_dir=False)
nest.add('percIncorp', [0], create_dir=False)
nest.add('percTaxa', [0], create_dir=False)
nest.add('np', [nprocs], create_dir=False)
nest.add('richness', [richness], create_dir=False)
nest.add('subsample_dist', [seq_per_fraction[0]], create_dir=False)
nest.add('subsample_mean', [seq_per_fraction[1]], create_dir=False)
nest.add('subsample_scale', [seq_per_fraction[2]], create_dir=False)
nest.add('subsample_min', [seq_per_fraction[3]], create_dir=False)
nest.add('subsample_max', [seq_per_fraction[4]], create_dir=False)

### input/output files
nest.add('buildDir', [buildDir], create_dir=False)
nest.add('R_dir', [R_dir], create_dir=False)
nest.add('fragFile', [fragFile], create_dir=False)
nest.add('targetFile', [targetFile], create_dir=False)
nest.add('physeqDir', [physeqDir], create_dir=False)
nest.add('physeq_bulkCore', [physeq_bulkCore], create_dir=False)


# building directory tree
nest.build(buildDir)

# bash file to run
bashFile = os.path.join(buildDir, 'SIPSimRun.sh')



In [7]:

    
%%writefile $bashFile
#!/bin/bash

export PATH={R_dir}:$PATH

#-- making DNA pool similar to gradient of interest
echo '# Creating comm file from phyloseq'
phyloseq2comm.r {physeqDir}{physeq_bulkCore} -s 12C-Con -d {bulk_day} > {physeq_bulkCore}_comm.txt
printf 'Number of lines: '; wc -l {physeq_bulkCore}_comm.txt

echo '## Adding target taxa to comm file'
comm_add_target.r {physeq_bulkCore}_comm.txt {targetFile} > {physeq_bulkCore}_comm_target.txt
printf 'Number of lines: '; wc -l {physeq_bulkCore}_comm_target.txt

echo '# Adding extra richness to community file'
printf "1\t{richness}\n" > richness_needed.txt
### making a comm file of just target OTUs; these will be used to add richness
### 
perl -ne 'print unless /NA\tNA\tNA\tNA/' {physeq_bulkCore}_comm_target.txt > richness_pool.txt 
comm_add_richness.r -s richness_pool.txt richness_needed.txt > {physeq_bulkCore}_comm_all.txt
### renaming comm file for downstream pipeline
cat {physeq_bulkCore}_comm_all.txt > {physeq_bulkCore}_comm_target.txt
rm -f {physeq_bulkCore}_comm_all.txt 

echo '## parsing out genome fragments to make simulated DNA pool resembling the gradient of interest'
## all OTUs without an associated reference genome will be assigned a random reference (of the reference genome pool)
### this is done through --NA-random
SIPSim fragment_KDE_parse {fragFile} {physeq_bulkCore}_comm_target.txt \
    --rename taxon_name --NA-random > fragsParsed.pkl


echo '#-- SIPSim pipeline --#'
echo '# converting fragments to KDE'
SIPSim fragment_KDE \
    fragsParsed.pkl \
    > fragsParsed_KDE.pkl
    
echo '# adding diffusion'    
SIPSim diffusion \
    fragsParsed_KDE.pkl \
    --np {np} \
    > fragsParsed_KDE_dif.pkl    

echo '# adding DBL contamination'
SIPSim DBL \
    fragsParsed_KDE_dif.pkl \
    --np {np} \
    > fragsParsed_KDE_dif_DBL.pkl      
    
echo '# making incorp file'
SIPSim incorpConfigExample \
  --percTaxa {percTaxa} \
  --percIncorpUnif {percIncorp} \
  > {percTaxa}_{percIncorp}.config

echo '# adding isotope incorporation to BD distribution'
SIPSim isotope_incorp \
    fragsParsed_KDE_dif_DBL.pkl \
    {percTaxa}_{percIncorp}.config \
    --comm {physeq_bulkCore}_comm_target.txt \
    --np {np} \
    > fragsParsed_KDE_dif_DBL_inc.pkl
    
#echo '# calculating BD shift from isotope incorporation'
#SIPSim BD_shift \
#    fragsParsed_KDE_dif_DBL.pkl \
#    fragsParsed_KDE_dif_DBL_inc.pkl \
#    --np {np} \
#    > fragsParsed_KDE_dif_DBL_inc_BD-shift.txt

echo '# simulating gradient fractions'
SIPSim gradient_fractions \
    {physeq_bulkCore}_comm_target.txt \
    > fracs.txt 

echo '# simulating an OTU table'
SIPSim OTU_table \
    fragsParsed_KDE_dif_DBL_inc.pkl \
    {physeq_bulkCore}_comm_target.txt \
    fracs.txt \
    --abs {abs} \
    --np {np} \
    > OTU_abs{abs}.txt
    
#echo '# simulating PCR'
SIPSim OTU_PCR \
    OTU_abs{abs}.txt \
    > OTU_abs{abs}_PCR.txt    
    
echo '# subsampling from the OTU table (simulating sequencing of the DNA pool)'
SIPSim OTU_subsample \
    --dist {subsample_dist} \
    --dist_params mean:{subsample_mean},sigma:{subsample_scale} \
    --min_size {subsample_min} \
    --max_size {subsample_max} \
    OTU_abs{abs}_PCR.txt \
    > OTU_abs{abs}_PCR_sub.txt
        
echo '# making a wide-formatted table'
SIPSim OTU_wideLong -w \
    OTU_abs{abs}_PCR_sub.txt \
    > OTU_abs{abs}_PCR_sub_w.txt
    
echo '# making metadata (phyloseq: sample_data)'
SIPSim OTU_sampleData \
    OTU_abs{abs}_PCR_sub.txt \
    > OTU_abs{abs}_PCR_sub_meta.txt









    



Writing /home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10_richFromTarget/SIPSimRun.sh



In [ ]:

    
!chmod 777 $bashFile
!cd $workDir; \
    nestrun  --template-file $bashFile -d Day1_richFromTarget_rep10 --log-file log.txt -j 1









    



2016-01-25 13:08:24,897 * INFO * Template: ./SIPSimRun.sh
2016-01-25 13:08:24,899 * INFO * [12107] Started ./SIPSimRun.sh in Day1_rep10_richFromTarget/7
2016-01-25 13:50:34,858 * INFO * [12107] Day1_rep10_richFromTarget/7 Finished with 0
2016-01-25 13:50:34,905 * INFO * [15466] Started ./SIPSimRun.sh in Day1_rep10_richFromTarget/9
2016-01-25 14:30:35,337 * INFO * [15466] Day1_rep10_richFromTarget/9 Finished with 0
2016-01-25 14:30:35,370 * INFO * [18288] Started ./SIPSimRun.sh in Day1_rep10_richFromTarget/8
2016-01-25 15:11:49,250 * INFO * [18288] Day1_rep10_richFromTarget/8 Finished with 0
2016-01-25 15:11:49,279 * INFO * [21123] Started ./SIPSimRun.sh in Day1_rep10_richFromTarget/3
2016-01-25 19:48:17,780 * INFO * [29328] Day1_rep10_richFromTarget/6 Finished with 0

BD min/max

what is the min/max BD that we care about?



In [53]:

    
%%R
## min G+C cutoff
min_GC = 13.5
## max G+C cutoff
max_GC = 80
## max G+C shift
max_13C_shift_in_BD = 0.036


min_BD = min_GC/100.0 * 0.098 + 1.66    
max_BD = max_GC/100.0 * 0.098 + 1.66    

max_BD = max_BD + max_13C_shift_in_BD

cat('Min BD:', min_BD, '\n')
cat('Max BD:', max_BD, '\n')









    





Min BD: 1.67323 
Max BD: 1.7744

Loading data

Emperical

SIP data



In [54]:

    
%%R 
# simulated OTU table file
OTU.table.dir = '/home/nick/notebook/SIPSim/dev/fullCyc/frag_norm_9_2.5_n5/Day1_default_run/1e9/'
OTU.table.file = 'OTU_abs1e9_PCR_sub.txt'
#OTU.table.file = 'OTU_abs1e9_sub.txt'
#OTU.table.file = 'OTU_abs1e9.txt'



In [55]:

    
%%R -w 800 -h 300 

## dataframe
df.EMP = physeq.SIP.core %>% otu_table %>%
    as.matrix %>% as.data.frame
df.EMP$OTU = rownames(df.EMP)
df.EMP = df.EMP %>%    
    gather(sample, abundance, 1:(ncol(df.EMP)-1)) 

df.EMP = inner_join(df.EMP, physeq.SIP.core.m, c('sample' = 'X.Sample')) 

df.EMP.nt = df.EMP %>%
    group_by(sample) %>%
    mutate(n_taxa = sum(abundance > 0)) %>%
    ungroup() %>%
    distinct(sample) %>%
    filter(Buoyant_density >= min_BD, 
           Buoyant_density <= max_BD)
    
df.EMP.nt %>% head(n=3)









    





Source: local data frame [3 x 20]

       OTU            sample abundance primer_number fwd_barcode rev_barcode
     (chr)             (chr)     (dbl)         (int)      (fctr)      (fctr)
1 OTU.1101 12C-Con.D1.R2_F23         2           134    TCGACGAG    TGAGTACG
2 OTU.1101 12C-Con.D1.R2_F18         0           129    CTACTATA    TGAGTACG
3 OTU.1101 12C-Con.D1.R2_F20         1           131    AGAGTCAC    TGAGTACG
Variables not shown: Substrate (fctr), Day (int), Microcosm_replicate (int),
  Fraction (int), Buoyant_density (dbl), Sample_type (fctr), library (fctr),
  Exp_type (fctr), Sample_location (lgl), Sample_date (lgl), Sample_treatment
  (lgl), Sample_subtreatment (lgl), core_dataset (lgl), n_taxa (int)

bulk soil samples



In [56]:

    
%%R -i physeqDir -i physeq_SIP_core -i bulk_days

# bulk core samples
F = file.path(physeqDir, physeq_SIP_core)
physeq.SIP.core = readRDS(F) 
physeq.SIP.core.m = physeq.SIP.core %>% sample_data

physeq.SIP.core = prune_samples(physeq.SIP.core.m$Substrate == '12C-Con' & 
                                physeq.SIP.core.m$Day %in% bulk_days, 
                                physeq.SIP.core) %>%
    filter_taxa(function(x) sum(x) > 0, TRUE)
physeq.SIP.core.m = physeq.SIP.core %>% sample_data        

physeq.SIP.core









    





phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 7025 taxa and 25 samples ]
sample_data() Sample Data:       [ 25 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 7025 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 7025 tips and 7024 internal nodes ]



In [57]:

    
%%R
physeq.dir = '/var/seq_data/fullCyc/MiSeq_16SrRNA/515f-806r/lib1-7/phyloseq/'
physeq.bulk = 'bulk-core'
physeq.file = file.path(physeq.dir, physeq.bulk)
physeq.bulk = readRDS(physeq.file)
physeq.bulk.m = physeq.bulk %>% sample_data
physeq.bulk = prune_samples(physeq.bulk.m$Exp_type == 'microcosm_bulk' &
                            physeq.bulk.m$Day %in% bulk_days, physeq.bulk)

physeq.bulk.m = physeq.bulk %>% sample_data
physeq.bulk









    





phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 4950 taxa and 1 samples ]
sample_data() Sample Data:       [ 1 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 4950 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 4950 tips and 4949 internal nodes ]



In [58]:

    
%%R
physeq.bulk.n = transform_sample_counts(physeq.bulk, function(x) x/sum(x))
physeq.bulk.n









    





phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 4950 taxa and 1 samples ]
sample_data() Sample Data:       [ 1 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 4950 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 4950 tips and 4949 internal nodes ]



In [59]:

    
%%R
# making long format of each bulk table
bulk.otu = physeq.bulk.n %>% otu_table %>% as.data.frame
ncol = ncol(bulk.otu)
bulk.otu$OTU = rownames(bulk.otu)
bulk.otu = bulk.otu %>%
    gather(sample, abundance, 1:ncol) 

bulk.otu = inner_join(physeq.bulk.m, bulk.otu, c('X.Sample' = 'sample')) %>%
    dplyr::select(OTU, abundance) %>%
    rename('bulk_abund' = abundance)
bulk.otu %>% head(n=3)









    





       OTU   bulk_abund
1 OTU.1101 1.234263e-04
2 OTU.1130 6.171316e-05
3 OTU.9833 0.000000e+00



In [60]:

    
%%R
# joining tables
df.EMP.j = inner_join(df.EMP, bulk.otu, c('OTU' = 'OTU')) %>%
    filter(Buoyant_density >= min_BD, 
           Buoyant_density <= max_BD) 
    
df.EMP.j %>% head(n=3)









    





       OTU            sample abundance primer_number fwd_barcode rev_barcode
1 OTU.1101 12C-Con.D1.R2_F23         2           134    TCGACGAG    TGAGTACG
2 OTU.1130 12C-Con.D1.R2_F23         0           134    TCGACGAG    TGAGTACG
3 OTU.9833 12C-Con.D1.R2_F23         0           134    TCGACGAG    TGAGTACG
  Substrate Day Microcosm_replicate Fraction Buoyant_density Sample_type
1   12C-Con   1                   2       23         1.69362     unknown
2   12C-Con   1                   2       23         1.69362     unknown
3   12C-Con   1                   2       23         1.69362     unknown
         library Exp_type Sample_location Sample_date Sample_treatment
1 150721_V4_Lib4      SIP              NA          NA               NA
2 150721_V4_Lib4      SIP              NA          NA               NA
3 150721_V4_Lib4      SIP              NA          NA               NA
  Sample_subtreatment core_dataset   bulk_abund
1                  NA         TRUE 1.234263e-04
2                  NA         TRUE 6.171316e-05
3                  NA         TRUE 0.000000e+00

Simulated



In [61]:

    
OTU_files = !find $buildDir -name "OTU_abs1e9_PCR_sub.txt"
OTU_files









    Out[61]:





['/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_richFromTarget_rep10/7/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_richFromTarget_rep10/9/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_richFromTarget_rep10/8/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_richFromTarget_rep10/3/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_richFromTarget_rep10/10/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_richFromTarget_rep10/2/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_richFromTarget_rep10/1/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_richFromTarget_rep10/4/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_richFromTarget_rep10/5/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_richFromTarget_rep10/6/OTU_abs1e9_PCR_sub.txt']



In [62]:

    
%%R -i OTU_files
# loading files

df.SIM = list()
for (x in OTU_files){
    SIM_rep = gsub('/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_richFromTarget_rep10/', '', x)
    SIM_rep = gsub('/OTU_abs1e9_PCR_sub.txt', '', SIM_rep)
    df.SIM[[SIM_rep]] = read.delim(x, sep='\t') 
    }
df.SIM = do.call('rbind', df.SIM)
df.SIM$SIM_rep = gsub('\\.[0-9]+$', '', rownames(df.SIM))
rownames(df.SIM) = 1:nrow(df.SIM)
df.SIM %>% head









    





  library    fraction taxon BD_min BD_mid BD_max count   rel_abund SIM_rep
1       1  -inf-1.660 OTU.1   -Inf  1.659  1.659    63 0.004588158       7
2       1 1.660-1.667 OTU.1  1.660  1.663  1.667    82 0.004953186       7
3       1 1.667-1.672 OTU.1  1.667  1.669  1.672    52 0.004257758       7
4       1 1.672-1.676 OTU.1  1.672  1.674  1.676    25 0.001811725       7
5       1 1.676-1.680 OTU.1  1.676  1.678  1.680    55 0.005192108       7
6       1 1.680-1.683 OTU.1  1.680  1.681  1.683    84 0.003510092       7



In [63]:

    
%%R
## edit table
df.SIM.nt = df.SIM %>%
    filter(count > 0) %>%
    group_by(SIM_rep, library, BD_mid) %>%
    summarize(n_taxa = n()) %>%
    filter(BD_mid >= min_BD, 
           BD_mid <= max_BD)
df.SIM.nt %>% head









    





Source: local data frame [6 x 4]
Groups: SIM_rep, library [1]

  SIM_rep library BD_mid n_taxa
    (chr)   (int)  (dbl)  (int)
1       1       1  1.674   1700
2       1       1  1.679   1785
3       1       1  1.683   1755
4       1       1  1.688    686
5       1       1  1.691    575
6       1       1  1.694    693

'bulk soil' community files



In [64]:

    
# loading comm files
comm_files = !find $buildDir -name "bulk-core_comm_target.txt"
comm_files









    Out[64]:





['/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_richFromTarget_rep10/7/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_richFromTarget_rep10/9/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_richFromTarget_rep10/8/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_richFromTarget_rep10/3/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_richFromTarget_rep10/10/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_richFromTarget_rep10/2/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_richFromTarget_rep10/1/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_richFromTarget_rep10/4/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_richFromTarget_rep10/5/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_richFromTarget_rep10/6/bulk-core_comm_target.txt']



In [65]:

    
%%R -i comm_files

df.comm = list()
for (f in comm_files){
    rep = gsub('.+/Day1_richFromTarget_rep10/([0-9]+)/.+', '\\1', f)
    df.comm[[rep]] = read.delim(f, sep='\t') %>%
        dplyr::select(library, taxon_name, rel_abund_perc) %>%
        rename('bulk_abund' = rel_abund_perc) %>%
        mutate(bulk_abund = bulk_abund / 100)
}

df.comm = do.call('rbind', df.comm)
df.comm$SIM_rep = gsub('\\.[0-9]+$', '', rownames(df.comm))
rownames(df.comm) = 1:nrow(df.comm)
df.comm %>% head(n=3)









    





  library taxon_name bulk_abund SIM_rep
1       1      OTU.2 0.05856579       7
2       1      OTU.1 0.02801777       7
3       1      OTU.5 0.02715379       7



In [66]:

    
%%R
## joining tables
df.SIM.j = inner_join(df.SIM, df.comm, c('SIM_rep' = 'SIM_rep',
                                         'library' = 'library',
                                         'taxon' = 'taxon_name')) %>%
    filter(BD_mid >= min_BD, 
           BD_mid <= max_BD)
    
df.SIM.j %>% head(n=3)









    





  library    fraction taxon BD_min BD_mid BD_max count   rel_abund SIM_rep
1       1 1.672-1.676 OTU.1  1.672  1.674  1.676    25 0.001811725       7
2       1 1.676-1.680 OTU.1  1.676  1.678  1.680    55 0.005192108       7
3       1 1.680-1.683 OTU.1  1.680  1.681  1.683    84 0.003510092       7
  bulk_abund
1 0.02801777
2 0.02801777
3 0.02801777



In [67]:

    
%%R 
# filtering & combining emperical w/ simulated data

## emperical 
max_BD_range = max(df.EMP.j$Buoyant_density) - min(df.EMP.j$Buoyant_density)
df.EMP.j.f = df.EMP.j %>%
    filter(abundance > 0) %>%
    group_by(OTU) %>%
    summarize(mean_rel_abund = mean(bulk_abund),
              min_BD = min(Buoyant_density),
              max_BD = max(Buoyant_density),
              BD_range = max_BD - min_BD,
              BD_range_perc = BD_range / max_BD_range * 100) %>%
    ungroup() %>%
    mutate(dataset = 'emperical',
           SIM_rep = NA)

## simulated
max_BD_range = max(df.SIM.j$BD_mid) - min(df.SIM.j$BD_mid)
df.SIM.j.f = df.SIM.j %>%
    filter(count > 0) %>%
    group_by(SIM_rep, taxon) %>%
    summarize(mean_rel_abund = mean(bulk_abund),
              min_BD = min(BD_mid),
              max_BD = max(BD_mid),
              BD_range = max_BD - min_BD,
              BD_range_perc = BD_range / max_BD_range * 100) %>%
    ungroup() %>%
    rename('OTU' = taxon) %>%
    mutate(dataset = 'simulated')

## join
df.j = rbind(df.EMP.j.f, df.SIM.j.f) %>%
    filter(BD_range_perc > 0,
            mean_rel_abund > 0)

df.j$SIM_rep = reorder(df.j$SIM_rep, df.j$SIM_rep %>% as.numeric)

df.j %>% head(n=3)









    





Source: local data frame [3 x 8]

      OTU mean_rel_abund   min_BD   max_BD   BD_range BD_range_perc   dataset
    (chr)          (dbl)    (dbl)    (dbl)      (dbl)         (dbl)     (chr)
1   OTU.1    0.028017773 1.676135 1.773391 0.09725564           100 emperical
2  OTU.10    0.001172550 1.676135 1.773391 0.09725564           100 emperical
3 OTU.100    0.001049124 1.676135 1.773391 0.09725564           100 emperical
Variables not shown: SIM_rep (fctr)



In [68]:

    
%%R -h 400
## plotting
ggplot(df.j, aes(mean_rel_abund, BD_range_perc, color=SIM_rep)) +
    geom_point(alpha=0.3) +
    scale_x_log10() +
    scale_y_continuous() +
    labs(x='Pre-fractionation abundance', y='% of total BD range') +
    facet_grid(dataset ~ .) +
    theme_bw() +
    theme(
        text = element_text(size=16),
        panel.grid = element_blank()#,
        #legend.position = 'none'
        )

BD span of just overlapping taxa

Taxa overlapping between emperical data and genomes in dataset
These taxa should have the same relative abundances in both datasets.
- The comm file was created from the emperical dataset phyloseq file.



In [69]:

    
%%R -i targetFile

df.target = read.delim(targetFile, sep='\t')
df.target %>% nrow %>% print
df.target %>% head(n=3)









    





[1] 194
  cluster
1     212
2     762
3     663
                                                                                                   ssu_ID
1          rRNA_NC_014259_Acinetobacter_oleivorans_DR1__Acinetobacter_oleivorans_DR1_3479770-3481295_DIR-
2 rRNA_NC_019973_Mesorhizobium_australicum_WSM2073__Mesorhizobium_australicum_WSM207_1746046-1747518_DIR+
3  rRNA_NC_013037_Dyadobacter_fermentans_DSM_18053__Dyadobacter_fermentans_DSM_18053_4003859-4005353_DIR-
                                                                                         genome_fileID
1      /var/seq_data/ncbi_db/genome/Jan2016/bac_complete_spec-rep1_rn/Acinetobacter_oleivorans_DR1.fna
2 /var/seq_data/ncbi_db/genome/Jan2016/bac_complete_spec-rep1_rn/Mesorhizobium_australicum_WSM2073.fna
3  /var/seq_data/ncbi_db/genome/Jan2016/bac_complete_spec-rep1_rn/Dyadobacter_fermentans_DSM_18053.fna
                           genomeID
1      Acinetobacter_oleivorans_DR1
2 Mesorhizobium_australicum_WSM2073
3  Dyadobacter_fermentans_DSM_18053
                                                                   genome_seqID
1          NC_014259_Acinetobacter_oleivorans_DR1__Acinetobacter_oleivorans_DR1
2 NC_019973_Mesorhizobium_australicum_WSM2073__Mesorhizobium_australicum_WSM207
3  NC_013037_Dyadobacter_fermentans_DSM_18053__Dyadobacter_fermentans_DSM_18053
      OTU
1 OTU.188
2 OTU.226
3 OTU.121
                                                                                                        OTU_taxonomy
1  Bacteria:Proteobacteria:Gammaproteobacteria:Pseudomonadales:Moraxellaceae:Acinetobacter:Unclassified:Unclassified
2 Bacteria:Proteobacteria:Alphaproteobacteria:Rhizobiales:Phyllobacteriaceae:Mesorhizobium:Unclassified:Unclassified
3                 Bacteria:Bacteroidetes:Cytophagia:Cytophagales:Cytophagaceae:Dyadobacter:Unclassified:Unclassified



In [70]:

    
%%R
# filtering to just target taxa
df.j.t = df.j %>% 
    filter(OTU %in% df.target$OTU) 
df.j %>% nrow %>% print
df.j.t %>% nrow %>% print

## plotting
ggplot(df.j.t, aes(mean_rel_abund, BD_range_perc, color=SIM_rep)) +
    geom_point(alpha=0.5, shape='O') +
    scale_x_log10() +
    scale_y_continuous() +
    #scale_color_manual(values=c('blue', 'red')) +
    labs(x='Pre-fractionation abundance', y='% of total BD range') +
    facet_grid(dataset ~ .) +
    theme_bw() +
    theme(
        text = element_text(size=16),
        panel.grid = element_blank()#,
        #legend.position = 'none'
        )

Check

Are all target (overlapping) taxa the same relative abundances in both datasets?



In [71]:

    
%%R -w 600 -h 500
# formatting data
df.1 = df.j.t %>% 
    filter(dataset == 'simulated') %>%
    select(SIM_rep, OTU, mean_rel_abund, BD_range, BD_range_perc)

df.2 = df.j.t %>%
    filter(dataset == 'emperical') %>%
    select(SIM_rep, OTU, mean_rel_abund, BD_range, BD_range_perc)

df.12 = inner_join(df.1, df.2, c('OTU' = 'OTU')) %>%
    mutate(BD_diff_perc = BD_range_perc.x - BD_range_perc.y)


df.12$SIM_rep.x = reorder(df.12$SIM_rep.x, df.12$SIM_rep.x  %>% as.numeric)

## plotting
p1 = ggplot(df.12, aes(mean_rel_abund.x, mean_rel_abund.y)) +
    geom_point(alpha=0.5) +
    scale_x_log10() +
    scale_y_log10() +
    labs(x='Relative abundance (simulated)', y='Relative abundance (emperical)') +
    facet_wrap(~ SIM_rep.x)
    theme_bw() +
    theme(
        text = element_text(size=16),
        panel.grid = element_blank(),
        legend.position = 'none'
        )
p1

Correlation between relative abundance and BD_range diff

Are low abundant taxa more variable in their BD span



In [73]:

    
%%R -w 800 -h 500

ggplot(df.12, aes(mean_rel_abund.x, BD_diff_perc)) +
    geom_point(alpha=0.5) +
    scale_x_log10() +
    labs(x='Pre-fractionation relative abundance', 
         y='Difference in % of gradient spanned\n(emperical - simulated)',
         title='Overlapping taxa') +
    facet_wrap(~ SIM_rep.x) +
    theme_bw() +
    theme(
        text = element_text(size=16),
        panel.grid = element_blank(),
        legend.position = 'none'
        )

-- OLD --

Plotting number of taxa in each fraction

Emperical data (fullCyc)



In [15]:

    
%%R 
# simulated OTU table file
OTU.table.dir = '/home/nick/notebook/SIPSim/dev/fullCyc/frag_norm_9_2.5_n5/Day1_default_run/1e9/'
OTU.table.file = 'OTU_abs1e9_PCR_sub.txt'
#OTU.table.file = 'OTU_abs1e9_sub.txt'
#OTU.table.file = 'OTU_abs1e9.txt'



In [16]:

    
%%R -i physeqDir -i physeq_SIP_core -i bulk_days

# bulk core samples
F = file.path(physeqDir, physeq_SIP_core)
physeq.SIP.core = readRDS(F) 
physeq.SIP.core.m = physeq.SIP.core %>% sample_data

physeq.SIP.core = prune_samples(physeq.SIP.core.m$Substrate == '12C-Con' & 
                                physeq.SIP.core.m$Day %in% bulk_days, 
                                physeq.SIP.core) %>%
    filter_taxa(function(x) sum(x) > 0, TRUE)
physeq.SIP.core.m = physeq.SIP.core %>% sample_data        

physeq.SIP.core









    





phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 7025 taxa and 25 samples ]
sample_data() Sample Data:       [ 25 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 7025 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 7025 tips and 7024 internal nodes ]



In [17]:

    
%%R -w 800 -h 300 

## dataframe
df.EMP = physeq.SIP.core %>% otu_table %>%
    as.matrix %>% as.data.frame
df.EMP$OTU = rownames(df.EMP)
df.EMP = df.EMP %>%    
    gather(sample, abundance, 1:(ncol(df.EMP)-1)) 

df.EMP = inner_join(df.EMP, physeq.SIP.core.m, c('sample' = 'X.Sample')) 

df.EMP.nt = df.EMP %>%
    group_by(sample) %>%
    mutate(n_taxa = sum(abundance > 0)) %>%
    ungroup() %>%
    distinct(sample) %>%
    filter(Buoyant_density >= min_BD, 
           Buoyant_density <= max_BD)
    
df.EMP.nt %>% head(n=3)









    



Error in eval(expr, envir, enclos) : object 'min_BD' not found






    



/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Error in eval(expr, envir, enclos) : object 'min_BD' not found

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Warning message:

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: In inner_join_impl(x, y, by$x, by$y) :
  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: 
 
  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning:  joining factors with different levels, coercing to character vector

  res = super(Function, self).__call__(*new_args, **new_kwargs)

w/ simulated data



In [21]:

    
OTU_files = !find $buildDir -name "OTU_abs1e9_PCR_sub.txt"
OTU_files









    Out[21]:





['/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/7/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/9/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/8/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/3/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/10/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/2/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/1/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/4/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/5/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/6/OTU_abs1e9_PCR_sub.txt']



In [38]:

    
%%R -i OTU_files
# loading files

df.SIM = list()
for (x in OTU_files){
    SIM_rep = gsub('/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/', '', x)
    SIM_rep = gsub('/OTU_abs1e9_PCR_sub.txt', '', SIM_rep)
    df.SIM[[SIM_rep]] = read.delim(x, sep='\t') 
    }
df.SIM = do.call('rbind', df.SIM)
df.SIM$SIM_rep = gsub('\\.[0-9]+$', '', rownames(df.SIM))
rownames(df.SIM) = 1:nrow(df.SIM)
df.SIM %>% head









    





  library    fraction taxon BD_min BD_mid BD_max count  rel_abund SIM_rep
1       1  -inf-1.660 OTU.1   -Inf  1.659  1.659   204 0.01404378       7
2       1 1.660-1.661 OTU.1  1.660  1.660  1.661   376 0.01892110       7
3       1 1.661-1.665 OTU.1  1.661  1.663  1.665   169 0.01259690       7
4       1 1.665-1.669 OTU.1  1.665  1.667  1.669   150 0.01384019       7
5       1 1.669-1.672 OTU.1  1.669  1.671  1.672   323 0.01426615       7
6       1 1.672-1.675 OTU.1  1.672  1.673  1.675   249 0.01785458       7



In [40]:

    
%%R
## edit table
df.SIM.nt = df.SIM %>%
    filter(count > 0) %>%
    group_by(SIM_rep, library, BD_mid) %>%
    summarize(n_taxa = n()) %>%
    filter(BD_mid >= min_BD, 
           BD_mid <= max_BD)
df.SIM.nt %>% head









    





Source: local data frame [6 x 4]
Groups: SIM_rep, library [1]

  SIM_rep library BD_mid n_taxa
    (chr)   (int)  (dbl)  (int)
1       1       1  1.675   2128
2       1       1  1.679   1849
3       1       1  1.683   1770
4       1       1  1.688   1032
5       1       1  1.692    776
6       1       1  1.696    835



In [53]:

    
%%R -w 800 -h 300
# plotting
p = ggplot(df.SIM.nt, aes(BD_mid, n_taxa, group=SIM_rep)) +
    geom_point(color='red') +
    geom_line(color='red', alpha=0.5) +
    #geom_smooth(color='red') +
    geom_point(data=df.EMP.nt, aes(x=Buoyant_density), color='blue') +
    geom_line(data=df.EMP.nt, aes(x=Buoyant_density), color='blue') +
    #geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
    labs(x='Buoyant density', y='Number of taxa') +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        legend.position = 'none'
    )
p

Normalized by max sample abundance



In [54]:

    
%%R -w 800 -h 300
# normalized by max number of taxa

## edit table
df.SIM.nt = df.SIM.nt %>%
    group_by(SIM_rep) %>%
    mutate(n_taxa_norm = n_taxa / max(n_taxa))

df.EMP.nt = df.EMP.nt %>%
    group_by() %>%
    mutate(n_taxa_norm = n_taxa / max(n_taxa))


## plot
p = ggplot(df.SIM.nt, aes(BD_mid, n_taxa_norm, group=SIM_rep)) +
    geom_point(color='red') +
    geom_line(color='red') +
    geom_point(data=df.EMP.nt, aes(x=Buoyant_density), color='blue') +
    geom_line(data=df.EMP.nt, aes(x=Buoyant_density), color='blue') +
    #geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
    scale_y_continuous(limits=c(0, 1)) +
    labs(x='Buoyant density', y='Number of taxa\n(fraction of max)') +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        legend.position = 'none'
    )
p

Total sequence count



In [58]:

    
%%R -w 800 -h 300

# simulated
df.SIM.s = df.SIM %>%
    group_by(SIM_rep, library, BD_mid) %>%
    summarize(total_abund = sum(count)) %>%
    rename('Day' = library, 'Buoyant_density' = BD_mid) %>%
    ungroup() %>%
    mutate(dataset='simulated')
# emperical
df.EMP.s = df.EMP %>% 
    group_by(Day, Buoyant_density) %>%
    summarize(total_abund = sum(abundance))  %>%
    ungroup() %>%
    mutate(dataset='emperical', SIM_rep = NA)

# join
df.j = rbind(df.SIM.s, df.EMP.s) %>%
    filter(Buoyant_density >= min_BD, 
           Buoyant_density <= max_BD)
    
df.SIM.s = df.EMP.s = ""

# plot
ggplot(df.j, aes(Buoyant_density, total_abund, color=dataset, group=SIM_rep)) +
    geom_point() +
    geom_line(alpha=0.3) +
    geom_line(data=df.j %>% filter(dataset=='emperical')) +
    scale_color_manual(values=c('blue', 'red')) +
    labs(x='Buoyant density', y='Total sequences per sample') +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        legend.position = 'none'
    )

Plotting Shannon diversity for each



In [59]:

    
%%R
shannon_index_long = function(df, abundance_col, ...){
    # calculating shannon diversity index from a 'long' formated table
    ## community_col = name of column defining communities
    ## abundance_col = name of column defining taxon abundances
    df = df %>% as.data.frame
    cmd = paste0(abundance_col, '/sum(', abundance_col, ')')
    df.s = df %>%
        group_by_(...) %>%
        mutate_(REL_abundance = cmd) %>%
        mutate(pi__ln_pi = REL_abundance * log(REL_abundance),
               shannon = -sum(pi__ln_pi, na.rm=TRUE)) %>%
        ungroup() %>% 
        dplyr::select(-REL_abundance, -pi__ln_pi) %>%
        distinct_(...) 
    return(df.s)
}



In [60]:

    
%%R
# calculating shannon
df.SIM.shan = shannon_index_long(df.SIM, 'count', 'SIM_rep', 'library', 'fraction') %>%
    filter(BD_mid >= min_BD, 
           BD_mid <= max_BD)
    
df.EMP.shan = shannon_index_long(df.EMP, 'abundance', 'sample') %>%
    filter(Buoyant_density >= min_BD, 
           Buoyant_density <= max_BD)



In [62]:

    
%%R -w 800 -h 300
# plotting
p = ggplot(df.SIM.shan, aes(BD_mid, shannon, group=SIM_rep)) +
    geom_point(color='red') +
    geom_line(color='red', alpha=0.3) +
    geom_point(data=df.EMP.shan, aes(x=Buoyant_density), color='blue') +
    geom_line(data=df.EMP.shan, aes(x=Buoyant_density), color='blue') +
    scale_y_continuous(limits=c(4, 7.5)) +
    labs(x='Buoyant density', y='Shannon index') +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        legend.position = 'none'
    )
p

min/max abundances of taxa



In [65]:

    
%%R -h 300 -w 800

# simulated
df.SIM.s = df.SIM %>% 
    filter(rel_abund > 0) %>%
    group_by(SIM_rep, BD_mid) %>%
    summarize(min_abund = min(rel_abund),
              max_abund = max(rel_abund)) %>%
    ungroup() %>%
    rename('Buoyant_density' = BD_mid) %>%
    mutate(dataset = 'simulated')

# emperical
df.EMP.s = df.EMP %>%
    group_by(Buoyant_density) %>%
    mutate(rel_abund = abundance / sum(abundance)) %>%
    filter(rel_abund > 0) %>%
    summarize(min_abund = min(rel_abund),
              max_abund = max(rel_abund)) %>%
    ungroup() %>%
    mutate(dataset = 'emperical',
           SIM_rep = NA)

df.j = rbind(df.SIM.s, df.EMP.s) %>%
    filter(Buoyant_density >= min_BD, 
           Buoyant_density <= max_BD)
    

# plotting
ggplot(df.j, aes(Buoyant_density, max_abund, color=dataset, group=SIM_rep)) +
    geom_point() +
    geom_line(alpha=0.3) +
    geom_line(data=df.j %>% filter(dataset=='emperical')) +
    scale_color_manual(values=c('blue', 'red')) +
    labs(x='Buoyant density', y='Maximum relative abundance') +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        legend.position = 'none'
    )

BD range where an OTU is detected

Do the simulated OTU BD distributions span the same BD range of the emperical data?

Simulated



In [67]:

    
# loading comm files
comm_files = !find $buildDir -name "bulk-core_comm_target.txt"
comm_files









    Out[67]:





['/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/7/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/9/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/8/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/3/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/10/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/2/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/1/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/4/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/5/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/6/bulk-core_comm_target.txt']



In [74]:

    
%%R -i comm_files

df.comm = list()
for (f in comm_files){
    rep = gsub('.+/Day1_rep10/([0-9]+)/.+', '\\1', f)
    df.comm[[rep]] = read.delim(f, sep='\t') %>%
        dplyr::select(library, taxon_name, rel_abund_perc) %>%
        rename('bulk_abund' = rel_abund_perc) %>%
        mutate(bulk_abund = bulk_abund / 100)
}

df.comm = do.call('rbind', df.comm)
df.comm$SIM_rep = gsub('\\.[0-9]+$', '', rownames(df.comm))
rownames(df.comm) = 1:nrow(df.comm)
df.comm %>% head(n=3)









    





  library taxon_name bulk_abund SIM_rep
1       1  OTU.14142 0.05918292       7
2       1      OTU.2 0.05856579       7
3       1  OTU.12920 0.04424833       7



In [75]:

    
%%R

## joining
df.SIM.j = inner_join(df.SIM, df.comm, c('SIM_rep' = 'SIM_rep',
                                         'library' = 'library',
                                         'taxon' = 'taxon_name')) %>%
    filter(BD_mid >= min_BD, 
           BD_mid <= max_BD)
    
df.SIM.j %>% head(n=3)









    





  library    fraction taxon BD_min BD_mid BD_max count   rel_abund SIM_rep
1       1 1.675-1.681 OTU.1  1.675  1.678  1.681   295 0.016215028       7
2       1 1.681-1.685 OTU.1  1.681  1.683  1.685   291 0.013029462       7
3       1 1.685-1.688 OTU.1  1.685  1.687  1.688    94 0.003159664       7
  bulk_abund
1 0.02801777
2 0.02801777
3 0.02801777

Emperical



In [107]:

    
%%R
bulk_days = c(1)



In [108]:

    
%%R
physeq.dir = '/var/seq_data/fullCyc/MiSeq_16SrRNA/515f-806r/lib1-7/phyloseq/'
physeq.bulk = 'bulk-core'
physeq.file = file.path(physeq.dir, physeq.bulk)
physeq.bulk = readRDS(physeq.file)
physeq.bulk.m = physeq.bulk %>% sample_data
physeq.bulk = prune_samples(physeq.bulk.m$Exp_type == 'microcosm_bulk' &
                            physeq.bulk.m$Day %in% bulk_days, physeq.bulk)

physeq.bulk.m = physeq.bulk %>% sample_data
physeq.bulk









    





phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 4950 taxa and 1 samples ]
sample_data() Sample Data:       [ 1 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 4950 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 4950 tips and 4949 internal nodes ]

Emperical



In [76]:

    
%%R
bulk_days = c(1)



In [77]:

    
%%R
physeq.dir = '/var/seq_data/fullCyc/MiSeq_16SrRNA/515f-806r/lib1-7/phyloseq/'
physeq.bulk = 'bulk-core'
physeq.file = file.path(physeq.dir, physeq.bulk)
physeq.bulk = readRDS(physeq.file)
physeq.bulk.m = physeq.bulk %>% sample_data
physeq.bulk = prune_samples(physeq.bulk.m$Exp_type == 'microcosm_bulk' &
                            physeq.bulk.m$Day %in% bulk_days, physeq.bulk)

physeq.bulk.m = physeq.bulk %>% sample_data
physeq.bulk









    





phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 4950 taxa and 1 samples ]
sample_data() Sample Data:       [ 1 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 4950 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 4950 tips and 4949 internal nodes ]



In [78]:

    
%%R
physeq.bulk.n = transform_sample_counts(physeq.bulk, function(x) x/sum(x))
physeq.bulk.n









    





phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 4950 taxa and 1 samples ]
sample_data() Sample Data:       [ 1 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 4950 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 4950 tips and 4949 internal nodes ]



In [79]:

    
%%R
# making long format of each bulk table
bulk.otu = physeq.bulk.n %>% otu_table %>% as.data.frame
ncol = ncol(bulk.otu)
bulk.otu$OTU = rownames(bulk.otu)
bulk.otu = bulk.otu %>%
    gather(sample, abundance, 1:ncol) 

bulk.otu = inner_join(physeq.bulk.m, bulk.otu, c('X.Sample' = 'sample')) %>%
    dplyr::select(OTU, abundance) %>%
    rename('bulk_abund' = abundance)
bulk.otu %>% head(n=3)









    





       OTU   bulk_abund
1 OTU.1101 1.234263e-04
2 OTU.1130 6.171316e-05
3 OTU.9833 0.000000e+00



In [81]:

    
%%R
# joining tables
df.EMP.j = inner_join(df.EMP, bulk.otu, c('OTU' = 'OTU')) %>%
    filter(Buoyant_density >= min_BD, 
           Buoyant_density <= max_BD) 
    
df.EMP.j %>% head(n=3)









    





       OTU            sample abundance primer_number fwd_barcode rev_barcode
1 OTU.1101 12C-Con.D1.R2_F23         2           134    TCGACGAG    TGAGTACG
2 OTU.1130 12C-Con.D1.R2_F23         0           134    TCGACGAG    TGAGTACG
3 OTU.9833 12C-Con.D1.R2_F23         0           134    TCGACGAG    TGAGTACG
  Substrate Day Microcosm_replicate Fraction Buoyant_density Sample_type
1   12C-Con   1                   2       23         1.69362     unknown
2   12C-Con   1                   2       23         1.69362     unknown
3   12C-Con   1                   2       23         1.69362     unknown
         library Exp_type Sample_location Sample_date Sample_treatment
1 150721_V4_Lib4      SIP              NA          NA               NA
2 150721_V4_Lib4      SIP              NA          NA               NA
3 150721_V4_Lib4      SIP              NA          NA               NA
  Sample_subtreatment core_dataset   bulk_abund
1                  NA         TRUE 1.234263e-04
2                  NA         TRUE 6.171316e-05
3                  NA         TRUE 0.000000e+00



In [102]:

    
%%R 
# filtering & combining emperical w/ simulated data

## emperical 
max_BD_range = max(df.EMP.j$Buoyant_density) - min(df.EMP.j$Buoyant_density)
df.EMP.j.f = df.EMP.j %>%
    filter(abundance > 0) %>%
    group_by(OTU) %>%
    summarize(mean_rel_abund = mean(bulk_abund),
              min_BD = min(Buoyant_density),
              max_BD = max(Buoyant_density),
              BD_range = max_BD - min_BD,
              BD_range_perc = BD_range / max_BD_range * 100) %>%
    ungroup() %>%
    mutate(dataset = 'emperical',
           SIM_rep = NA)

## simulated
max_BD_range = max(df.SIM.j$BD_mid) - min(df.SIM.j$BD_mid)
df.SIM.j.f = df.SIM.j %>%
    filter(count > 0) %>%
    group_by(SIM_rep, taxon) %>%
    summarize(mean_rel_abund = mean(bulk_abund),
              min_BD = min(BD_mid),
              max_BD = max(BD_mid),
              BD_range = max_BD - min_BD,
              BD_range_perc = BD_range / max_BD_range * 100) %>%
    ungroup() %>%
    rename('OTU' = taxon) %>%
    mutate(dataset = 'simulated')

## join
df.j = rbind(df.EMP.j.f, df.SIM.j.f) %>%
    filter(BD_range_perc > 0,
            mean_rel_abund > 0)

df.j %>% head(n=3)









    





Source: local data frame [3 x 8]

      OTU mean_rel_abund   min_BD   max_BD   BD_range BD_range_perc   dataset
    (chr)          (dbl)    (dbl)    (dbl)      (dbl)         (dbl)     (chr)
1   OTU.1    0.028017773 1.676135 1.773391 0.09725564           100 emperical
2  OTU.10    0.001172550 1.676135 1.773391 0.09725564           100 emperical
3 OTU.100    0.001049124 1.676135 1.773391 0.09725564           100 emperical
Variables not shown: SIM_rep (chr)



In [103]:

    
%%R -h 400
## plotting
ggplot(df.j, aes(mean_rel_abund, BD_range_perc, color=SIM_rep)) +
    geom_point(alpha=0.5, shape='O') +
    scale_x_log10() +
    scale_y_continuous() +
    labs(x='Pre-fractionation abundance', y='% of total BD range') +
    facet_grid(dataset ~ .) +
    theme_bw() +
    theme(
        text = element_text(size=16),
        panel.grid = element_blank()#,
        #legend.position = 'none'
        )



In [106]:

    
%%R -h 400
## plotting
ggplot(df.j, aes(mean_rel_abund, BD_range_perc, color=dataset)) +
    #geom_point(alpha=0.5, shape='O') +
    stat_density2d() +
    scale_x_log10() +
    scale_y_continuous() +
    scale_color_manual(values=c('blue', 'red')) +
    labs(x='Pre-fractionation abundance', y='% of total BD range') +
    #geom_vline(xintercept=0.001, linetype='dashed', alpha=0.5) +
    facet_grid(dataset ~ .) +
    theme_bw() +
    theme(
        text = element_text(size=16),
        panel.grid = element_blank(),
        legend.position = 'none'
        )

BD span of just overlapping taxa

Taxa overlapping between emperical data and genomes in dataset
These taxa should have the same relative abundances in both datasets.
- The comm file was created from the emperical dataset phyloseq file.



In [119]:

    
%%R -i targetFile

df.target = read.delim(targetFile, sep='\t')
df.target %>% nrow %>% print
df.target %>% head(n=3)









    





[1] 194
  cluster
1     212
2     762
3     663
                                                                                                   ssu_ID
1          rRNA_NC_014259_Acinetobacter_oleivorans_DR1__Acinetobacter_oleivorans_DR1_3479770-3481295_DIR-
2 rRNA_NC_019973_Mesorhizobium_australicum_WSM2073__Mesorhizobium_australicum_WSM207_1746046-1747518_DIR+
3  rRNA_NC_013037_Dyadobacter_fermentans_DSM_18053__Dyadobacter_fermentans_DSM_18053_4003859-4005353_DIR-
                                                                                         genome_fileID
1      /var/seq_data/ncbi_db/genome/Jan2016/bac_complete_spec-rep1_rn/Acinetobacter_oleivorans_DR1.fna
2 /var/seq_data/ncbi_db/genome/Jan2016/bac_complete_spec-rep1_rn/Mesorhizobium_australicum_WSM2073.fna
3  /var/seq_data/ncbi_db/genome/Jan2016/bac_complete_spec-rep1_rn/Dyadobacter_fermentans_DSM_18053.fna
                           genomeID
1      Acinetobacter_oleivorans_DR1
2 Mesorhizobium_australicum_WSM2073
3  Dyadobacter_fermentans_DSM_18053
                                                                   genome_seqID
1          NC_014259_Acinetobacter_oleivorans_DR1__Acinetobacter_oleivorans_DR1
2 NC_019973_Mesorhizobium_australicum_WSM2073__Mesorhizobium_australicum_WSM207
3  NC_013037_Dyadobacter_fermentans_DSM_18053__Dyadobacter_fermentans_DSM_18053
      OTU
1 OTU.188
2 OTU.226
3 OTU.121
                                                                                                        OTU_taxonomy
1  Bacteria:Proteobacteria:Gammaproteobacteria:Pseudomonadales:Moraxellaceae:Acinetobacter:Unclassified:Unclassified
2 Bacteria:Proteobacteria:Alphaproteobacteria:Rhizobiales:Phyllobacteriaceae:Mesorhizobium:Unclassified:Unclassified
3                 Bacteria:Bacteroidetes:Cytophagia:Cytophagales:Cytophagaceae:Dyadobacter:Unclassified:Unclassified



In [145]:

    
%%R
# filtering to just target taxa
df.j.t = df.j %>% 
    filter(OTU %in% df.target$OTU) 
df.j %>% nrow %>% print
df.j.t %>% nrow %>% print

## plotting
ggplot(df.j.t, aes(mean_rel_abund, BD_range_perc, color=SIM_rep)) +
    geom_point(alpha=0.5, shape='O') +
    scale_x_log10() +
    scale_y_continuous() +
    #scale_color_manual(values=c('blue', 'red')) +
    labs(x='Pre-fractionation abundance', y='% of total BD range') +
    facet_grid(dataset ~ .) +
    theme_bw() +
    theme(
        text = element_text(size=16),
        panel.grid = element_blank()#,
        #legend.position = 'none'
        )

Check

Are all target (overlapping) taxa the same relative abundances in both datasets?



In [150]:

    
%%R -w 600 -h 500
# formatting data
df.1 = df.j.t %>% 
    filter(dataset == 'simulated') %>%
    select(SIM_rep, OTU, mean_rel_abund, BD_range, BD_range_perc)

df.2 = df.j.t %>%
    filter(dataset == 'emperical') %>%
    select(SIM_rep, OTU, mean_rel_abund, BD_range, BD_range_perc)

df.12 = inner_join(df.1, df.2, c('OTU' = 'OTU')) %>%
    mutate(BD_diff_perc = BD_range_perc.x - BD_range_perc.y)


df.12$SIM_rep.x = reorder(df.12$SIM_rep.x, df.12$SIM_rep.x  %>% as.numeric)

## plotting
p1 = ggplot(df.12, aes(mean_rel_abund.x, mean_rel_abund.y)) +
    geom_point(alpha=0.5) +
    scale_x_log10() +
    scale_y_log10() +
    labs(x='Relative abundance (simulated)', y='Relative abundance (emperical)') +
    facet_wrap(~ SIM_rep.x)
    theme_bw() +
    theme(
        text = element_text(size=16),
        panel.grid = element_blank(),
        legend.position = 'none'
        )
p1

Correlation between relative abundance and BD_range diff

Are low abundant taxa more variable in their BD span



In [151]:

    
%%R -w 800 -h 500

ggplot(df.12, aes(mean_rel_abund.x, BD_diff_perc)) +
    geom_point(alpha=0.5) +
    scale_x_log10() +
    labs(x='Relative abundance', 
         y='Difference in % of gradient spanned\n(simulated vs emperical)',
         title='Overlapping taxa') +
    facet_wrap(~ SIM_rep.x) +
    theme_bw() +
    theme(
        text = element_text(size=16),
        panel.grid = element_blank(),
        legend.position = 'none'
        )



In [ ]:

Plotting abundance distributions



In [113]:

    
%%R -w 800 -h 400

plot_abund_dists = function(df.EMP.j, df.SIM.j, sim_rep){
    ## emperical 
    df.EMP.j.f = df.EMP.j %>%
        filter(abundance > 0) %>%
        dplyr::select(OTU, sample, abundance, Buoyant_density, bulk_abund) %>%
        mutate(dataset = 'emperical') 
    
    ## simulated
    df.SIM.j.f = df.SIM.j %>%
        filter(SIM_rep == sim_rep) %>%
        filter(count > 0) %>%
        dplyr::select(taxon, fraction, count, BD_mid, bulk_abund) %>%
        rename('OTU' = taxon,
               'sample' = fraction,
               'Buoyant_density' = BD_mid,
               'abundance' = count) %>%
        mutate(dataset = 'simulated') 
    
    df.j = rbind(df.EMP.j.f, df.SIM.j.f) %>%
        group_by(sample) %>%
        mutate(rel_abund = abundance / sum(abundance))
    
    title = paste0('Simulation Replicate ', sim_rep)
    
    p = ggplot(df.j, aes(Buoyant_density, abundance, fill=OTU)) +
        geom_area(stat='identity', position='dodge', alpha=0.5) +
        labs(x='Buoyant density', y='Subsampled community\n(absolute abundance)', title=title) +
        facet_grid(dataset ~ .) +
        theme_bw() +
        theme( 
            text = element_text(size=16),
            legend.position = 'none',
              axis.title.y = element_text(vjust=1),        
              axis.title.x = element_blank(),
              plot.margin=unit(c(0.1,1,0.1,1), "cm")
             )
    return(p)
    }

plot_abund_dists(df.EMP.j, df.SIM.j, sim_rep=1)



In [114]:

    
%%R -w 800 -h 400

plot_abund_dists(df.EMP.j, df.SIM.j, sim_rep=2)



In [115]:

    
%%R -w 800 -h 400

plot_abund_dists(df.EMP.j, df.SIM.j, sim_rep=3)



In [116]:

    
%%R -w 800 -h 400

plot_abund_dists(df.EMP.j, df.SIM.j, sim_rep=4)



In [117]:

    
%%R -w 800 -h 400

plot_abund_dists(df.EMP.j, df.SIM.j, sim_rep=5)



In [118]:

    
%%R -w 800 -h 400

plot_abund_dists(df.EMP.j, df.SIM.j, sim_rep=6)



In [132]:

    
%%R -w 800 -h 400

plot_abund_dists(df.EMP.j, df.SIM.j, sim_rep=10)

Notes

The lack of full-BD span for some simulation replicates is due to certain low/high GC taxa being very abundant