Goal

Simulating fullCyc Day1 control gradients
- Not simulating incorporation (all 0% isotope incorp.)
  - Don't know how much true incorporatation for emperical data
Richness = genome reference pool size (n=1147)
Simulating 10 replicate gradients to assess simulation stochasticity.

Init



In [1]:

    
import os
import glob
import re
import nestly



In [2]:

    
%load_ext rpy2.ipython
%load_ext pushnote



In [3]:

    
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)
library(phyloseq)

## BD for G+C of 0 or 100
BD.GCp0 = 0 * 0.098 + 1.66
BD.GCp100 = 1 * 0.098 + 1.66









    



/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Need help? Try the ggplot2 mailing list:
http://groups.google.com/group/ggplot2.

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: 
Attaching package: ‘dplyr’


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following objects are masked from ‘package:stats’:

    filter, lag


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


  res = super(Function, self).__call__(*new_args, **new_kwargs)

Nestly

assuming fragments already simulated



In [4]:

    
workDir = '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/'
buildDir = os.path.join(workDir, 'Day1_rep10')
R_dir = '/home/nick/notebook/SIPSim/lib/R/'

fragFile= '/home/nick/notebook/SIPSim/dev/bac_genome1147/validation/ampFrags.pkl'
targetFile = '/home/nick/notebook/SIPSim/dev/fullCyc/CD-HIT/target_taxa.txt'

physeqDir = '/var/seq_data/fullCyc/MiSeq_16SrRNA/515f-806r/lib1-7/phyloseq/'
physeq_bulkCore = 'bulk-core'
physeq_SIP_core = 'SIP-core_unk'

nreps = 10
prefrac_comm_abundance = '1e9'

seq_per_fraction = ['lognormal', 9.432, 0.5, 10000, 30000] # dist, mean, scale, min, max
bulk_days = [1]
nprocs = 16



In [5]:

    
# building tree structure
nest = nestly.Nest()

    ## varying params
nest.add('rep', [x + 1 for x in xrange(nreps)])

## set params
nest.add('bulk_day', bulk_days, create_dir=False)
nest.add('abs', [prefrac_comm_abundance], create_dir=False)
nest.add('percIncorp', [0], create_dir=False)
nest.add('percTaxa', [0], create_dir=False)
nest.add('np', [nprocs], create_dir=False)
nest.add('subsample_dist', [seq_per_fraction[0]], create_dir=False)
nest.add('subsample_mean', [seq_per_fraction[1]], create_dir=False)
nest.add('subsample_scale', [seq_per_fraction[2]], create_dir=False)
nest.add('subsample_min', [seq_per_fraction[3]], create_dir=False)
nest.add('subsample_max', [seq_per_fraction[4]], create_dir=False)

### input/output files
nest.add('buildDir', [buildDir], create_dir=False)
nest.add('R_dir', [R_dir], create_dir=False)
nest.add('fragFile', [fragFile], create_dir=False)
nest.add('targetFile', [targetFile], create_dir=False)
nest.add('physeqDir', [physeqDir], create_dir=False)
nest.add('physeq_bulkCore', [physeq_bulkCore], create_dir=False)


# building directory tree
nest.build(buildDir)

# bash file to run
bashFile = os.path.join(buildDir, 'SIPSimRun.sh')



In [6]:

    
%%writefile $bashFile
#!/bin/bash

export PATH={R_dir}:$PATH

#-- making DNA pool similar to gradient of interest
echo '# Creating comm file from phyloseq'
phyloseq2comm.r {physeqDir}{physeq_bulkCore} -s 12C-Con -d {bulk_day} > {physeq_bulkCore}_comm.txt
printf 'Number of lines: '; wc -l {physeq_bulkCore}_comm.txt

echo '## Adding target taxa to comm file'
comm_add_target.r {physeq_bulkCore}_comm.txt {targetFile} > {physeq_bulkCore}_comm_target.txt
printf 'Number of lines: '; wc -l {physeq_bulkCore}_comm_target.txt

echo '## parsing out genome fragments to make simulated DNA pool resembling the gradient of interest'
## all OTUs without an associated reference genome will be assigned a random reference (of the reference genome pool)
### this is done through --NA-random
SIPSim fragment_KDE_parse {fragFile} {physeq_bulkCore}_comm_target.txt \
    --rename taxon_name --NA-random > fragsParsed.pkl


echo '#-- SIPSim pipeline --#'
echo '# converting fragments to KDE'
SIPSim fragment_KDE \
    fragsParsed.pkl \
    > fragsParsed_KDE.pkl
    
echo '# adding diffusion'
SIPSim diffusion \
    fragsParsed_KDE.pkl \
    --np {np} \
    > fragsParsed_KDE_dif.pkl    

echo '# adding DBL contamination'
SIPSim DBL \
    fragsParsed_KDE_dif.pkl \
    --np {np} \
    > fragsParsed_KDE_dif_DBL.pkl      
    
echo '# making incorp file'
SIPSim incorpConfigExample \
  --percTaxa {percTaxa} \
  --percIncorpUnif {percIncorp} \
  > {percTaxa}_{percIncorp}.config

echo '# adding isotope incorporation to BD distribution'
SIPSim isotope_incorp \
    fragsParsed_KDE_dif_DBL.pkl \
    {percTaxa}_{percIncorp}.config \
    --comm {physeq_bulkCore}_comm_target.txt \
    --np {np} \
    > fragsParsed_KDE_dif_DBL_inc.pkl
    
#echo '# calculating BD shift from isotope incorporation'
#SIPSim BD_shift \
#    fragsParsed_KDE_dif_DBL.pkl \
#    fragsParsed_KDE_dif_DBL_inc.pkl \
#    --np {np} \
#    > fragsParsed_KDE_dif_DBL_inc_BD-shift.txt

echo '# simulating gradient fractions'
SIPSim gradient_fractions \
    {physeq_bulkCore}_comm_target.txt \
    > fracs.txt 

echo '# simulating an OTU table'
SIPSim OTU_table \
    fragsParsed_KDE_dif_DBL_inc.pkl \
    {physeq_bulkCore}_comm_target.txt \
    fracs.txt \
    --abs {abs} \
    --np {np} \
    > OTU_abs{abs}.txt
    
echo '# simulating PCR'
SIPSim OTU_PCR \
    OTU_abs{abs}.txt \
    > OTU_abs{abs}_PCR.txt    
    
echo '# subsampling from the OTU table (simulating sequencing of the DNA pool)'
SIPSim OTU_subsample \
    --dist {subsample_dist} \
    --dist_params mean:{subsample_mean},sigma:{subsample_scale} \
    --min_size {subsample_min} \
    --max_size {subsample_max} \
    OTU_abs{abs}_PCR.txt \
    > OTU_abs{abs}_PCR_sub.txt
        
echo '# making a wide-formatted table'
SIPSim OTU_wideLong -w \
    OTU_abs{abs}_PCR_sub.txt \
    > OTU_abs{abs}_PCR_sub_w.txt
    
echo '# making metadata (phyloseq: sample_data)'
SIPSim OTU_sampleData \
    OTU_abs{abs}_PCR_sub.txt \
    > OTU_abs{abs}_PCR_sub_meta.txt









    



Writing /home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/SIPSimRun.sh



In [ ]:

    
!chmod 777 $bashFile
!cd $workDir; \
    nestrun  --template-file $bashFile -d Day1_rep10 --log-file log.txt -j 1









    



2016-02-04 13:55:17,476 * INFO * Template: ./SIPSimRun.sh
2016-02-04 13:55:17,479 * INFO * [161975] Started ./SIPSimRun.sh in Day1_rep10/7



In [ ]:

    
%pushnote Day1_rep10 complete

BD min/max

what is the min/max BD that we care about?



In [11]:

    
%%R
## min G+C cutoff
min_GC = 13.5
## max G+C cutoff
max_GC = 80
## max G+C shift
max_13C_shift_in_BD = 0.036


min_BD = min_GC/100.0 * 0.098 + 1.66    
max_BD = max_GC/100.0 * 0.098 + 1.66    

max_BD = max_BD + max_13C_shift_in_BD

cat('Min BD:', min_BD, '\n')
cat('Max BD:', max_BD, '\n')









    





Min BD: 1.67323 
Max BD: 1.7744

Loading data

Emperical

SIP data



In [62]:

    
%%R 
# simulated OTU table file
OTU.table.dir = '/home/nick/notebook/SIPSim/dev/fullCyc/frag_norm_9_2.5_n5/Day1_default_run/1e9/'
OTU.table.file = 'OTU_abs1e9_PCR_sub.txt'
#OTU.table.file = 'OTU_abs1e9_sub.txt'
#OTU.table.file = 'OTU_abs1e9.txt'



In [26]:

    
%%R -i physeqDir -i physeq_SIP_core -i bulk_days

# bulk core samples
F = file.path(physeqDir, physeq_SIP_core)
physeq.SIP.core = readRDS(F) 
physeq.SIP.core.m = physeq.SIP.core %>% sample_data

physeq.SIP.core = prune_samples(physeq.SIP.core.m$Substrate == '12C-Con' & 
                                physeq.SIP.core.m$Day %in% bulk_days, 
                                physeq.SIP.core) %>%
    filter_taxa(function(x) sum(x) > 0, TRUE)
physeq.SIP.core.m = physeq.SIP.core %>% sample_data        

physeq.SIP.core









    





phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 7025 taxa and 25 samples ]
sample_data() Sample Data:       [ 25 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 7025 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 7025 tips and 7024 internal nodes ]



In [64]:

    
%%R 
## dataframe
df.EMP = physeq.SIP.core %>% otu_table %>%
    as.matrix %>% as.data.frame
df.EMP$OTU = rownames(df.EMP)
df.EMP = df.EMP %>%    
    gather(sample, abundance, 1:(ncol(df.EMP)-1)) 

df.EMP = inner_join(df.EMP, physeq.SIP.core.m, c('sample' = 'X.Sample')) 

df.EMP.nt = df.EMP %>%
    group_by(sample) %>%
    mutate(n_taxa = sum(abundance > 0)) %>%
    ungroup() %>%
    distinct(sample) %>%
    filter(Buoyant_density >= min_BD, 
           Buoyant_density <= max_BD)
    
df.EMP.nt %>% head(n=3)









    





Source: local data frame [3 x 20]

       OTU            sample abundance primer_number fwd_barcode rev_barcode
     (chr)             (chr)     (dbl)         (int)      (fctr)      (fctr)
1 OTU.1101 12C-Con.D1.R2_F23         2           134    TCGACGAG    TGAGTACG
2 OTU.1101 12C-Con.D1.R2_F18         0           129    CTACTATA    TGAGTACG
3 OTU.1101 12C-Con.D1.R2_F20         1           131    AGAGTCAC    TGAGTACG
Variables not shown: Substrate (fctr), Day (int), Microcosm_replicate (int),
  Fraction (int), Buoyant_density (dbl), Sample_type (fctr), library (fctr),
  Exp_type (fctr), Sample_location (lgl), Sample_date (lgl), Sample_treatment
  (lgl), Sample_subtreatment (lgl), core_dataset (lgl), n_taxa (int)

bulk soil samples



In [27]:

    
%%R
physeq.dir = '/var/seq_data/fullCyc/MiSeq_16SrRNA/515f-806r/lib1-7/phyloseq/'
physeq.bulk = 'bulk-core'
physeq.file = file.path(physeq.dir, physeq.bulk)
physeq.bulk = readRDS(physeq.file)
physeq.bulk.m = physeq.bulk %>% sample_data
physeq.bulk = prune_samples(physeq.bulk.m$Exp_type == 'microcosm_bulk' &
                            physeq.bulk.m$Day %in% bulk_days, physeq.bulk)

physeq.bulk.m = physeq.bulk %>% sample_data
physeq.bulk









    





phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 4950 taxa and 1 samples ]
sample_data() Sample Data:       [ 1 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 4950 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 4950 tips and 4949 internal nodes ]



In [28]:

    
%%R
physeq.bulk.n = transform_sample_counts(physeq.bulk, function(x) x/sum(x))
physeq.bulk.n









    





phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 4950 taxa and 1 samples ]
sample_data() Sample Data:       [ 1 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 4950 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 4950 tips and 4949 internal nodes ]



In [29]:

    
%%R
# making long format of each bulk table
bulk.otu = physeq.bulk.n %>% otu_table %>% as.data.frame
ncol = ncol(bulk.otu)
bulk.otu$OTU = rownames(bulk.otu)
bulk.otu = bulk.otu %>%
    gather(sample, abundance, 1:ncol) 

bulk.otu = inner_join(physeq.bulk.m, bulk.otu, c('X.Sample' = 'sample')) %>%
    dplyr::select(OTU, abundance) %>%
    rename('bulk_abund' = abundance)
bulk.otu %>% head(n=3)









    





       OTU   bulk_abund
1 OTU.1101 1.234263e-04
2 OTU.1130 6.171316e-05
3 OTU.9833 0.000000e+00



In [30]:

    
%%R
# joining tables
df.EMP.j = inner_join(df.EMP, bulk.otu, c('OTU' = 'OTU')) %>%
    filter(Buoyant_density >= min_BD, 
           Buoyant_density <= max_BD) 
    
df.EMP.j %>% head(n=3)









    





       OTU            sample abundance primer_number fwd_barcode rev_barcode
1 OTU.1101 12C-Con.D1.R2_F23         2           134    TCGACGAG    TGAGTACG
2 OTU.1130 12C-Con.D1.R2_F23         0           134    TCGACGAG    TGAGTACG
3 OTU.9833 12C-Con.D1.R2_F23         0           134    TCGACGAG    TGAGTACG
  Substrate Day Microcosm_replicate Fraction Buoyant_density Sample_type
1   12C-Con   1                   2       23         1.69362     unknown
2   12C-Con   1                   2       23         1.69362     unknown
3   12C-Con   1                   2       23         1.69362     unknown
         library Exp_type Sample_location Sample_date Sample_treatment
1 150721_V4_Lib4      SIP              NA          NA               NA
2 150721_V4_Lib4      SIP              NA          NA               NA
3 150721_V4_Lib4      SIP              NA          NA               NA
  Sample_subtreatment core_dataset   bulk_abund
1                  NA         TRUE 1.234263e-04
2                  NA         TRUE 6.171316e-05
3                  NA         TRUE 0.000000e+00

Simulated



In [31]:

    
OTU_files = !find $buildDir -name "OTU_abs1e9_PCR_sub.txt"
OTU_files









    Out[31]:





['/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/7/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/9/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/8/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/3/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/10/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/2/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/1/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/4/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/5/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/6/OTU_abs1e9_PCR_sub.txt']



In [15]:

    
%%R -i OTU_files
# loading files

df.SIM = list()
for (x in OTU_files){
    SIM_rep = gsub('/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/', '', x)
    SIM_rep = gsub('/OTU_abs1e9_PCR_sub.txt', '', SIM_rep)
    df.SIM[[SIM_rep]] = read.delim(x, sep='\t') 
    }
df.SIM = do.call('rbind', df.SIM)
df.SIM$SIM_rep = gsub('\\.[0-9]+$', '', rownames(df.SIM))
rownames(df.SIM) = 1:nrow(df.SIM)
df.SIM %>% head









    





  library    fraction taxon BD_min BD_mid BD_max count  rel_abund SIM_rep
1       1  -inf-1.660 OTU.1   -Inf  1.659  1.659   377 0.01748041       7
2       1 1.660-1.663 OTU.1  1.660  1.661  1.663   175 0.01552657       7
3       1 1.663-1.666 OTU.1  1.663  1.664  1.666   154 0.01506112       7
4       1 1.666-1.668 OTU.1  1.666  1.667  1.668   251 0.01859673       7
5       1 1.668-1.673 OTU.1  1.668  1.671  1.673   231 0.02040816       7
6       1 1.673-1.678 OTU.1  1.673  1.675  1.678   137 0.01364542       7



In [16]:

    
%%R
## edit table
df.SIM.nt = df.SIM %>%
    filter(count > 0) %>%
    group_by(SIM_rep, library, BD_mid) %>%
    summarize(n_taxa = n()) %>%
    filter(BD_mid >= min_BD, 
           BD_mid <= max_BD)
df.SIM.nt %>% head









    





Source: local data frame [6 x 4]
Groups: SIM_rep, library [1]

  SIM_rep library BD_mid n_taxa
    (chr)   (int)  (dbl)  (int)
1       1       1  1.677   1632
2       1       1  1.681   1631
3       1       1  1.684   1531
4       1       1  1.687   1129
5       1       1  1.690    604
6       1       1  1.693    532

'bulk soil' community files



In [32]:

    
# loading comm files
comm_files = !find $buildDir -name "bulk-core_comm_target.txt"
comm_files









    Out[32]:





['/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/7/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/9/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/8/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/3/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/10/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/2/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/1/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/4/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/5/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/6/bulk-core_comm_target.txt']



In [33]:

    
%%R -i comm_files

df.comm = list()
for (f in comm_files){
    rep = gsub('.+/Day1_rep10/([0-9]+)/.+', '\\1', f)
    df.comm[[rep]] = read.delim(f, sep='\t') %>%
        dplyr::select(library, taxon_name, rel_abund_perc) %>%
        rename('bulk_abund' = rel_abund_perc) %>%
        mutate(bulk_abund = bulk_abund / 100)
}

df.comm = do.call('rbind', df.comm)
df.comm$SIM_rep = gsub('\\.[0-9]+$', '', rownames(df.comm))
rownames(df.comm) = 1:nrow(df.comm)
df.comm %>% head(n=3)









    





  library taxon_name bulk_abund SIM_rep
1       1  OTU.14142 0.05918292       7
2       1      OTU.2 0.05856579       7
3       1  OTU.12920 0.04424833       7



In [34]:

    
%%R
## joining tables
df.SIM.j = inner_join(df.SIM, df.comm, c('SIM_rep' = 'SIM_rep',
                                         'library' = 'library',
                                         'taxon' = 'taxon_name')) %>%
    filter(BD_mid >= min_BD, 
           BD_mid <= max_BD)
    
df.SIM.j %>% head(n=3)









    





  library    fraction taxon BD_min BD_mid BD_max count  rel_abund SIM_rep
1       1 1.673-1.678 OTU.1  1.673  1.675  1.678   137 0.01364542       7
2       1 1.678-1.682 OTU.1  1.678  1.680  1.682   417 0.01615152       7
3       1 1.682-1.687 OTU.1  1.682  1.684  1.687   125 0.01049626       7
  bulk_abund
1 0.02801777
2 0.02801777
3 0.02801777



In [49]:

    
%%R 
# filtering & combining emperical w/ simulated data

## emperical 
max_BD_range = max(df.EMP.j$Buoyant_density) - min(df.EMP.j$Buoyant_density)
df.EMP.j.f = df.EMP.j %>%
    filter(abundance > 0) %>%
    group_by(OTU) %>%
    summarize(mean_rel_abund = mean(bulk_abund),
              min_BD = min(Buoyant_density),
              max_BD = max(Buoyant_density),
              BD_range = max_BD - min_BD,
              BD_range_perc = BD_range / max_BD_range * 100) %>%
    ungroup() %>%
    mutate(dataset = 'emperical',
           SIM_rep = NA)

## simulated
max_BD_range = max(df.SIM.j$BD_mid) - min(df.SIM.j$BD_mid)
df.SIM.j.f = df.SIM.j %>%
    filter(count > 0) %>%
    group_by(SIM_rep, taxon) %>%
    summarize(mean_rel_abund = mean(bulk_abund),
              min_BD = min(BD_mid),
              max_BD = max(BD_mid),
              BD_range = max_BD - min_BD,
              BD_range_perc = BD_range / max_BD_range * 100) %>%
    ungroup() %>%
    rename('OTU' = taxon) %>%
    mutate(dataset = 'simulated')

## join
df.j = rbind(df.EMP.j.f, df.SIM.j.f) %>%
    filter(BD_range_perc > 0,
            mean_rel_abund > 0)

df.j$SIM_rep = reorder(df.j$SIM_rep, df.j$SIM_rep %>% as.numeric)

df.j %>% head(n=3)









    





Source: local data frame [3 x 8]

      OTU mean_rel_abund   min_BD   max_BD   BD_range BD_range_perc   dataset
    (chr)          (dbl)    (dbl)    (dbl)      (dbl)         (dbl)     (chr)
1   OTU.1    0.028017773 1.676135 1.773391 0.09725564           100 emperical
2  OTU.10    0.001172550 1.676135 1.773391 0.09725564           100 emperical
3 OTU.100    0.001049124 1.676135 1.773391 0.09725564           100 emperical
Variables not shown: SIM_rep (fctr)



In [50]:

    
%%R -h 400
## plotting
ggplot(df.j, aes(mean_rel_abund, BD_range_perc, color=SIM_rep)) +
    geom_point(alpha=0.3) +
    scale_x_log10() +
    scale_y_continuous() +
    labs(x='Pre-fractionation abundance', y='% of total BD range') +
    facet_grid(dataset ~ .) +
    theme_bw() +
    theme(
        text = element_text(size=16),
        panel.grid = element_blank()#,
        #legend.position = 'none'
        )

BD span of just overlapping taxa

Taxa overlapping between emperical data and genomes in dataset
These taxa should have the same relative abundances in both datasets.
- The comm file was created from the emperical dataset phyloseq file.



In [54]:

    
%%R -i targetFile

df.target = read.delim(targetFile, sep='\t')
df.target %>% nrow %>% print
df.target %>% head(n=3)









    





[1] 194
  cluster
1     212
2     762
3     663
                                                                                                   ssu_ID
1          rRNA_NC_014259_Acinetobacter_oleivorans_DR1__Acinetobacter_oleivorans_DR1_3479770-3481295_DIR-
2 rRNA_NC_019973_Mesorhizobium_australicum_WSM2073__Mesorhizobium_australicum_WSM207_1746046-1747518_DIR+
3  rRNA_NC_013037_Dyadobacter_fermentans_DSM_18053__Dyadobacter_fermentans_DSM_18053_4003859-4005353_DIR-
                                                                                         genome_fileID
1      /var/seq_data/ncbi_db/genome/Jan2016/bac_complete_spec-rep1_rn/Acinetobacter_oleivorans_DR1.fna
2 /var/seq_data/ncbi_db/genome/Jan2016/bac_complete_spec-rep1_rn/Mesorhizobium_australicum_WSM2073.fna
3  /var/seq_data/ncbi_db/genome/Jan2016/bac_complete_spec-rep1_rn/Dyadobacter_fermentans_DSM_18053.fna
                           genomeID
1      Acinetobacter_oleivorans_DR1
2 Mesorhizobium_australicum_WSM2073
3  Dyadobacter_fermentans_DSM_18053
                                                                   genome_seqID
1          NC_014259_Acinetobacter_oleivorans_DR1__Acinetobacter_oleivorans_DR1
2 NC_019973_Mesorhizobium_australicum_WSM2073__Mesorhizobium_australicum_WSM207
3  NC_013037_Dyadobacter_fermentans_DSM_18053__Dyadobacter_fermentans_DSM_18053
      OTU
1 OTU.188
2 OTU.226
3 OTU.121
                                                                                                        OTU_taxonomy
1  Bacteria:Proteobacteria:Gammaproteobacteria:Pseudomonadales:Moraxellaceae:Acinetobacter:Unclassified:Unclassified
2 Bacteria:Proteobacteria:Alphaproteobacteria:Rhizobiales:Phyllobacteriaceae:Mesorhizobium:Unclassified:Unclassified
3                 Bacteria:Bacteroidetes:Cytophagia:Cytophagales:Cytophagaceae:Dyadobacter:Unclassified:Unclassified



In [55]:

    
%%R
# filtering to just target taxa
df.j.t = df.j %>% 
    filter(OTU %in% df.target$OTU) 
df.j %>% nrow %>% print
df.j.t %>% nrow %>% print

## plotting
ggplot(df.j.t, aes(mean_rel_abund, BD_range_perc, color=SIM_rep)) +
    geom_point(alpha=0.5, shape='O') +
    scale_x_log10() +
    scale_y_continuous() +
    #scale_color_manual(values=c('blue', 'red')) +
    labs(x='Pre-fractionation abundance', y='% of total BD range') +
    facet_grid(dataset ~ .) +
    theme_bw() +
    theme(
        text = element_text(size=16),
        panel.grid = element_blank()#,
        #legend.position = 'none'
        )

Check

Are all target (overlapping) taxa the same relative abundances in both datasets?



In [58]:

    
%%R -w 600 -h 500
# formatting data
df.1 = df.j.t %>% 
    filter(dataset == 'simulated') %>%
    select(SIM_rep, OTU, mean_rel_abund, BD_range, BD_range_perc)

df.2 = df.j.t %>%
    filter(dataset == 'emperical') %>%
    select(SIM_rep, OTU, mean_rel_abund, BD_range, BD_range_perc)

df.12 = inner_join(df.1, df.2, c('OTU' = 'OTU')) %>%
    mutate(BD_diff_perc = BD_range_perc.y - BD_range_perc.x)


df.12$SIM_rep.x = reorder(df.12$SIM_rep.x, df.12$SIM_rep.x  %>% as.numeric)

## plotting
p1 = ggplot(df.12, aes(mean_rel_abund.x, mean_rel_abund.y)) +
    geom_point(alpha=0.5) +
    scale_x_log10() +
    scale_y_log10() +
    labs(x='Relative abundance (simulated)', y='Relative abundance (emperical)') +
    facet_wrap(~ SIM_rep.x)
    theme_bw() +
    theme(
        text = element_text(size=16),
        panel.grid = element_blank(),
        legend.position = 'none'
        )
p1

Correlation between relative abundance and BD_range diff

Are low abundant taxa more variable in their BD span



In [66]:

    
%%R -w 800 -h 500

ggplot(df.12, aes(mean_rel_abund.x, BD_diff_perc)) +
    geom_point(alpha=0.5) +
    scale_x_log10() +
    labs(x='Pre-fractionation relative abundance', 
         y='Difference in % of gradient spanned\n(emperical - simulated)',
         title='Overlapping taxa') +
    facet_wrap(~ SIM_rep.x) +
    theme_bw() +
    theme(
        text = element_text(size=16),
        panel.grid = element_blank(),
        legend.position = 'none'
        )

Notes

between Day1_rep10, Day1_richFromTarget_rep10, and Day1_add_Rich_rep10:
- Day1_rep10 has the most accurate representation of BD span (% of gradient spanned by taxa).
  - Accuracy drops at ~1e-3 to ~5e-4, but this is caused by detection limits (veil-line effect).

Comparing abundance distributions of overlapping taxa



In [338]:

    
%%R

join_abund_dists = function(df.EMP.j, df.SIM.j, df.target){
    ## emperical 
    df.EMP.j.f = df.EMP.j %>%
        filter(abundance > 0) %>%
        dplyr::select(OTU, sample, abundance, Buoyant_density, bulk_abund) %>%
        mutate(dataset = 'emperical', SIM_rep = NA) %>%
        filter(OTU %in% df.target$OTU) 
    
    ## simulated
    df.SIM.j.f = df.SIM.j %>%
        filter(count > 0) %>%
        dplyr::select(taxon, fraction, count, BD_mid, bulk_abund, SIM_rep) %>%
        rename('OTU' = taxon,
               'sample' = fraction,
               'Buoyant_density' = BD_mid,
               'abundance' = count) %>%
        mutate(dataset = 'simulated') %>%
        filter(OTU %in% df.target$OTU) 
    
    ## getting just intersecting OTUs
    OTUs.int = intersect(df.EMP.j.f$OTU, df.SIM.j.f$OTU)
    
    df.j = rbind(df.EMP.j.f, df.SIM.j.f) %>%
        filter(OTU %in% OTUs.int) %>%
        group_by(sample) %>%
        mutate(rel_abund = abundance / sum(abundance))
    
    cat('Number of overlapping OTUs between emperical & simulated:', 
            df.j$OTU %>% unique %>% length, '\n\n')
    return(df.j)
    }


df.j = join_abund_dists(df.EMP.j, df.SIM.j, df.target)
df.j %>% head(n=3) %>% as.data.frame









    





Number of overlapping OTUs between emperical & simulated: 92 

      OTU            sample abundance Buoyant_density   bulk_abund   dataset
1 OTU.170 12C-Con.D1.R2_F23         2         1.69362 0.0005554184 emperical
2  OTU.24 12C-Con.D1.R2_F23        25         1.69362 0.0020982473 emperical
3  OTU.11 12C-Con.D1.R2_F23         4         1.69362 0.0004319921 emperical
  SIM_rep    rel_abund
1    <NA> 0.0006006006
2    <NA> 0.0075075075
3    <NA> 0.0012012012



In [339]:

    
%%R
# closure operation
df.j = df.j %>%
    ungroup() %>%
    mutate(SIM_rep = SIM_rep %>% as.numeric) %>%
    group_by(dataset, SIM_rep, sample) %>%
    mutate(rel_abund_c = rel_abund / sum(rel_abund)) %>%
    ungroup()

df.j %>% head(n=3) %>% as.data.frame









    





      OTU            sample abundance Buoyant_density   bulk_abund   dataset
1 OTU.170 12C-Con.D1.R2_F23         2         1.69362 0.0005554184 emperical
2  OTU.24 12C-Con.D1.R2_F23        25         1.69362 0.0020982473 emperical
3  OTU.11 12C-Con.D1.R2_F23         4         1.69362 0.0004319921 emperical
  SIM_rep    rel_abund  rel_abund_c
1      NA 0.0006006006 0.0006006006
2      NA 0.0075075075 0.0075075075
3      NA 0.0012012012 0.0012012012



In [340]:

    
%%R -h 1500 -w 800
# plotting 
plot_abunds = function(df){
    p = ggplot(df, aes(Buoyant_density, rel_abund_c, fill=OTU)) +
        geom_area(stat='identity', position='dodge', alpha=0.5) +
        labs(x='Buoyant density', 
             y='Subsampled community\n(relative abundance for subset taxa)') +
        theme_bw() +
        theme( 
            text = element_text(size=16),
            legend.position = 'none',
              axis.title.y = element_text(vjust=1),        
              axis.title.x = element_blank(),
              plot.margin=unit(c(0.1,1,0.1,1), "cm")
             )
    return(p)
    }


# simulations
df.j.f = df.j %>%
    filter(dataset == 'simulated')
p.SIM = plot_abunds(df.j.f)
p.SIM = p.SIM + facet_grid(SIM_rep ~ .)

# emperical
df.j.f = df.j %>%
    filter(dataset == 'emperical')
p.EMP = plot_abunds(df.j.f)

# make figure
grid.arrange(p.EMP, p.SIM, ncol=1, heights=c(1,5))

Check: plotting closure abs-abunds of overlapping taxa

The overlapping taxa should have the same closure-transformed relative abundances for both:
- absolute abundances (OTU table)
- relative abundances (subsampled OTU table; as above)

Loading OTU table (abs abunds)



In [341]:

    
OTU_files = !find $buildDir -name "OTU_abs1e9.txt"
OTU_files









    Out[341]:





['/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/7/OTU_abs1e9.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/9/OTU_abs1e9.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/8/OTU_abs1e9.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/3/OTU_abs1e9.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/10/OTU_abs1e9.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/2/OTU_abs1e9.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/1/OTU_abs1e9.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/4/OTU_abs1e9.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/5/OTU_abs1e9.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/6/OTU_abs1e9.txt']



In [342]:

    
%%R -i OTU_files
# loading files

df.SIM.abs = list()
for (x in OTU_files){
    SIM_rep = gsub('/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/', '', x)
    SIM_rep = gsub('/OTU_abs1e9.txt', '', SIM_rep)
    df.SIM.abs[[SIM_rep]] = read.delim(x, sep='\t') 
    }
df.SIM.abs = do.call('rbind', df.SIM.abs)
df.SIM.abs$SIM_rep = gsub('\\.[0-9]+$', '', rownames(df.SIM.abs))
rownames(df.SIM.abs) = 1:nrow(df.SIM.abs)
df.SIM.abs %>% head









    





  library taxon    fraction BD_min BD_mid BD_max count  rel_abund SIM_rep
1       1 OTU.1  -inf-1.660   -Inf  1.659  1.659  8384 0.02576038       7
2       1 OTU.1 1.660-1.663  1.660  1.661  1.663  2207 0.02087215       7
3       1 OTU.1 1.663-1.666  1.663  1.664  1.666  2130 0.02214114       7
4       1 OTU.1 1.666-1.668  1.666  1.667  1.668  2272 0.03469762       7
5       1 OTU.1 1.668-1.673  1.668  1.671  1.673  5888 0.03673250       7
6       1 OTU.1 1.673-1.678  1.673  1.675  1.678  3972 0.02432825       7



In [343]:

    
%%R
# subset just overlapping taxa
# & closure operation
df.SIM.abs.t = df.SIM.abs %>%
    filter(taxon %in% df.target$OTU) %>%
    group_by(SIM_rep, fraction) %>%
    mutate(rel_abund_c = count / sum(count)) %>%
    rename('Buoyant_density' = BD_mid,
           'OTU' = taxon)

df.SIM.abs.t %>% head(n=3) %>% as.data.frame









    





  library   OTU    fraction BD_min Buoyant_density BD_max count  rel_abund
1       1 OTU.1  -inf-1.660   -Inf           1.659  1.659  8384 0.02576038
2       1 OTU.1 1.660-1.663  1.660           1.661  1.663  2207 0.02087215
3       1 OTU.1 1.663-1.666  1.663           1.664  1.666  2130 0.02214114
  SIM_rep rel_abund_c
1       7  0.11692840
2       7  0.10000000
3       7  0.09375825



In [344]:

    
%%R -w 800 -h 1200
# plotting
p.abs = plot_abunds(df.SIM.abs.t) 
p.abs + facet_grid(SIM_rep ~ .)

Notes

The abundance distributions of the overlapping OTUs look pretty similar between 'absolute' and 'relative' (post-PCR & post-sequencing simulation).
- The difference between absolute and relative are probably due to the PCR simulation

Calculating center of mass for overlapping taxa

weighted mean BD, where weights are relative abundances



In [351]:

    
%%R

center_mass = function(df){
    df = df %>%
        group_by(dataset, SIM_rep, OTU) %>%
        summarize(center_mass = weighted.mean(Buoyant_density, rel_abund_c, na.rm=T)) %>%
        ungroup()
    return(df)
}

df.j.cm = center_mass(df.j)



In [353]:

    
%%R
# getting mean cm for all SIM_reps
df.j.cm.s = df.j.cm %>%
    group_by(dataset, OTU) %>%
    summarize(mean_cm = mean(center_mass, na.rm=T),
              stdev_cm = sd(center_mass)) %>%
    ungroup() %>%
    spread(dataset, mean_cm) %>%
    group_by(OTU) %>%
    summarize(stdev_cm = mean(stdev_cm, na.rm=T),
              emperical = mean(emperical, na.rm=T),
              simulated = mean(simulated, na.rm=T)) %>%
    ungroup()

# check
cat('Number of OTUs:', df.j.cm.s$OTU %>% unique %>% length, '\n')

# plotting
ggplot(df.j.cm.s, aes(emperical, simulated,
                    ymin = simulated - stdev_cm,
                    ymax = simulated + stdev_cm)) +
    geom_pointrange() +
    stat_function(fun = function(x) x, linetype='dashed', alpha=0.5, color='red') +
    scale_x_continuous(limits=c(1.69, 1.74)) +
    scale_y_continuous(limits=c(1.7, 1.75)) +
    labs(title='Center of mass') +
    theme_bw() +
    theme(
        text = element_text(size=16)
    )









    





Number of OTUs: 92

Notes

Error bars are stdev of simulation reps
The center of mass for most OTUs is shifted heavier in the simulations vs the emperical data
Also, most have a mean simulated BD of ~1.73
- This is approximately the middle of the BD range (1.725)
- It suggests that many of the taxa are equally dispersed across the gradient



In [383]:

    
%%R
BD_MIN = df.j$Buoyant_density %>% min 
BD_MAX = df.j$Buoyant_density %>% max
BD_AVE = mean(c(BD_MIN, BD_MAX))
print(c(BD_MIN, BD_AVE, BD_MAX))









    





[1] 1.6750 1.7245 1.7740

R^2 for each SIM_rep



In [354]:

    
%%R
# formatting table
df.j.cm.j = inner_join(df.j.cm %>% 
                           filter(dataset == 'simulated') %>%
                           rename('cm_SIM' = center_mass),
                       df.j.cm %>% 
                           filter(dataset == 'emperical') %>%
                           rename('cm_EMP' = center_mass),
                       c('OTU' = 'OTU')) %>%
    select(-starts_with('dataset'))

df.j.cm.j %>% head









    





Source: local data frame [6 x 5]

  SIM_rep.x      OTU   cm_SIM SIM_rep.y   cm_EMP
      (dbl)    (chr)    (dbl)     (dbl)    (dbl)
1         1    OTU.1 1.721445        NA 1.730945
2         1 OTU.1000 1.728778        NA 1.729528
3         1 OTU.1022 1.716803        NA 1.720226
4         1  OTU.106 1.725988        NA 1.712178
5         1   OTU.11 1.726025        NA 1.730503
6         1  OTU.113 1.724192        NA 1.715515



In [355]:

    
%%R -w 300 -h 400
# lm()
df.j.cm.j.lm = df.j.cm.j %>%
    group_by(SIM_rep.x) %>%
    do(fit = lm(cm_EMP ~ cm_SIM, data = .)) %>%
    mutate(R2 =  summary(fit)$coeff[2],
           data = 'simulated')

#df.j.cm.j.lm %>% head

# plotting
ggplot(df.j.cm.j.lm, aes(data, R2)) +
    geom_boxplot() +
    geom_jitter(height=0, width=0.2, color='red') +
    labs(y='R^2', title='simulated ~ emperical') +
    theme_bw() +
    theme(
        text = element_text(size=16)
    )

Notes:

Pretty low R^2, with 1 simulation rep at almost 0

Plotting abundances of some outlier taxa

why is simulated so different in center of mass vs emperical?



In [384]:

    
%%R -h 1100 -w 800

# cutoff on which OTU are major outliers (varying between simulated and emperical)
BD.diff.cut = 0.015

# which OTU to plot?
df.j.cm.s.f = df.j.cm.s %>% 
    mutate(cm_diff = abs(emperical - simulated)) %>%
    filter(cm_diff > BD.diff.cut, ! is.na(simulated)) 

print('OTUs:')
print(df.j.cm.s.f$OTU)

# filtering to just target taxon
## Simulated
df.j.f = df.j %>%
    filter(dataset == 'simulated', 
           OTU %in% df.j.cm.s.f$OTU) 

p.SIM = plot_abunds(df.j.f)
p.SIM = p.SIM + facet_grid(SIM_rep ~ .)

## Emperical
df.j.f = df.j %>%
    filter(dataset == 'emperical', 
           OTU %in% df.j.cm.s.f$OTU)
p.EMP = plot_abunds(df.j.f)

# make figure
grid.arrange(p.EMP, p.SIM, ncol=1, heights=c(1,5))









    





[1] "OTUs:"
 [1] "OTU.106"  "OTU.113"  "OTU.119"  "OTU.121"  "OTU.146"  "OTU.149" 
 [7] "OTU.15"   "OTU.18"   "OTU.224"  "OTU.226"  "OTU.264"  "OTU.2746"
[13] "OTU.2978" "OTU.31"   "OTU.32"   "OTU.3380" "OTU.3382" "OTU.34"  
[19] "OTU.3492" "OTU.36"   "OTU.382"  "OTU.394"  "OTU.403"  "OTU.4057"
[25] "OTU.44"   "OTU.462"  "OTU.649"  "OTU.717"  "OTU.75"   "OTU.8"   
[31] "OTU.84"   "OTU.87"   "OTU.9"

Notes

The center of mass seems to be correct: most taxa are shifted heavier relative to the emperical data

Check: which taxon is the highly abundant one?

why is it shifted so far to 'heavy'
the abundance distribution mode is ~1.73, which is a G+C of ~0.71

Abundant taxon: OTU.32

rep genome: Pseudonocardia_dioxanivorans_CB1190.fna
- genome GC = 73.31
- do the G+C contents of this amplicon region vary among taxa in the genus?
- No, genome G+C for all 7 genomes in ncbi are ~70

Check: plotting 'absolute' abudance distributions for major CM outliers



In [367]:

    
%%R
# subset outliers
df.SIM.abs.t = df.SIM.abs %>%
    filter(taxon %in% df.target$OTU) %>%
    group_by(SIM_rep, fraction) %>%
    mutate(rel_abund_c = count / sum(count)) %>%
    rename('Buoyant_density' = BD_mid,
           'OTU' = taxon) %>%
    filter(OTU %in% df.j.cm.s.f$OTU) 

df.SIM.abs.t %>% head(n=3) %>% as.data.frame









    





  library     OTU    fraction BD_min Buoyant_density BD_max count    rel_abund
1       1 OTU.106  -inf-1.660   -Inf           1.659  1.659    20 0.0000614513
2       1 OTU.106 1.660-1.663  1.660           1.661  1.663    62 0.0005863494
3       1 OTU.106 1.663-1.666  1.663           1.664  1.666    35 0.0003638216
  SIM_rep  rel_abund_c
1       7 0.0002789322
2       7 0.0028092433
3       7 0.0015406286



In [368]:

    
%%R -w 800 -h 1200
# plotting
p.abs = plot_abunds(df.SIM.abs.t) 
p.abs + facet_grid(SIM_rep ~ .)

Notes:

simulated absolute abundances seem similar to the simulated relative abundances: both shifted heavy vs emperical

What genomes are these outliers?



In [373]:

    
%%R -w 800
genomes = df.target %>% 
    filter(OTU %in% df.SIM.abs.t$OTU) 

df.genInfo = read.delim('/var/seq_data/ncbi_db/genome/Jan2016/bac_complete_spec-rep1_rn/genome_info.txt')
df.genInfo.f = df.genInfo %>% 
    filter(seq_name %in% genomes$genome_seqID) %>%
    mutate(genome_ID = gsub('\\.fna', '', file_name))

df.genInfo.f$genome_ID = reorder(df.genInfo.f$genome_ID, df.genInfo.f$total_GC)

# plotting
ggplot(df.genInfo.f, aes(genome_ID, total_GC)) +
    geom_point() +
    theme_bw() +
    theme(
        text = element_text(size=16),
        axis.text.x = element_text(angle=50, hjust=1)
    )

Notes

Odd... these outlier OTUs are simulated from genomes with high G+C.
- I would have expected the OTUs to be shifted right in the emperical data.
- Are the simulations over-shifting the distributions of heavy G+C taxa?

Plotting rep genome G+C of all overlapping taxa



In [370]:

    
%%R -w 800

df.genInfo = read.delim('/var/seq_data/ncbi_db/genome/Jan2016/bac_complete_spec-rep1_rn/genome_info.txt')
df.genInfo.f = df.genInfo %>% 
    filter(seq_name %in% df.target$genome_seqID) %>%
    mutate(genome_ID = gsub('\\.fna', '', file_name))

df.genInfo.f$genome_ID = reorder(df.genInfo.f$genome_ID, df.genInfo.f$total_GC)

# plotting
ggplot(df.genInfo.f, aes(genome_ID, total_GC)) +
    geom_point() +
    theme_bw() +
    theme(
        text = element_text(size=16),
        axis.text.x = element_blank()
    )

-- OLD --

Plotting number of taxa in each fraction

Emperical data (fullCyc)



In [18]:

    
%%R 
# simulated OTU table file
OTU.table.dir = '/home/nick/notebook/SIPSim/dev/fullCyc/frag_norm_9_2.5_n5/Day1_default_run/1e9/'
OTU.table.file = 'OTU_abs1e9_PCR_sub.txt'
#OTU.table.file = 'OTU_abs1e9_sub.txt'
#OTU.table.file = 'OTU_abs1e9.txt'



In [19]:

    
%%R -i physeqDir -i physeq_SIP_core -i bulk_days

# bulk core samples
F = file.path(physeqDir, physeq_SIP_core)
physeq.SIP.core = readRDS(F) 
physeq.SIP.core.m = physeq.SIP.core %>% sample_data

physeq.SIP.core = prune_samples(physeq.SIP.core.m$Substrate == '12C-Con' & 
                                physeq.SIP.core.m$Day %in% bulk_days, 
                                physeq.SIP.core) %>%
    filter_taxa(function(x) sum(x) > 0, TRUE)
physeq.SIP.core.m = physeq.SIP.core %>% sample_data        

physeq.SIP.core









    





phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 7025 taxa and 25 samples ]
sample_data() Sample Data:       [ 25 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 7025 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 7025 tips and 7024 internal nodes ]



In [20]:

    
%%R -w 800 -h 300 

## dataframe
df.EMP = physeq.SIP.core %>% otu_table %>%
    as.matrix %>% as.data.frame
df.EMP$OTU = rownames(df.EMP)
df.EMP = df.EMP %>%    
    gather(sample, abundance, 1:(ncol(df.EMP)-1)) 

df.EMP = inner_join(df.EMP, physeq.SIP.core.m, c('sample' = 'X.Sample')) 

df.EMP.nt = df.EMP %>%
    group_by(sample) %>%
    mutate(n_taxa = sum(abundance > 0)) %>%
    ungroup() %>%
    distinct(sample) %>%
    filter(Buoyant_density >= min_BD, 
           Buoyant_density <= max_BD)
    

## plotting
p = ggplot(df.EMP.nt, aes(Buoyant_density, n_taxa)) +
    geom_point(color='blue') +
    geom_line(color='blue') +
    #geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
    labs(x='Buoyant density', y='Number of taxa') +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        legend.position = 'none'
    )
p

w/ simulated data



In [21]:

    
OTU_files = !find $buildDir -name "OTU_abs1e9_PCR_sub.txt"
OTU_files









    Out[21]:





['/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/7/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/9/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/8/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/3/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/10/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/2/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/1/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/4/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/5/OTU_abs1e9_PCR_sub.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/6/OTU_abs1e9_PCR_sub.txt']



In [38]:

    
%%R -i OTU_files
# loading files

df.SIM = list()
for (x in OTU_files){
    SIM_rep = gsub('/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/', '', x)
    SIM_rep = gsub('/OTU_abs1e9_PCR_sub.txt', '', SIM_rep)
    df.SIM[[SIM_rep]] = read.delim(x, sep='\t') 
    }
df.SIM = do.call('rbind', df.SIM)
df.SIM$SIM_rep = gsub('\\.[0-9]+$', '', rownames(df.SIM))
rownames(df.SIM) = 1:nrow(df.SIM)
df.SIM %>% head









    





  library    fraction taxon BD_min BD_mid BD_max count  rel_abund SIM_rep
1       1  -inf-1.660 OTU.1   -Inf  1.659  1.659   204 0.01404378       7
2       1 1.660-1.661 OTU.1  1.660  1.660  1.661   376 0.01892110       7
3       1 1.661-1.665 OTU.1  1.661  1.663  1.665   169 0.01259690       7
4       1 1.665-1.669 OTU.1  1.665  1.667  1.669   150 0.01384019       7
5       1 1.669-1.672 OTU.1  1.669  1.671  1.672   323 0.01426615       7
6       1 1.672-1.675 OTU.1  1.672  1.673  1.675   249 0.01785458       7



In [40]:

    
%%R
## edit table
df.SIM.nt = df.SIM %>%
    filter(count > 0) %>%
    group_by(SIM_rep, library, BD_mid) %>%
    summarize(n_taxa = n()) %>%
    filter(BD_mid >= min_BD, 
           BD_mid <= max_BD)
df.SIM.nt %>% head









    





Source: local data frame [6 x 4]
Groups: SIM_rep, library [1]

  SIM_rep library BD_mid n_taxa
    (chr)   (int)  (dbl)  (int)
1       1       1  1.675   2128
2       1       1  1.679   1849
3       1       1  1.683   1770
4       1       1  1.688   1032
5       1       1  1.692    776
6       1       1  1.696    835



In [53]:

    
%%R -w 800 -h 300
# plotting
p = ggplot(df.SIM.nt, aes(BD_mid, n_taxa, group=SIM_rep)) +
    geom_point(color='red') +
    geom_line(color='red', alpha=0.5) +
    #geom_smooth(color='red') +
    geom_point(data=df.EMP.nt, aes(x=Buoyant_density), color='blue') +
    geom_line(data=df.EMP.nt, aes(x=Buoyant_density), color='blue') +
    #geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
    labs(x='Buoyant density', y='Number of taxa') +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        legend.position = 'none'
    )
p

Normalized by max sample abundance



In [54]:

    
%%R -w 800 -h 300
# normalized by max number of taxa

## edit table
df.SIM.nt = df.SIM.nt %>%
    group_by(SIM_rep) %>%
    mutate(n_taxa_norm = n_taxa / max(n_taxa))

df.EMP.nt = df.EMP.nt %>%
    group_by() %>%
    mutate(n_taxa_norm = n_taxa / max(n_taxa))


## plot
p = ggplot(df.SIM.nt, aes(BD_mid, n_taxa_norm, group=SIM_rep)) +
    geom_point(color='red') +
    geom_line(color='red') +
    geom_point(data=df.EMP.nt, aes(x=Buoyant_density), color='blue') +
    geom_line(data=df.EMP.nt, aes(x=Buoyant_density), color='blue') +
    #geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
    scale_y_continuous(limits=c(0, 1)) +
    labs(x='Buoyant density', y='Number of taxa\n(fraction of max)') +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        legend.position = 'none'
    )
p

Total sequence count



In [58]:

    
%%R -w 800 -h 300

# simulated
df.SIM.s = df.SIM %>%
    group_by(SIM_rep, library, BD_mid) %>%
    summarize(total_abund = sum(count)) %>%
    rename('Day' = library, 'Buoyant_density' = BD_mid) %>%
    ungroup() %>%
    mutate(dataset='simulated')
# emperical
df.EMP.s = df.EMP %>% 
    group_by(Day, Buoyant_density) %>%
    summarize(total_abund = sum(abundance))  %>%
    ungroup() %>%
    mutate(dataset='emperical', SIM_rep = NA)

# join
df.j = rbind(df.SIM.s, df.EMP.s) %>%
    filter(Buoyant_density >= min_BD, 
           Buoyant_density <= max_BD)
    
df.SIM.s = df.EMP.s = ""

# plot
ggplot(df.j, aes(Buoyant_density, total_abund, color=dataset, group=SIM_rep)) +
    geom_point() +
    geom_line(alpha=0.3) +
    geom_line(data=df.j %>% filter(dataset=='emperical')) +
    scale_color_manual(values=c('blue', 'red')) +
    labs(x='Buoyant density', y='Total sequences per sample') +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        legend.position = 'none'
    )

Plotting Shannon diversity for each



In [59]:

    
%%R
shannon_index_long = function(df, abundance_col, ...){
    # calculating shannon diversity index from a 'long' formated table
    ## community_col = name of column defining communities
    ## abundance_col = name of column defining taxon abundances
    df = df %>% as.data.frame
    cmd = paste0(abundance_col, '/sum(', abundance_col, ')')
    df.s = df %>%
        group_by_(...) %>%
        mutate_(REL_abundance = cmd) %>%
        mutate(pi__ln_pi = REL_abundance * log(REL_abundance),
               shannon = -sum(pi__ln_pi, na.rm=TRUE)) %>%
        ungroup() %>% 
        dplyr::select(-REL_abundance, -pi__ln_pi) %>%
        distinct_(...) 
    return(df.s)
}



In [60]:

    
%%R
# calculating shannon
df.SIM.shan = shannon_index_long(df.SIM, 'count', 'SIM_rep', 'library', 'fraction') %>%
    filter(BD_mid >= min_BD, 
           BD_mid <= max_BD)
    
df.EMP.shan = shannon_index_long(df.EMP, 'abundance', 'sample') %>%
    filter(Buoyant_density >= min_BD, 
           Buoyant_density <= max_BD)



In [62]:

    
%%R -w 800 -h 300
# plotting
p = ggplot(df.SIM.shan, aes(BD_mid, shannon, group=SIM_rep)) +
    geom_point(color='red') +
    geom_line(color='red', alpha=0.3) +
    geom_point(data=df.EMP.shan, aes(x=Buoyant_density), color='blue') +
    geom_line(data=df.EMP.shan, aes(x=Buoyant_density), color='blue') +
    scale_y_continuous(limits=c(4, 7.5)) +
    labs(x='Buoyant density', y='Shannon index') +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        legend.position = 'none'
    )
p

min/max abundances of taxa



In [65]:

    
%%R -h 300 -w 800

# simulated
df.SIM.s = df.SIM %>% 
    filter(rel_abund > 0) %>%
    group_by(SIM_rep, BD_mid) %>%
    summarize(min_abund = min(rel_abund),
              max_abund = max(rel_abund)) %>%
    ungroup() %>%
    rename('Buoyant_density' = BD_mid) %>%
    mutate(dataset = 'simulated')

# emperical
df.EMP.s = df.EMP %>%
    group_by(Buoyant_density) %>%
    mutate(rel_abund = abundance / sum(abundance)) %>%
    filter(rel_abund > 0) %>%
    summarize(min_abund = min(rel_abund),
              max_abund = max(rel_abund)) %>%
    ungroup() %>%
    mutate(dataset = 'emperical',
           SIM_rep = NA)

df.j = rbind(df.SIM.s, df.EMP.s) %>%
    filter(Buoyant_density >= min_BD, 
           Buoyant_density <= max_BD)
    

# plotting
ggplot(df.j, aes(Buoyant_density, max_abund, color=dataset, group=SIM_rep)) +
    geom_point() +
    geom_line(alpha=0.3) +
    geom_line(data=df.j %>% filter(dataset=='emperical')) +
    scale_color_manual(values=c('blue', 'red')) +
    labs(x='Buoyant density', y='Maximum relative abundance') +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        legend.position = 'none'
    )

BD range where an OTU is detected

Do the simulated OTU BD distributions span the same BD range of the emperical data?

Simulated



In [67]:

    
# loading comm files
comm_files = !find $buildDir -name "bulk-core_comm_target.txt"
comm_files









    Out[67]:





['/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/7/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/9/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/8/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/3/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/10/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/2/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/1/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/4/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/5/bulk-core_comm_target.txt',
 '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_rep10/6/bulk-core_comm_target.txt']



In [74]:

    
%%R -i comm_files

df.comm = list()
for (f in comm_files){
    rep = gsub('.+/Day1_rep10/([0-9]+)/.+', '\\1', f)
    df.comm[[rep]] = read.delim(f, sep='\t') %>%
        dplyr::select(library, taxon_name, rel_abund_perc) %>%
        rename('bulk_abund' = rel_abund_perc) %>%
        mutate(bulk_abund = bulk_abund / 100)
}

df.comm = do.call('rbind', df.comm)
df.comm$SIM_rep = gsub('\\.[0-9]+$', '', rownames(df.comm))
rownames(df.comm) = 1:nrow(df.comm)
df.comm %>% head(n=3)









    





  library taxon_name bulk_abund SIM_rep
1       1  OTU.14142 0.05918292       7
2       1      OTU.2 0.05856579       7
3       1  OTU.12920 0.04424833       7



In [75]:

    
%%R

## joining
df.SIM.j = inner_join(df.SIM, df.comm, c('SIM_rep' = 'SIM_rep',
                                         'library' = 'library',
                                         'taxon' = 'taxon_name')) %>%
    filter(BD_mid >= min_BD, 
           BD_mid <= max_BD)
    
df.SIM.j %>% head(n=3)









    





  library    fraction taxon BD_min BD_mid BD_max count   rel_abund SIM_rep
1       1 1.675-1.681 OTU.1  1.675  1.678  1.681   295 0.016215028       7
2       1 1.681-1.685 OTU.1  1.681  1.683  1.685   291 0.013029462       7
3       1 1.685-1.688 OTU.1  1.685  1.687  1.688    94 0.003159664       7
  bulk_abund
1 0.02801777
2 0.02801777
3 0.02801777

Emperical



In [107]:

    
%%R
bulk_days = c(1)



In [108]:

    
%%R
physeq.dir = '/var/seq_data/fullCyc/MiSeq_16SrRNA/515f-806r/lib1-7/phyloseq/'
physeq.bulk = 'bulk-core'
physeq.file = file.path(physeq.dir, physeq.bulk)
physeq.bulk = readRDS(physeq.file)
physeq.bulk.m = physeq.bulk %>% sample_data
physeq.bulk = prune_samples(physeq.bulk.m$Exp_type == 'microcosm_bulk' &
                            physeq.bulk.m$Day %in% bulk_days, physeq.bulk)

physeq.bulk.m = physeq.bulk %>% sample_data
physeq.bulk









    





phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 4950 taxa and 1 samples ]
sample_data() Sample Data:       [ 1 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 4950 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 4950 tips and 4949 internal nodes ]

Emperical



In [76]:

    
%%R
bulk_days = c(1)



In [77]:

    
%%R
physeq.dir = '/var/seq_data/fullCyc/MiSeq_16SrRNA/515f-806r/lib1-7/phyloseq/'
physeq.bulk = 'bulk-core'
physeq.file = file.path(physeq.dir, physeq.bulk)
physeq.bulk = readRDS(physeq.file)
physeq.bulk.m = physeq.bulk %>% sample_data
physeq.bulk = prune_samples(physeq.bulk.m$Exp_type == 'microcosm_bulk' &
                            physeq.bulk.m$Day %in% bulk_days, physeq.bulk)

physeq.bulk.m = physeq.bulk %>% sample_data
physeq.bulk









    





phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 4950 taxa and 1 samples ]
sample_data() Sample Data:       [ 1 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 4950 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 4950 tips and 4949 internal nodes ]



In [78]:

    
%%R
physeq.bulk.n = transform_sample_counts(physeq.bulk, function(x) x/sum(x))
physeq.bulk.n









    





phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 4950 taxa and 1 samples ]
sample_data() Sample Data:       [ 1 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 4950 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 4950 tips and 4949 internal nodes ]



In [79]:

    
%%R
# making long format of each bulk table
bulk.otu = physeq.bulk.n %>% otu_table %>% as.data.frame
ncol = ncol(bulk.otu)
bulk.otu$OTU = rownames(bulk.otu)
bulk.otu = bulk.otu %>%
    gather(sample, abundance, 1:ncol) 

bulk.otu = inner_join(physeq.bulk.m, bulk.otu, c('X.Sample' = 'sample')) %>%
    dplyr::select(OTU, abundance) %>%
    rename('bulk_abund' = abundance)
bulk.otu %>% head(n=3)









    





       OTU   bulk_abund
1 OTU.1101 1.234263e-04
2 OTU.1130 6.171316e-05
3 OTU.9833 0.000000e+00



In [81]:

    
%%R
# joining tables
df.EMP.j = inner_join(df.EMP, bulk.otu, c('OTU' = 'OTU')) %>%
    filter(Buoyant_density >= min_BD, 
           Buoyant_density <= max_BD) 
    
df.EMP.j %>% head(n=3)









    





       OTU            sample abundance primer_number fwd_barcode rev_barcode
1 OTU.1101 12C-Con.D1.R2_F23         2           134    TCGACGAG    TGAGTACG
2 OTU.1130 12C-Con.D1.R2_F23         0           134    TCGACGAG    TGAGTACG
3 OTU.9833 12C-Con.D1.R2_F23         0           134    TCGACGAG    TGAGTACG
  Substrate Day Microcosm_replicate Fraction Buoyant_density Sample_type
1   12C-Con   1                   2       23         1.69362     unknown
2   12C-Con   1                   2       23         1.69362     unknown
3   12C-Con   1                   2       23         1.69362     unknown
         library Exp_type Sample_location Sample_date Sample_treatment
1 150721_V4_Lib4      SIP              NA          NA               NA
2 150721_V4_Lib4      SIP              NA          NA               NA
3 150721_V4_Lib4      SIP              NA          NA               NA
  Sample_subtreatment core_dataset   bulk_abund
1                  NA         TRUE 1.234263e-04
2                  NA         TRUE 6.171316e-05
3                  NA         TRUE 0.000000e+00



In [102]:

    
%%R 
# filtering & combining emperical w/ simulated data

## emperical 
max_BD_range = max(df.EMP.j$Buoyant_density) - min(df.EMP.j$Buoyant_density)
df.EMP.j.f = df.EMP.j %>%
    filter(abundance > 0) %>%
    group_by(OTU) %>%
    summarize(mean_rel_abund = mean(bulk_abund),
              min_BD = min(Buoyant_density),
              max_BD = max(Buoyant_density),
              BD_range = max_BD - min_BD,
              BD_range_perc = BD_range / max_BD_range * 100) %>%
    ungroup() %>%
    mutate(dataset = 'emperical',
           SIM_rep = NA)

## simulated
max_BD_range = max(df.SIM.j$BD_mid) - min(df.SIM.j$BD_mid)
df.SIM.j.f = df.SIM.j %>%
    filter(count > 0) %>%
    group_by(SIM_rep, taxon) %>%
    summarize(mean_rel_abund = mean(bulk_abund),
              min_BD = min(BD_mid),
              max_BD = max(BD_mid),
              BD_range = max_BD - min_BD,
              BD_range_perc = BD_range / max_BD_range * 100) %>%
    ungroup() %>%
    rename('OTU' = taxon) %>%
    mutate(dataset = 'simulated')

## join
df.j = rbind(df.EMP.j.f, df.SIM.j.f) %>%
    filter(BD_range_perc > 0,
            mean_rel_abund > 0)

df.j %>% head(n=3)









    





Source: local data frame [3 x 8]

      OTU mean_rel_abund   min_BD   max_BD   BD_range BD_range_perc   dataset
    (chr)          (dbl)    (dbl)    (dbl)      (dbl)         (dbl)     (chr)
1   OTU.1    0.028017773 1.676135 1.773391 0.09725564           100 emperical
2  OTU.10    0.001172550 1.676135 1.773391 0.09725564           100 emperical
3 OTU.100    0.001049124 1.676135 1.773391 0.09725564           100 emperical
Variables not shown: SIM_rep (chr)



In [103]:

    
%%R -h 400
## plotting
ggplot(df.j, aes(mean_rel_abund, BD_range_perc, color=SIM_rep)) +
    geom_point(alpha=0.5, shape='O') +
    scale_x_log10() +
    scale_y_continuous() +
    labs(x='Pre-fractionation abundance', y='% of total BD range') +
    facet_grid(dataset ~ .) +
    theme_bw() +
    theme(
        text = element_text(size=16),
        panel.grid = element_blank()#,
        #legend.position = 'none'
        )



In [106]:

    
%%R -h 400
## plotting
ggplot(df.j, aes(mean_rel_abund, BD_range_perc, color=dataset)) +
    #geom_point(alpha=0.5, shape='O') +
    stat_density2d() +
    scale_x_log10() +
    scale_y_continuous() +
    scale_color_manual(values=c('blue', 'red')) +
    labs(x='Pre-fractionation abundance', y='% of total BD range') +
    #geom_vline(xintercept=0.001, linetype='dashed', alpha=0.5) +
    facet_grid(dataset ~ .) +
    theme_bw() +
    theme(
        text = element_text(size=16),
        panel.grid = element_blank(),
        legend.position = 'none'
        )

BD span of just overlapping taxa

Taxa overlapping between emperical data and genomes in dataset
These taxa should have the same relative abundances in both datasets.
- The comm file was created from the emperical dataset phyloseq file.



In [119]:

    
%%R -i targetFile

df.target = read.delim(targetFile, sep='\t')
df.target %>% nrow %>% print
df.target %>% head(n=3)









    





[1] 194
  cluster
1     212
2     762
3     663
                                                                                                   ssu_ID
1          rRNA_NC_014259_Acinetobacter_oleivorans_DR1__Acinetobacter_oleivorans_DR1_3479770-3481295_DIR-
2 rRNA_NC_019973_Mesorhizobium_australicum_WSM2073__Mesorhizobium_australicum_WSM207_1746046-1747518_DIR+
3  rRNA_NC_013037_Dyadobacter_fermentans_DSM_18053__Dyadobacter_fermentans_DSM_18053_4003859-4005353_DIR-
                                                                                         genome_fileID
1      /var/seq_data/ncbi_db/genome/Jan2016/bac_complete_spec-rep1_rn/Acinetobacter_oleivorans_DR1.fna
2 /var/seq_data/ncbi_db/genome/Jan2016/bac_complete_spec-rep1_rn/Mesorhizobium_australicum_WSM2073.fna
3  /var/seq_data/ncbi_db/genome/Jan2016/bac_complete_spec-rep1_rn/Dyadobacter_fermentans_DSM_18053.fna
                           genomeID
1      Acinetobacter_oleivorans_DR1
2 Mesorhizobium_australicum_WSM2073
3  Dyadobacter_fermentans_DSM_18053
                                                                   genome_seqID
1          NC_014259_Acinetobacter_oleivorans_DR1__Acinetobacter_oleivorans_DR1
2 NC_019973_Mesorhizobium_australicum_WSM2073__Mesorhizobium_australicum_WSM207
3  NC_013037_Dyadobacter_fermentans_DSM_18053__Dyadobacter_fermentans_DSM_18053
      OTU
1 OTU.188
2 OTU.226
3 OTU.121
                                                                                                        OTU_taxonomy
1  Bacteria:Proteobacteria:Gammaproteobacteria:Pseudomonadales:Moraxellaceae:Acinetobacter:Unclassified:Unclassified
2 Bacteria:Proteobacteria:Alphaproteobacteria:Rhizobiales:Phyllobacteriaceae:Mesorhizobium:Unclassified:Unclassified
3                 Bacteria:Bacteroidetes:Cytophagia:Cytophagales:Cytophagaceae:Dyadobacter:Unclassified:Unclassified



In [145]:

    
%%R
# filtering to just target taxa
df.j.t = df.j %>% 
    filter(OTU %in% df.target$OTU) 
df.j %>% nrow %>% print
df.j.t %>% nrow %>% print

## plotting
ggplot(df.j.t, aes(mean_rel_abund, BD_range_perc, color=SIM_rep)) +
    geom_point(alpha=0.5, shape='O') +
    scale_x_log10() +
    scale_y_continuous() +
    #scale_color_manual(values=c('blue', 'red')) +
    labs(x='Pre-fractionation abundance', y='% of total BD range') +
    facet_grid(dataset ~ .) +
    theme_bw() +
    theme(
        text = element_text(size=16),
        panel.grid = element_blank()#,
        #legend.position = 'none'
        )

Check

Are all target (overlapping) taxa the same relative abundances in both datasets?



In [150]:

    
%%R -w 600 -h 500
# formatting data
df.1 = df.j.t %>% 
    filter(dataset == 'simulated') %>%
    select(SIM_rep, OTU, mean_rel_abund, BD_range, BD_range_perc)

df.2 = df.j.t %>%
    filter(dataset == 'emperical') %>%
    select(SIM_rep, OTU, mean_rel_abund, BD_range, BD_range_perc)

df.12 = inner_join(df.1, df.2, c('OTU' = 'OTU')) %>%
    mutate(BD_diff_perc = BD_range_perc.x - BD_range_perc.y)


df.12$SIM_rep.x = reorder(df.12$SIM_rep.x, df.12$SIM_rep.x  %>% as.numeric)

## plotting
p1 = ggplot(df.12, aes(mean_rel_abund.x, mean_rel_abund.y)) +
    geom_point(alpha=0.5) +
    scale_x_log10() +
    scale_y_log10() +
    labs(x='Relative abundance (simulated)', y='Relative abundance (emperical)') +
    facet_wrap(~ SIM_rep.x)
    theme_bw() +
    theme(
        text = element_text(size=16),
        panel.grid = element_blank(),
        legend.position = 'none'
        )
p1

Correlation between relative abundance and BD_range diff

Are low abundant taxa more variable in their BD span



In [151]:

    
%%R -w 800 -h 500

ggplot(df.12, aes(mean_rel_abund.x, BD_diff_perc)) +
    geom_point(alpha=0.5) +
    scale_x_log10() +
    labs(x='Relative abundance', 
         y='Difference in % of gradient spanned\n(simulated vs emperical)',
         title='Overlapping taxa') +
    facet_wrap(~ SIM_rep.x) +
    theme_bw() +
    theme(
        text = element_text(size=16),
        panel.grid = element_blank(),
        legend.position = 'none'
        )



In [ ]:

Plotting abundance distributions



In [113]:

    
%%R -w 800 -h 400

plot_abund_dists = function(df.EMP.j, df.SIM.j, sim_rep){
    ## emperical 
    df.EMP.j.f = df.EMP.j %>%
        filter(abundance > 0) %>%
        dplyr::select(OTU, sample, abundance, Buoyant_density, bulk_abund) %>%
        mutate(dataset = 'emperical') 
    
    ## simulated
    df.SIM.j.f = df.SIM.j %>%
        filter(SIM_rep == sim_rep) %>%
        filter(count > 0) %>%
        dplyr::select(taxon, fraction, count, BD_mid, bulk_abund) %>%
        rename('OTU' = taxon,
               'sample' = fraction,
               'Buoyant_density' = BD_mid,
               'abundance' = count) %>%
        mutate(dataset = 'simulated') 
    
    df.j = rbind(df.EMP.j.f, df.SIM.j.f) %>%
        group_by(sample) %>%
        mutate(rel_abund = abundance / sum(abundance))
    
    title = paste0('Simulation Replicate ', sim_rep)
    
    p = ggplot(df.j, aes(Buoyant_density, abundance, fill=OTU)) +
        geom_area(stat='identity', position='dodge', alpha=0.5) +
        labs(x='Buoyant density', y='Subsampled community\n(absolute abundance)', title=title) +
        facet_grid(dataset ~ .) +
        theme_bw() +
        theme( 
            text = element_text(size=16),
            legend.position = 'none',
              axis.title.y = element_text(vjust=1),        
              axis.title.x = element_blank(),
              plot.margin=unit(c(0.1,1,0.1,1), "cm")
             )
    return(p)
    }

plot_abund_dists(df.EMP.j, df.SIM.j, sim_rep=1)



In [114]:

    
%%R -w 800 -h 400

plot_abund_dists(df.EMP.j, df.SIM.j, sim_rep=2)



In [115]:

    
%%R -w 800 -h 400

plot_abund_dists(df.EMP.j, df.SIM.j, sim_rep=3)



In [116]:

    
%%R -w 800 -h 400

plot_abund_dists(df.EMP.j, df.SIM.j, sim_rep=4)



In [117]:

    
%%R -w 800 -h 400

plot_abund_dists(df.EMP.j, df.SIM.j, sim_rep=5)



In [118]:

    
%%R -w 800 -h 400

plot_abund_dists(df.EMP.j, df.SIM.j, sim_rep=6)



In [132]:

    
%%R -w 800 -h 400

plot_abund_dists(df.EMP.j, df.SIM.j, sim_rep=10)

Notes

The lack of full-BD span for some simulation replicates is due to certain low/high GC taxa being very abundant