Goal

Simulating fullCyc Day1 control gradients
- Not simulating incorporation (all 0% isotope incorp.)
  - Don't know how much true incorporatation for emperical data
Using parameters inferred from emperical data (fullCyc Day1 seq data), or if not available, default SIPSim parameters
Determining whether simulated taxa show similar distribution to the emperical data

Input parameters

phyloseq.bulk file
taxon mapping file
list of genomes
fragments simulated for all genomes
bulk community richness

workflow

Creating a community file from OTU abundances in bulk soil samples
- phyloseq.bulk --> OTU table --> filter to sample --> community table format
Fragment simulation
- simulated_fragments --> parse out fragments for target OTUs
- simulated_fragments --> parse out fragments from random genomes to obtain richness of interest
- combine fragment python objects
Convert fragment lists to kde object
Add diffusion
Make incorp config file
Add isotope incorporation
Calculating BD shift from isotope incorp
Simulating gradient fractions
Simulating OTU table
Simulating PCR
Subsampling from the OTU table

Init



In [80]:

    
import os
import glob
import re
import nestly



In [81]:

    
%load_ext rpy2.ipython









    



The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython



In [82]:

    
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)
library(phyloseq)

## BD for G+C of 0 or 100
BD.GCp0 = 0 * 0.098 + 1.66
BD.GCp100 = 1 * 0.098 + 1.66

Nestly

assuming fragments already simulated



In [83]:

    
workDir = '/home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/'
buildDir = os.path.join(workDir, 'Day1_default_run')
R_dir = '/home/nick/notebook/SIPSim/lib/R/'

fragFile= '/home/nick/notebook/SIPSim/dev/bac_genome1147/validation/ampFrags.pkl'
targetFile = '/home/nick/notebook/SIPSim/dev/fullCyc/CD-HIT/target_taxa.txt'

physeqDir = '/var/seq_data/fullCyc/MiSeq_16SrRNA/515f-806r/lib1-7/phyloseq/'
physeq_bulkCore = 'bulk-core'
physeq_SIP_core = 'SIP-core_unk'


prefrac_comm_abundance = ['1e9']
richness = 2503   # chao1 estimate for bulk Day 1

seq_per_fraction = ['lognormal', 9.432, 0.5, 10000, 30000] # dist, mean, scale, min, max
bulk_days = [1]
nprocs = 24



In [84]:

    
# building tree structure
nest = nestly.Nest()

## varying params
nest.add('abs', prefrac_comm_abundance)

## set params
nest.add('bulk_day', bulk_days, create_dir=False)
nest.add('percIncorp', [0], create_dir=False)
nest.add('percTaxa', [0], create_dir=False)
nest.add('np', [nprocs], create_dir=False)
nest.add('richness', [richness], create_dir=False)
nest.add('subsample_dist', [seq_per_fraction[0]], create_dir=False)
nest.add('subsample_mean', [seq_per_fraction[1]], create_dir=False)
nest.add('subsample_scale', [seq_per_fraction[2]], create_dir=False)
nest.add('subsample_min', [seq_per_fraction[3]], create_dir=False)
nest.add('subsample_max', [seq_per_fraction[4]], create_dir=False)

### input/output files
nest.add('buildDir', [buildDir], create_dir=False)
nest.add('R_dir', [R_dir], create_dir=False)
nest.add('fragFile', [fragFile], create_dir=False)
nest.add('targetFile', [targetFile], create_dir=False)
nest.add('physeqDir', [physeqDir], create_dir=False)
nest.add('physeq_bulkCore', [physeq_bulkCore], create_dir=False)


# building directory tree
nest.build(buildDir)

# bash file to run
bashFile = os.path.join(buildDir, 'SIPSimRun.sh')



In [85]:

    
%%writefile $bashFile
#!/bin/bash

export PATH={R_dir}:$PATH

#-- making DNA pool similar to gradient of interest
echo '# Creating comm file from phyloseq'
phyloseq2comm.r {physeqDir}{physeq_bulkCore} -s 12C-Con -d {bulk_day} > {physeq_bulkCore}_comm.txt
printf 'Number of lines: '; wc -l {physeq_bulkCore}_comm.txt

echo '## Adding target taxa to comm file'
comm_add_target.r {physeq_bulkCore}_comm.txt {targetFile} > {physeq_bulkCore}_comm_target.txt
printf 'Number of lines: '; wc -l {physeq_bulkCore}_comm_target.txt

echo '# Adding extra richness to community file'
printf "1\t{richness}\n" > richness_needed.txt
comm_add_richness.r -s {physeq_bulkCore}_comm_target.txt richness_needed.txt > {physeq_bulkCore}_comm_all.txt
### renaming comm file for downstream pipeline
cat {physeq_bulkCore}_comm_all.txt > {physeq_bulkCore}_comm_target.txt
rm -f {physeq_bulkCore}_comm_all.txt 

echo '## parsing out genome fragments to make simulated DNA pool resembling the gradient of interest'
## all OTUs without an associated reference genome will be assigned a random reference (of the reference genome pool)
### this is done through --NA-random
SIPSim fragment_KDE_parse {fragFile} {physeq_bulkCore}_comm_target.txt \
    --rename taxon_name --NA-random > fragsParsed.pkl


echo '#-- SIPSim pipeline --#'
echo '# converting fragments to KDE'
SIPSim fragment_KDE \
    fragsParsed.pkl \
    > fragsParsed_KDE.pkl
    
echo '# adding diffusion'    
SIPSim diffusion \
    fragsParsed_KDE.pkl \
    --np {np} \
    > fragsParsed_KDE_dif.pkl    

echo '# adding DBL contamination'
SIPSim DBL \
    fragsParsed_KDE_dif.pkl \
    --np {np} \
    > fragsParsed_KDE_dif_DBL.pkl      
    
echo '# making incorp file'
SIPSim incorpConfigExample \
  --percTaxa {percTaxa} \
  --percIncorpUnif {percIncorp} \
  > {percTaxa}_{percIncorp}.config

echo '# adding isotope incorporation to BD distribution'
SIPSim isotope_incorp \
    fragsParsed_KDE_dif_DBL.pkl \
    {percTaxa}_{percIncorp}.config \
    --comm {physeq_bulkCore}_comm_target.txt \
    --np {np} \
    > fragsParsed_KDE_dif_DBL_inc.pkl
    
#echo '# calculating BD shift from isotope incorporation'
#SIPSim BD_shift \
#    fragsParsed_KDE_dif_DBL.pkl \
#    fragsParsed_KDE_dif_DBL_inc.pkl \
#    --np {np} \
#    > fragsParsed_KDE_dif_DBL_inc_BD-shift.txt

echo '# simulating gradient fractions'
SIPSim gradient_fractions \
    {physeq_bulkCore}_comm_target.txt \
    > fracs.txt 

echo '# simulating an OTU table'
SIPSim OTU_table \
    fragsParsed_KDE_dif_DBL_inc.pkl \
    {physeq_bulkCore}_comm_target.txt \
    fracs.txt \
    --abs {abs} \
    --np {np} \
    > OTU_abs{abs}.txt
    
#echo '# simulating PCR'
SIPSim OTU_PCR \
    OTU_abs{abs}.txt \
    > OTU_abs{abs}_PCR.txt    
    
echo '# subsampling from the OTU table (simulating sequencing of the DNA pool)'
SIPSim OTU_subsample \
    --dist {subsample_dist} \
    --dist_params mean:{subsample_mean},sigma:{subsample_scale} \
    --min_size {subsample_min} \
    --max_size {subsample_max} \
    OTU_abs{abs}_PCR.txt \
    > OTU_abs{abs}_PCR_sub.txt
        
echo '# making a wide-formatted table'
SIPSim OTU_wideLong -w \
    OTU_abs{abs}_PCR_sub.txt \
    > OTU_abs{abs}_PCR_sub_w.txt
    
echo '# making metadata (phyloseq: sample_data)'
SIPSim OTU_sampleData \
    OTU_abs{abs}_PCR_sub.txt \
    > OTU_abs{abs}_PCR_sub_meta.txt









    



Writing /home/nick/notebook/SIPSim/dev/fullCyc/n1147_frag_norm_9_2.5_n5/Day1_default_run/SIPSimRun.sh



In [86]:

    
!chmod 777 $bashFile
!cd $workDir; \
    nestrun  --template-file $bashFile -d Day1_default_run --log-file log.txt -j 1









    



2016-01-15 13:44:26,845 * INFO * Template: ./SIPSimRun.sh
2016-01-15 13:44:26,847 * INFO * [42819] Started ./SIPSimRun.sh in Day1_default_run/1e9
2016-01-15 14:24:38,567 * INFO * [42819] Day1_default_run/1e9 Finished with 0

Checking amplicon fragment BD distribution

'Raw' fragments



In [87]:

    
workDir1 = os.path.join(workDir, 'Day1_default_run/1e9/')

!cd $workDir1; \
    SIPSim KDE_info \
    -s fragsParsed_KDE.pkl \
    > fragsParsed_KDE_info.txt









    



Loading KDEs...



In [88]:

    
%%R -i workDir1

inFile = file.path(workDir1, 'fragsParsed_KDE_info.txt')
df = read.delim(inFile, sep='\t') %>%
    filter(KDE_ID == 1)

df %>% head(n=3)









    





  taxon_ID KDE_ID      min percentile_5 percentile_25     mean   median
1  OTU.752      1 1.710800     1.712144      1.714322 1.716290 1.715908
2  OTU.750      1 1.709851     1.711380      1.712711 1.713594 1.713583
3  OTU.756      1 1.703666     1.707121      1.709378 1.709833 1.709944
  percentile_75 percentile_95      max       stdev
1      1.718243      1.721190 1.724413 0.002704577
2      1.714485      1.715770 1.718975 0.001298266
3      1.710605      1.711759 1.714322 0.001326787



In [89]:

    
%%R -w 600 -h 300
ggplot(df, aes(median)) +
    geom_histogram(binwidth=0.001) +
    labs(x='Buoyant density') +
    theme_bw() +
    theme(
        text = element_text(size=16)
    )

fragments w/ diffusion + DBL



In [90]:

    
workDir1 = os.path.join(workDir, 'Day1_default_run/1e9/')

!cd $workDir1; \
    SIPSim KDE_info \
    -s fragsParsed_KDE_dif_DBL.pkl \
    > fragsParsed_KDE_dif_DBL_info.pkl









    



Loading KDEs...



In [91]:

    
%%R -i workDir1

inFile = file.path(workDir1, 'fragsParsed_KDE_dif_DBL_info.pkl')
df = read.delim(inFile, sep='\t') %>%
    filter(KDE_ID == 1)

df %>% head(n=3)









    





  taxon_ID KDE_ID      min percentile_5 percentile_25     mean   median
1  OTU.752      1 1.649934     1.710127      1.713732 1.716368 1.716223
2  OTU.750      1 1.649259     1.708946      1.711720 1.713671 1.713564
3  OTU.756      1 1.647352     1.704917      1.707947 1.709891 1.709897
  percentile_75 percentile_95      max       stdev
1      1.718874      1.722828 1.797751 0.004901727
2      1.715464      1.718452 1.796662 0.004131223
3      1.711787      1.714551 1.793228 0.004158380



In [92]:

    
%%R -w 600 -h 300
ggplot(df, aes(median)) +
    geom_histogram(binwidth=0.001) +
    labs(x='Buoyant density') +
    theme_bw() +
    theme(
        text = element_text(size=16)
    )

BD min/max

what is the min/max BD that we care about?



In [93]:

    
%%R
## min G+C cutoff
min_GC = 13.5
## max G+C cutoff
max_GC = 80
## max G+C shift
max_13C_shift_in_BD = 0.036


min_BD = min_GC/100.0 * 0.098 + 1.66    
max_BD = max_GC/100.0 * 0.098 + 1.66    

max_BD = max_BD + max_13C_shift_in_BD

cat('Min BD:', min_BD, '\n')
cat('Max BD:', max_BD, '\n')









    





Min BD: 1.67323 
Max BD: 1.7744

Plotting number of taxa in each fraction

Emperical data (fullCyc)



In [120]:

    
%%R 
# simulated OTU table file
OTU.table.dir = '/home/nick/notebook/SIPSim/dev/fullCyc/frag_norm_9_2.5_n5/Day1_default_run/1e9/'
OTU.table.file = 'OTU_abs1e9_PCR_sub.txt'
#OTU.table.file = 'OTU_abs1e9_sub.txt'
#OTU.table.file = 'OTU_abs1e9.txt'



In [121]:

    
%%R -i physeqDir -i physeq_SIP_core -i bulk_days

# bulk core samples
F = file.path(physeqDir, physeq_SIP_core)
physeq.SIP.core = readRDS(F) 
physeq.SIP.core.m = physeq.SIP.core %>% sample_data

physeq.SIP.core = prune_samples(physeq.SIP.core.m$Substrate == '12C-Con' & 
                                physeq.SIP.core.m$Day %in% bulk_days, 
                                physeq.SIP.core) %>%
    filter_taxa(function(x) sum(x) > 0, TRUE)
physeq.SIP.core.m = physeq.SIP.core %>% sample_data        

physeq.SIP.core









    



The history saving thread hit an unexpected error (OperationalError('database is locked',)).History will not be written to the database.






    





phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 7025 taxa and 25 samples ]
sample_data() Sample Data:       [ 25 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 7025 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 7025 tips and 7024 internal nodes ]



In [122]:

    
%%R -w 800 -h 300 

## dataframe
df.EMP = physeq.SIP.core %>% otu_table %>%
    as.matrix %>% as.data.frame
df.EMP$OTU = rownames(df.EMP)
df.EMP = df.EMP %>%    
    gather(sample, abundance, 1:(ncol(df.EMP)-1)) 

df.EMP = inner_join(df.EMP, physeq.SIP.core.m, c('sample' = 'X.Sample')) 

df.EMP.nt = df.EMP %>%
    group_by(sample) %>%
    mutate(n_taxa = sum(abundance > 0)) %>%
    ungroup() %>%
    distinct(sample) %>%
    filter(Buoyant_density >= min_BD, 
           Buoyant_density <= max_BD)
    

## plotting
p = ggplot(df.EMP.nt, aes(Buoyant_density, n_taxa)) +
    geom_point(color='blue') +
    geom_line(color='blue') +
    #geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
    labs(x='Buoyant density', y='Number of taxa') +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        legend.position = 'none'
    )
p

w/ simulated data



In [123]:

    
%%R -w 800 -h 300

# loading file
F = file.path(workDir1, OTU.table.file)
df.SIM = read.delim(F, sep='\t')

## edit table
df.SIM.nt = df.SIM %>%
    filter(count > 0) %>%
    group_by(library, BD_mid) %>%
    summarize(n_taxa = n()) %>%
    filter(BD_mid >= min_BD, 
           BD_mid <= max_BD)
    

## plot
p = ggplot(df.SIM.nt, aes(BD_mid, n_taxa)) +
    geom_point(color='red') +
    geom_line(color='red') +
    geom_point(data=df.EMP.nt, aes(x=Buoyant_density), color='blue') +
    geom_line(data=df.EMP.nt, aes(x=Buoyant_density), color='blue') +
    #geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
    labs(x='Buoyant density', y='Number of taxa') +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        legend.position = 'none'
    )
p



In [124]:

    
%%R -w 800 -h 300
# normalized by max number of taxa

## edit table
df.SIM.nt = df.SIM.nt %>%
    group_by() %>%
    mutate(n_taxa_norm = n_taxa / max(n_taxa))

df.EMP.nt = df.EMP.nt %>%
    group_by() %>%
    mutate(n_taxa_norm = n_taxa / max(n_taxa))


## plot
p = ggplot(df.SIM.nt, aes(BD_mid, n_taxa_norm)) +
    geom_point(color='red') +
    geom_line(color='red') +
    geom_point(data=df.EMP.nt, aes(x=Buoyant_density), color='blue') +
    geom_line(data=df.EMP.nt, aes(x=Buoyant_density), color='blue') +
    #geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
    scale_y_continuous(limits=c(0, 1)) +
    labs(x='Buoyant density', y='Number of taxa\n(fraction of max)') +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        legend.position = 'none'
    )
p

Total sequence count



In [125]:

    
%%R -w 800 -h 300

# simulated
df.SIM.s = df.SIM %>%
    group_by(library, BD_mid) %>%
    summarize(total_abund = sum(count)) %>%
    rename('Day' = library, 'Buoyant_density' = BD_mid) %>%
    ungroup() %>%
    mutate(dataset='simulated')
# emperical
df.EMP.s = df.EMP %>% 
    group_by(Day, Buoyant_density) %>%
    summarize(total_abund = sum(abundance))  %>%
    ungroup() %>%
    mutate(dataset='emperical')

# join
df.j = rbind(df.SIM.s, df.EMP.s) %>%
    filter(Buoyant_density >= min_BD, 
           Buoyant_density <= max_BD)
    
df.SIM.s = df.EMP.s = ""

# plot
ggplot(df.j, aes(Buoyant_density, total_abund, color=dataset)) +
    geom_point() +
    geom_line() +
    scale_color_manual(values=c('blue', 'red')) +
    labs(x='Buoyant density', y='Total sequences per sample') +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        legend.position = 'none'
    )

Plotting Shannon diversity for each



In [126]:

    
%%R
shannon_index_long = function(df, abundance_col, ...){
    # calculating shannon diversity index from a 'long' formated table
    ## community_col = name of column defining communities
    ## abundance_col = name of column defining taxon abundances
    df = df %>% as.data.frame
    cmd = paste0(abundance_col, '/sum(', abundance_col, ')')
    df.s = df %>%
        group_by_(...) %>%
        mutate_(REL_abundance = cmd) %>%
        mutate(pi__ln_pi = REL_abundance * log(REL_abundance),
               shannon = -sum(pi__ln_pi, na.rm=TRUE)) %>%
        ungroup() %>% 
        dplyr::select(-REL_abundance, -pi__ln_pi) %>%
        distinct_(...) 
    return(df.s)
}



In [127]:

    
%%R
# calculating shannon
df.SIM.shan = shannon_index_long(df.SIM, 'count', 'library', 'fraction') %>%
    filter(BD_mid >= min_BD, 
           BD_mid <= max_BD)
    
df.EMP.shan = shannon_index_long(df.EMP, 'abundance', 'sample') %>%
    filter(Buoyant_density >= min_BD, 
           Buoyant_density <= max_BD)



In [128]:

    
%%R -w 800 -h 300
# plotting
p = ggplot(df.SIM.shan, aes(BD_mid, shannon)) +
    geom_point(color='red') +
    geom_line(color='red') +
    geom_point(data=df.EMP.shan, aes(x=Buoyant_density), color='blue') +
    geom_line(data=df.EMP.shan, aes(x=Buoyant_density), color='blue') +
    scale_y_continuous(limits=c(4, 7.5)) +
    labs(x='Buoyant density', y='Shannon index') +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        legend.position = 'none'
    )
p

min/max abundances of taxa



In [129]:

    
%%R -h 300 -w 800

# simulated
df.SIM.s = df.SIM %>% 
    filter(rel_abund > 0) %>%
    group_by(BD_mid) %>%
    summarize(min_abund = min(rel_abund),
              max_abund = max(rel_abund)) %>%
    ungroup() %>%
    rename('Buoyant_density' = BD_mid) %>%
    mutate(dataset = 'simulated')

# emperical
df.EMP.s = df.EMP %>%
    group_by(Buoyant_density) %>%
    mutate(rel_abund = abundance / sum(abundance)) %>%
    filter(rel_abund > 0) %>%
    summarize(min_abund = min(rel_abund),
              max_abund = max(rel_abund)) %>%
    ungroup() %>%
    mutate(dataset = 'emperical')

df.j = rbind(df.SIM.s, df.EMP.s) %>%
    filter(Buoyant_density >= min_BD, 
           Buoyant_density <= max_BD)
    

# plotting
ggplot(df.j, aes(Buoyant_density, max_abund, color=dataset, group=dataset)) +
    geom_point() +
    geom_line() +
    scale_color_manual(values=c('blue', 'red')) +
    labs(x='Buoyant density', y='Maximum relative abundance') +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        legend.position = 'none'
    )

Plotting rank-abundance of heavy fractions

In heavy fractions, is DBL resulting in approx. equal abundances among taxa?



In [130]:

    
%%R -w 900


# simulated
df.SIM.s = df.SIM %>%
    select(BD_mid, rel_abund) %>%
    rename('Buoyant_density' = BD_mid) %>%
    mutate(dataset='simulated') 
# emperical
df.EMP.s = df.EMP %>% 
    group_by(Buoyant_density) %>%
    mutate(rel_abund = abundance / sum(abundance)) %>%
    ungroup() %>%
    filter(rel_abund > 0) %>%
    select(Buoyant_density, rel_abund) %>%
    mutate(dataset='emperical') 

# join
df.j = rbind(df.SIM.s, df.EMP.s) %>%
    filter(Buoyant_density > 1.73) %>%
    mutate(Buoyant_density = round(Buoyant_density, 3),
           Buoyant_density_c = as.character(Buoyant_density))

df.j$Buoyant_density_c = reorder(df.j$Buoyant_density_c, df.j$Buoyant_density)

ggplot(df.j, aes(Buoyant_density_c, rel_abund)) +
    geom_boxplot() +
    scale_color_manual(values=c('blue', 'red')) +
    labs(x='Buoyant density', y='Maximum relative abundance') +
    facet_grid(dataset ~ .) +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        axis.text.x = element_text(angle=60, hjust=1),
        legend.position = 'none'
    )

BD range where an OTU is detected

Do the simulated OTU BD distributions span the same BD range of the emperical data?

Simulated



In [131]:

    
%%R

# loading comm file
F = file.path(workDir1, 'bulk-core_comm_target.txt')
df.comm = read.delim(F, sep='\t') %>%
    dplyr::select(library, taxon_name, rel_abund_perc) %>%
    rename('bulk_abund' = rel_abund_perc) %>%
    mutate(bulk_abund = bulk_abund / 100)


## joining
df.SIM.j = inner_join(df.SIM, df.comm, c('library' = 'library',
                                         'taxon' = 'taxon_name')) %>%
    filter(BD_mid >= min_BD, 
           BD_mid <= max_BD)
    
df.SIM.j %>% head(n=3)









    





  library    fraction taxon BD_min BD_mid BD_max count   rel_abund bulk_abund
1       1 1.675-1.677 OTU.1  1.675  1.676  1.677   103 0.009994178 0.02801777
2       1 1.677-1.681 OTU.1  1.677  1.679  1.681   110 0.009394483 0.02801777
3       1 1.681-1.687 OTU.1  1.681  1.684  1.687   220 0.015486414 0.02801777

Emperical



In [132]:

    
%%R
bulk_days = c(1)



In [133]:

    
%%R
physeq.dir = '/var/seq_data/fullCyc/MiSeq_16SrRNA/515f-806r/lib1-7/phyloseq/'
physeq.bulk = 'bulk-core'
physeq.file = file.path(physeq.dir, physeq.bulk)
physeq.bulk = readRDS(physeq.file)
physeq.bulk.m = physeq.bulk %>% sample_data
physeq.bulk = prune_samples(physeq.bulk.m$Exp_type == 'microcosm_bulk' &
                            physeq.bulk.m$Day %in% bulk_days, physeq.bulk)

physeq.bulk.m = physeq.bulk %>% sample_data
physeq.bulk









    





phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 4950 taxa and 1 samples ]
sample_data() Sample Data:       [ 1 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 4950 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 4950 tips and 4949 internal nodes ]



In [134]:

    
%%R
physeq.bulk.n = transform_sample_counts(physeq.bulk, function(x) x/sum(x))
physeq.bulk.n









    





phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 4950 taxa and 1 samples ]
sample_data() Sample Data:       [ 1 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 4950 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 4950 tips and 4949 internal nodes ]



In [135]:

    
%%R
# making long format of each bulk table
bulk.otu = physeq.bulk.n %>% otu_table %>% as.data.frame
ncol = ncol(bulk.otu)
bulk.otu$OTU = rownames(bulk.otu)
bulk.otu = bulk.otu %>%
    gather(sample, abundance, 1:ncol) 

bulk.otu = inner_join(physeq.bulk.m, bulk.otu, c('X.Sample' = 'sample')) %>%
    dplyr::select(OTU, abundance) %>%
    rename('bulk_abund' = abundance)
bulk.otu %>% head(n=3)









    





       OTU   bulk_abund
1 OTU.1101 1.234263e-04
2 OTU.1130 6.171316e-05
3 OTU.9833 0.000000e+00



In [136]:

    
%%R
# joining tables
df.EMP.j = inner_join(df.EMP, bulk.otu, c('OTU' = 'OTU')) %>%
    filter(Buoyant_density >= min_BD, 
           Buoyant_density <= max_BD)
    
df.EMP.j %>% head(n=3)









    





       OTU            sample abundance primer_number fwd_barcode rev_barcode
1 OTU.1101 12C-Con.D1.R2_F23         2           134    TCGACGAG    TGAGTACG
2 OTU.1130 12C-Con.D1.R2_F23         0           134    TCGACGAG    TGAGTACG
3 OTU.9833 12C-Con.D1.R2_F23         0           134    TCGACGAG    TGAGTACG
  Substrate Day Microcosm_replicate Fraction Buoyant_density Sample_type
1   12C-Con   1                   2       23         1.69362     unknown
2   12C-Con   1                   2       23         1.69362     unknown
3   12C-Con   1                   2       23         1.69362     unknown
         library Exp_type Sample_location Sample_date Sample_treatment
1 150721_V4_Lib4      SIP              NA          NA               NA
2 150721_V4_Lib4      SIP              NA          NA               NA
3 150721_V4_Lib4      SIP              NA          NA               NA
  Sample_subtreatment core_dataset   bulk_abund
1                  NA         TRUE 1.234263e-04
2                  NA         TRUE 6.171316e-05
3                  NA         TRUE 0.000000e+00



In [137]:

    
%%R -h 400
# filtering & combining emperical w/ simulated data

## emperical 
max_BD_range = max(df.EMP.j$Buoyant_density) - min(df.EMP.j$Buoyant_density)
df.EMP.j.f = df.EMP.j %>%
    filter(abundance > 0) %>%
    group_by(OTU) %>%
    summarize(mean_rel_abund = mean(bulk_abund),
              min_BD = min(Buoyant_density),
              max_BD = max(Buoyant_density),
              BD_range = max_BD - min_BD,
              BD_range_perc = BD_range / max_BD_range * 100) %>%
    ungroup() %>%
    mutate(dataset = 'emperical')

## simulated
max_BD_range = max(df.SIM.j$BD_mid) - min(df.SIM.j$BD_mid)
df.SIM.j.f = df.SIM.j %>%
    filter(count > 0) %>%
    group_by(taxon) %>%
    summarize(mean_rel_abund = mean(bulk_abund),
              min_BD = min(BD_mid),
              max_BD = max(BD_mid),
              BD_range = max_BD - min_BD,
              BD_range_perc = BD_range / max_BD_range * 100) %>%
    ungroup() %>%
    rename('OTU' = taxon) %>%
    mutate(dataset = 'simulated')

## join
df.j = rbind(df.EMP.j.f, df.SIM.j.f) %>%
    filter(BD_range_perc > 0,
            mean_rel_abund > 0)

## plotting
ggplot(df.j, aes(mean_rel_abund, BD_range_perc, color=dataset)) +
    geom_point(alpha=0.5, shape='O') +
    #stat_density2d() +
    #scale_fill_gradient(low='white', high='red', na.value='grey50') +
    #scale_x_log10(limits=c(min(df.j$mean_rel_abund, na.rm=T), 1e-2)) +
    #scale_y_continuous(limits=c(90, 100)) +
    scale_x_log10() +
    scale_y_continuous() +
    scale_color_manual(values=c('blue', 'red')) +
    labs(x='Pre-fractionation abundance', y='% of total BD range') +
    #geom_vline(xintercept=0.001, linetype='dashed', alpha=0.5) +
    facet_grid(dataset ~ .) +
    theme_bw() +
    theme(
        text = element_text(size=16),
        panel.grid = element_blank(),
        legend.position = 'none'
        )

BD span of just overlapping taxa

Taxa overlapping between emperical data and genomes in dataset
These taxa should have the same relative abundances in both datasets.
- The comm file was created from the emperical dataset phyloseq file.



In [148]:

    
%%R -i targetFile

df.target = read.delim(targetFile, sep='\t')
df.target %>% nrow %>% print
df.target %>% head(n=3)









    





[1] 194
  cluster
1     212
2     762
3     663
                                                                                                   ssu_ID
1          rRNA_NC_014259_Acinetobacter_oleivorans_DR1__Acinetobacter_oleivorans_DR1_3479770-3481295_DIR-
2 rRNA_NC_019973_Mesorhizobium_australicum_WSM2073__Mesorhizobium_australicum_WSM207_1746046-1747518_DIR+
3  rRNA_NC_013037_Dyadobacter_fermentans_DSM_18053__Dyadobacter_fermentans_DSM_18053_4003859-4005353_DIR-
                                                                                         genome_fileID
1      /var/seq_data/ncbi_db/genome/Jan2016/bac_complete_spec-rep1_rn/Acinetobacter_oleivorans_DR1.fna
2 /var/seq_data/ncbi_db/genome/Jan2016/bac_complete_spec-rep1_rn/Mesorhizobium_australicum_WSM2073.fna
3  /var/seq_data/ncbi_db/genome/Jan2016/bac_complete_spec-rep1_rn/Dyadobacter_fermentans_DSM_18053.fna
                           genomeID
1      Acinetobacter_oleivorans_DR1
2 Mesorhizobium_australicum_WSM2073
3  Dyadobacter_fermentans_DSM_18053
                                                                   genome_seqID
1          NC_014259_Acinetobacter_oleivorans_DR1__Acinetobacter_oleivorans_DR1
2 NC_019973_Mesorhizobium_australicum_WSM2073__Mesorhizobium_australicum_WSM207
3  NC_013037_Dyadobacter_fermentans_DSM_18053__Dyadobacter_fermentans_DSM_18053
      OTU
1 OTU.188
2 OTU.226
3 OTU.121
                                                                                                        OTU_taxonomy
1  Bacteria:Proteobacteria:Gammaproteobacteria:Pseudomonadales:Moraxellaceae:Acinetobacter:Unclassified:Unclassified
2 Bacteria:Proteobacteria:Alphaproteobacteria:Rhizobiales:Phyllobacteriaceae:Mesorhizobium:Unclassified:Unclassified
3                 Bacteria:Bacteroidetes:Cytophagia:Cytophagales:Cytophagaceae:Dyadobacter:Unclassified:Unclassified



In [149]:

    
%%R
# filtering to just target taxa
df.j.t = df.j %>% 
    filter(OTU %in% df.target$OTU) 

## plotting
ggplot(df.j.t, aes(mean_rel_abund, BD_range_perc, color=dataset)) +
    geom_point(alpha=0.5, shape='O') +
    scale_x_log10() +
    scale_y_continuous() +
    scale_color_manual(values=c('blue', 'red')) +
    labs(x='Pre-fractionation abundance', y='% of total BD range') +
    facet_grid(dataset ~ .) +
    theme_bw() +
    theme(
        text = element_text(size=16),
        panel.grid = element_blank(),
        legend.position = 'none'
        )

Check

Are all target (overlapping) taxa the same relative abundances in both datasets?



In [203]:

    
%%R -w 380 -h 350
# formatting data
df.1 = df.j.t %>% 
    filter(dataset == 'simulated') %>%
    select(OTU, mean_rel_abund, BD_range, BD_range_perc)

df.2 = df.j.t %>%
    filter(dataset == 'emperical') %>%
    select(OTU, mean_rel_abund, BD_range, BD_range_perc)

df.12 = inner_join(df.1, df.2, c('OTU' = 'OTU')) %>%
    mutate(BD_diff_perc = BD_range_perc.x - BD_range_perc.y)
df.12 %>% head

## plotting
p1 = ggplot(df.12, aes(mean_rel_abund.x, mean_rel_abund.y)) +
    geom_point(alpha=0.5) +
    scale_x_log10() +
    scale_y_log10() +
    labs(x='Relative abundance (simulated)', y='Relative abundance (emperical)') +
    theme_bw() +
    theme(
        text = element_text(size=16),
        panel.grid = element_blank(),
        legend.position = 'none'
        )
p1

Correlation between relative abundance and BD_range diff

Are low abundant taxa more variable in their BD span



In [205]:

    
%%R -w 500 -h 350

ggplot(df.12, aes(mean_rel_abund.x, BD_diff_perc)) +
    geom_point() +
    scale_x_log10() +
    labs(x='Relative abundance', 
         y='Difference in % of gradient spanned\n(simulated vs emperical)',
         title='Overlapping taxa') +
    theme_bw() +
    theme(
        text = element_text(size=16),
        panel.grid = element_blank(),
        legend.position = 'none'
        )

Plotting abundance distributions



In [113]:

    
%%R 

## emperical 
df.EMP.j.f = df.EMP.j %>%
    filter(abundance > 0) %>%
    dplyr::select(OTU, sample, abundance, Buoyant_density, bulk_abund) %>%
    mutate(dataset = 'emperical') 

## simulated
df.SIM.j.f = df.SIM.j %>%
    filter(count > 0) %>%
    dplyr::select(taxon, fraction, count, BD_mid, bulk_abund) %>%
    rename('OTU' = taxon,
           'sample' = fraction,
           'Buoyant_density' = BD_mid,
           'abundance' = count) %>%
    mutate(dataset = 'simulated') 

df.j = rbind(df.EMP.j.f, df.SIM.j.f) %>%
    group_by(sample) %>%
    mutate(rel_abund = abundance / sum(abundance))

df.j %>% head(n=3) %>% as.data.frame









    





       OTU            sample abundance Buoyant_density   bulk_abund   dataset
1 OTU.1101 12C-Con.D1.R2_F23         2         1.69362 1.234263e-04 emperical
2 OTU.1249 12C-Con.D1.R2_F23         1         1.69362 0.000000e+00 emperical
3 OTU.1653 12C-Con.D1.R2_F23         1         1.69362 6.171316e-05 emperical
     rel_abund
1 1.527417e-04
2 7.637086e-05
3 7.637086e-05



In [114]:

    
%%R -w 800 -h 400

# plotting absolute abundances of subsampled
## plot
p = ggplot(df.j, aes(Buoyant_density, abundance, fill=OTU)) +
    geom_area(stat='identity', position='dodge', alpha=0.5) +
    #geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
    labs(x='Buoyant density', y='Subsampled community\n(absolute abundance)') +
    facet_grid(dataset ~ .) +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        legend.position = 'none',
          axis.title.y = element_text(vjust=1),        
          axis.title.x = element_blank(),
          plot.margin=unit(c(0.1,1,0.1,1), "cm")
         )
p



In [115]:

    
%%R -w 800 -h 400

# plotting relative abundances of subsampled
p = ggplot(df.j, aes(Buoyant_density, rel_abund, fill=OTU)) +
    geom_area(stat='identity', position='dodge', alpha=0.5) +
    #geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
    labs(x='Buoyant density', y='Subsampled community\n(relative abundance)') +
    facet_grid(dataset ~ .) +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        legend.position = 'none',
          axis.title.y = element_text(vjust=1),        
          axis.title.x = element_blank(),
          plot.margin=unit(c(0.1,1,0.1,1), "cm")
         )
p

--OLD--

Determining the pre-fractionation abundances of taxa in each gradient fraction

emperical data
low-abundant taxa out at the tails?
OR broad distributions of high abundant taxa



In [114]:

    
%%R
physeq.SIP.core.n = transform_sample_counts(physeq.SIP.core, function(x) x/sum(x))
physeq.SIP.core.n









    





phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 7025 taxa and 25 samples ]
sample_data() Sample Data:       [ 25 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 7025 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 7025 tips and 7024 internal nodes ]



In [118]:

    
%%R
physeq.dir = '/var/seq_data/fullCyc/MiSeq_16SrRNA/515f-806r/lib1-7/phyloseq/'
physeq.bulk = 'bulk-core'
physeq.file = file.path(physeq.dir, physeq.bulk)
physeq.bulk = readRDS(physeq.file)
physeq.bulk.m = physeq.bulk %>% sample_data
physeq.bulk = prune_samples(physeq.bulk.m$Exp_type == 'microcosm_bulk' &
                            physeq.bulk.m$Day %in% bulk_days, physeq.bulk)

physeq.bulk.m = physeq.bulk %>% sample_data
physeq.bulk









    





phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 4950 taxa and 1 samples ]
sample_data() Sample Data:       [ 1 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 4950 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 4950 tips and 4949 internal nodes ]



In [119]:

    
%%R
physeq.bulk.n = transform_sample_counts(physeq.bulk, function(x) x/sum(x))
physeq.bulk.n









    





phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 4950 taxa and 1 samples ]
sample_data() Sample Data:       [ 1 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 4950 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 4950 tips and 4949 internal nodes ]



In [136]:

    
%%R
# making long format of SIP OTU table
SIP.otu = physeq.SIP.core.n %>% otu_table %>% as.data.frame
ncol = ncol(SIP.otu)
SIP.otu$OTU = rownames(SIP.otu)
SIP.otu = SIP.otu %>%
    gather(sample, abundance, 1:ncol) 

SIP.otu = inner_join(physeq.SIP.core.m, SIP.otu, c('X.Sample' = 'sample')) %>%
    select(-core_dataset, -Sample_location, -Sample_date, -Sample_treatment,
           -Sample_subtreatment, -library, -Sample_type)
SIP.otu %>% head(n=3)









    





           X.Sample primer_number fwd_barcode rev_barcode Substrate Day
1 12C-Con.D1.R2_F23           134    TCGACGAG    TGAGTACG   12C-Con   1
2 12C-Con.D1.R2_F23           134    TCGACGAG    TGAGTACG   12C-Con   1
3 12C-Con.D1.R2_F23           134    TCGACGAG    TGAGTACG   12C-Con   1
  Microcosm_replicate Fraction Buoyant_density Exp_type      OTU    abundance
1                   2       23         1.69362      SIP OTU.1101 0.0001485001
2                   2       23         1.69362      SIP OTU.1130 0.0000000000
3                   2       23         1.69362      SIP OTU.9833 0.0000000000



In [137]:

    
%%R
# making long format of each bulk table
bulk.otu = physeq.bulk.n %>% otu_table %>% as.data.frame
ncol = ncol(bulk.otu)
bulk.otu$OTU = rownames(bulk.otu)
bulk.otu = bulk.otu %>%
    gather(sample, abundance, 1:ncol) 

bulk.otu = inner_join(physeq.bulk.m, bulk.otu, c('X.Sample' = 'sample')) %>%
    select(OTU, abundance) %>%
    rename('bulk_abund' = abundance)
bulk.otu %>% head(n=3)









    





       OTU   bulk_abund
1 OTU.1101 1.234263e-04
2 OTU.1130 6.171316e-05
3 OTU.9833 0.000000e+00



In [138]:

    
%%R
# joining tables
SIP.otu = inner_join(SIP.otu, bulk.otu, c('OTU' = 'OTU'))
SIP.otu %>% head(n=3)









    





           X.Sample primer_number fwd_barcode rev_barcode Substrate Day
1 12C-Con.D1.R2_F23           134    TCGACGAG    TGAGTACG   12C-Con   1
2 12C-Con.D1.R2_F23           134    TCGACGAG    TGAGTACG   12C-Con   1
3 12C-Con.D1.R2_F23           134    TCGACGAG    TGAGTACG   12C-Con   1
  Microcosm_replicate Fraction Buoyant_density Exp_type      OTU    abundance
1                   2       23         1.69362      SIP OTU.1101 0.0001485001
2                   2       23         1.69362      SIP OTU.1130 0.0000000000
3                   2       23         1.69362      SIP OTU.9833 0.0000000000
    bulk_abund
1 1.234263e-04
2 6.171316e-05
3 0.000000e+00



In [147]:

    
%%R -w 900 -h 900
# for each gradient, plotting gradient rel_abund vs bulk rel_abund

ggplot(SIP.otu, aes(bulk_abund, abundance)) +
    geom_point(alpha=0.2) +
    geom_point(shape='O', alpha=0.6) +
    facet_wrap(~ Buoyant_density) +
    labs(x='Pre-fractionation relative abundance', 
         y='Fraction relative abundance') +
    theme_bw() +
    theme(
        text = element_text(size=16)
    )



In [152]:

    
%%R -w 900 -h 900
# for each gradient, plotting gradient rel_abund vs bulk rel_abund

ggplot(SIP.otu, aes(bulk_abund, abundance)) +
    geom_point(alpha=0.2) +
    geom_point(shape='O', alpha=0.6) +
    scale_x_continuous(limits=c(0,0.01)) +
    scale_y_continuous(limits=c(0,0.01)) +
    facet_wrap(~ Buoyant_density) +
    labs(x='Pre-fractionation relative abundance', 
         y='Fraction relative abundance') +
    theme_bw() +
    theme(
        text = element_text(size=16),
        axis.text.x = element_text(angle=90, hjust=1, vjust=0.5)
    )

Plotting the abundance distribution of top 10 most abundant taxa (bulk samples)



In [169]:

    
%%R -w 500 -h 300
# checking bulk rank-abundance
tmp = bulk.otu %>%
    mutate(rank = row_number(-bulk_abund))

ggplot(tmp, aes(rank, bulk_abund)) +
    geom_point()



In [183]:

    
%%R -w 900
top.n = filter(tmp, rank <= 10)
SIP.otu.f = SIP.otu %>%
    filter(OTU %in% top.n$OTU) 

ggplot(SIP.otu.f, aes(Buoyant_density, abundance, group=OTU, fill=OTU)) +
    #geom_point() +
    #geom_line() +
    geom_area(position='dodge', alpha=0.4) +
    labs(y='Relative abundance', x='Buoyant density') +
    theme_bw() +
    theme(
        text = element_text(size=16)
    )



In [209]:

    
%%R -w 600 -h 400
# Number of gradients that each OTU is found in

max_BD_range = max(SIP.otu$Buoyant_density) - min(SIP.otu$Buoyant_density)

SIP.otu.f = SIP.otu %>%
    filter(abundance > 0) %>%
    group_by(OTU) %>%
    summarize(bulk_abund = mean(bulk_abund),
              min_BD = min(Buoyant_density),
              max_BD = max(Buoyant_density),
              BD_range = max_BD - min_BD,
              BD_range_perc = BD_range / max_BD_range * 100) %>%
    ungroup() 


ggplot(SIP.otu.f, aes(bulk_abund, BD_range_perc, group=OTU)) +
    geom_point() +
    scale_x_log10() +
    labs(x='Pre-fractionation abundance', y='% of total BD range') +
    geom_vline(xintercept=0.001, linetype='dashed', alpha=0.5) +
    theme_bw() +
    theme(
        text = element_text(size=16)
        )

Notes

Currently, simulated taxa with an abundance of >0.1% are detected in approx. all fractions.