Description

  • Trimming the FullCyc 12C-Con HR-SIP dataset to match the reference genome pool size

Setting variables


In [1]:
workDir = '/home/nick/notebook/SIPSim/dev/fullCyc_trim/'

physeqDir = '/var/seq_data/fullCyc/MiSeq_16SrRNA/515f-806r/lib1-7/phyloseq/'
physeq_bulkCore = 'bulk-core'
physeq_SIP_core = 'SIP-core_unk'

ampFrag_KDE_info = '/home/nick/notebook/SIPSim/dev/bac_genome1147/validation/ampFrags_kde_info.txt'

Init


In [2]:
import os
import sys
%load_ext rpy2.ipython
%load_ext pushnote

In [3]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)
library(phyloseq)
library(vegan)
library(robCompositions)


/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: 
Attaching package: ‘dplyr’


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following objects are masked from ‘package:stats’:

    filter, lag


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: 
Attaching package: ‘gridExtra’


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following object is masked from ‘package:dplyr’:

    combine


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: permute

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: lattice

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: This is vegan 2.3-5

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: robustbase

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: data.table

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: data.table 1.9.6  For help type ?data.table or https://github.com/Rdatatable/data.table/wiki

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The fastest way to learn (by data.table authors): https://www.datacamp.com/courses/data-analysis-the-data-table-way

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: 
Attaching package: ‘data.table’


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following objects are masked from ‘package:dplyr’:

    between, last


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: pls

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: 
Attaching package: ‘pls’


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following object is masked from ‘package:vegan’:

    scores


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following object is masked from ‘package:stats’:

    loadings


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: sROC 0.1-2 loaded

  res = super(Function, self).__call__(*new_args, **new_kwargs)

In [4]:
if not os.path.isdir(workDir):
    os.makedirs(workDir)
    
%cd $workDir


/home/nick/notebook/SIPSim/dev/fullCyc_trim

In [5]:
%%R
## min G+C cutoff
min_GC = 13.5
## max G+C cutoff
max_GC = 80
## max G+C shift
max_13C_shift_in_BD = 0.036


min_BD = min_GC/100.0 * 0.098 + 1.66    
max_BD = max_GC/100.0 * 0.098 + 1.66    

max_BD = max_BD + max_13C_shift_in_BD

cat('Min BD:', min_BD, '\n')
cat('Max BD:', max_BD, '\n')


Min BD: 1.67323 
Max BD: 1.7744 

Checking how many taxa actually amplify for the in-silco PCR


In [6]:
%%R -i ampFrag_KDE_info 

ntaxa = read.delim(ampFrag_KDE_info, sep='\t') %>%
    filter(! is.nan(KDE_ID)) %>%
    distinct(taxon_ID) %>% nrow

cat('Number of taxa that have amplicon-fragments: ', ntaxa, '\n')


Number of taxa that have amplicon-fragments:  1102 

Parsing out amplicon fragments of just the taxa that amplified


In [8]:
%%R -i workDir

F = file.path(workDir, 'ampFrags_kde_amplified.txt')

taxa = read.delim(ampFrag_KDE_info, sep='\t') %>%
    filter(! is.nan(KDE_ID)) %>% 
    select(taxon_ID) %>%
    distinct(taxon_ID) %>%
    rename('genomeID' = taxon_ID)
              
write.table(taxa, F, quote=FALSE, row.names=FALSE, col.names=TRUE)
cat('File written:', F, '\n')


File written: /home/nick/notebook/SIPSim/dev/fullCyc_trim//ampFrags_kde_amplified.txt 

Loading data

Pre-fractionation samples


In [17]:
%%R -i physeqDir -i physeq_bulkCore

physeq.file = file.path(physeqDir, physeq_bulkCore)
physeq.bulk = readRDS(physeq.file)

physeq.bulk.m = physeq.bulk %>% sample_data
physeq.bulk = prune_samples(physeq.bulk.m$Exp_type == 'microcosm_bulk' &
                            physeq.bulk.m$Substrate == '12C-Con', physeq.bulk)

physeq.bulk.m = physeq.bulk %>% sample_data
physeq.bulk


phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 4950 taxa and 6 samples ]
sample_data() Sample Data:       [ 6 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 4950 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 4950 tips and 4949 internal nodes ]

Fraction samples


In [18]:
%%R -i physeqDir -i physeq_SIP_core 

# bulk core samples
F = file.path(physeqDir, physeq_SIP_core)
physeq.SIP.core = readRDS(F) 
physeq.SIP.core.m = physeq.SIP.core %>% sample_data

physeq.SIP.core = prune_samples(physeq.SIP.core.m$Substrate == '12C-Con', 
                                physeq.SIP.core) %>%
    filter_taxa(function(x) sum(x) > 0, TRUE)
physeq.SIP.core.m = physeq.SIP.core %>% sample_data        

physeq.SIP.core


phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 9974 taxa and 142 samples ]
sample_data() Sample Data:       [ 142 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 9974 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 9974 tips and 9973 internal nodes ]

Selecting taxa from pre-frac sample

  • Just N-most abundant taxa
    • N = number of genomes in simulations

In [19]:
%%R
df.bulk.otu = physeq.bulk %>% 
    otu_table %>% 
    as.data.frame 

df.bulk.otu$OTU = rownames(df.bulk.otu)

df.bulk.otu = df.bulk.otu %>%
    gather(sample, count, ends_with('_bulk')) %>%
    group_by(sample) %>%
    mutate(rank_abund = row_number(-count)) %>%
    ungroup() %>%
    filter(rank_abund <= ntaxa) %>%
    arrange(rank_abund) 
    
df.bulk.otu %>% nrow %>% print
df.bulk.otu$OTU %>% unique %>% length %>% print
df.bulk.otu %>% head(n=3)


[1] 6612
[1] 2611
Source: local data frame [3 x 4]

        OTU              sample count rank_abund
      (chr)               (chr) (dbl)      (int)
1     OTU.1  12C-Con.D3.R3_bulk   976          1
2     OTU.1  12C-Con.D6.R2_bulk   779          1
3 OTU.14142 12C-Con.D30.R1_bulk   234          1

In [20]:
%%R
# filtering taxa from fraction samples & performing total-sum scaling (closure operation)

bulk.samples = df.bulk.otu$sample %>% unique

physeq.SIP.l = list()
for (bs in bulk.samples){
    df.otus = df.bulk.otu %>%
        filter(sample == bs)
    day = gsub('.+\\.D([0-9]+)\\.R.+', '\\1', bs) %>% as.numeric
    physeq.SIP.l[[bs]] = prune_samples(physeq.SIP.core.m$Day == day, physeq.SIP.core)
    physeq.SIP.l[[bs]] = prune_taxa(df.otus$OTU,  physeq.SIP.l[[bs]]) %>%
                        transform_sample_counts(function(x) x/sum(x))
                
}

physeq.SIP.l


$`12C-Con.D3.R3_bulk`
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 1097 taxa and 23 samples ]
sample_data() Sample Data:       [ 23 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 1097 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 1097 tips and 1096 internal nodes ]

$`12C-Con.D6.R2_bulk`
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 1096 taxa and 23 samples ]
sample_data() Sample Data:       [ 23 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 1096 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 1096 tips and 1095 internal nodes ]

$`12C-Con.D30.R1_bulk`
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 1100 taxa and 24 samples ]
sample_data() Sample Data:       [ 24 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 1100 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 1100 tips and 1099 internal nodes ]

$`12C-Con.D1.R2_bulk`
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 1099 taxa and 25 samples ]
sample_data() Sample Data:       [ 25 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 1099 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 1099 tips and 1098 internal nodes ]

$`12C-Con.D48.R3_bulk`
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 1099 taxa and 24 samples ]
sample_data() Sample Data:       [ 24 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 1099 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 1099 tips and 1098 internal nodes ]

$`12C-Con.D14.R1_bulk`
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 1100 taxa and 23 samples ]
sample_data() Sample Data:       [ 23 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 1100 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 1100 tips and 1099 internal nodes ]

Saving phyloseq object


In [12]:
%%R -i workDir
outFile = 'SIP-core_unk_trm'
outFile = file.path(workDir, outFile)

saveRDS(physeq.SIP.l, outFile)
cat('File written:', outFile, '\n')


File written: /home/nick/notebook/SIPSim/dev/fullCyc_trim//SIP-core_unk_trm 

Making a pruned pre-fractionation phyloseq file


In [21]:
%%R
# filtering taxa from fraction samples & performing total-sum scaling (closure operation)

bulk.samples = df.bulk.otu$sample %>% unique

physeq.bulk.l = list()
for (bs in bulk.samples){
    df.otus = df.bulk.otu %>%
        filter(sample == bs)
    day = gsub('.+\\.D([0-9]+)\\.R.+', '\\1', bs) %>% as.numeric
    physeq.bulk.l[[bs]] = prune_samples(physeq.bulk.m$Day == day, physeq.bulk)
    physeq.bulk.l[[bs]] = prune_taxa(df.otus$OTU,  physeq.bulk.l[[bs]]) %>%
                        transform_sample_counts(function(x) x/sum(x))
                
}

physeq.bulk.l


$`12C-Con.D3.R3_bulk`
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 1102 taxa and 1 samples ]
sample_data() Sample Data:       [ 1 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 1102 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 1102 tips and 1101 internal nodes ]

$`12C-Con.D6.R2_bulk`
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 1102 taxa and 1 samples ]
sample_data() Sample Data:       [ 1 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 1102 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 1102 tips and 1101 internal nodes ]

$`12C-Con.D30.R1_bulk`
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 1102 taxa and 1 samples ]
sample_data() Sample Data:       [ 1 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 1102 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 1102 tips and 1101 internal nodes ]

$`12C-Con.D1.R2_bulk`
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 1102 taxa and 1 samples ]
sample_data() Sample Data:       [ 1 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 1102 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 1102 tips and 1101 internal nodes ]

$`12C-Con.D48.R3_bulk`
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 1102 taxa and 1 samples ]
sample_data() Sample Data:       [ 1 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 1102 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 1102 tips and 1101 internal nodes ]

$`12C-Con.D14.R1_bulk`
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 1102 taxa and 1 samples ]
sample_data() Sample Data:       [ 1 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 1102 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 1102 tips and 1101 internal nodes ]


In [14]:
%%R -i workDir
outFile = 'bulk-core_trm'
outFile = file.path(workDir, outFile)

saveRDS(physeq.bulk.l, outFile)
cat('File written:', outFile, '\n')


File written: /home/nick/notebook/SIPSim/dev/fullCyc_trim//bulk-core_trm 

Converting to long OTU table (reload)

  • Just abundant OTUs in bulk soil samples (above rank-order cutoff) will have an abundance value; others get abundance of 0

SIP data


In [22]:
%%R -i workDir
F = file.path(workDir, 'SIP-core_unk_trm')
physeq.SIP.l = readRDS(F)
physeq.SIP.l %>% names


[1] "12C-Con.D3.R3_bulk"  "12C-Con.D6.R2_bulk"  "12C-Con.D30.R1_bulk"
[4] "12C-Con.D1.R2_bulk"  "12C-Con.D48.R3_bulk" "12C-Con.D14.R1_bulk"

In [23]:
%%R

otu2df = function(x){
    x %>% otu_table %>% as.data.frame
    }

otu.l = lapply(physeq.SIP.l, otu2df) 

samps = names(otu.l)
df.SIP.otu = otu.l[[samps[1]]]
df.SIP.otu$OTU = rownames(df.SIP.otu)
for (x in samps[2:length(samps)]){
    y = otu.l[[x]] 
    y$OTU = rownames(y)
    df.SIP.otu = left_join(df.SIP.otu, y, c('OTU' = 'OTU'))
}

otu.l = NULL
df.SIP.otu[is.na(df.SIP.otu)] = 0
df.SIP.otu[1:5, 1:5]


  12C-Con.D3.R3_F20 12C-Con.D3.R3_F18 12C-Con.D3.R3_F11 12C-Con.D3.R3_F22
1      0.0000000000      0.0008324084      0.0000000000                 0
2      0.0000000000      0.0000000000      0.0000000000                 0
3      0.0000000000      0.0000000000      0.0001549427                 0
4      0.0000000000      0.0000000000      0.0000000000                 0
5      0.0002917153      0.0000000000      0.0001549427                 0
  12C-Con.D3.R3_F8
1                0
2                0
3                0
4                0
5                0

In [24]:
%%R 
# convert physeq object to a dataframe
df.EMP = df.SIP.otu %>%    
    gather(sample, abundance, starts_with('12C-Con')) 

df.EMP = inner_join(df.EMP, physeq.SIP.core %>% sample_data, c('sample' = 'X.Sample')) %>%
    mutate(Day = Day %>% as.character)

df.EMP$Day = reorder(df.EMP$Day, df.EMP$Day %>% as.numeric)

df.EMP$Day %>% unique %>% print
df.EMP %>% head(n=3)


[1] 3  6  30 1  48 14
Levels: 1 3 6 14 30 48
       OTU            sample abundance primer_number fwd_barcode rev_barcode
1 OTU.1130 12C-Con.D3.R3_F20         0            86    CGTGAGTG    GTCTATGA
2 OTU.9833 12C-Con.D3.R3_F20         0            86    CGTGAGTG    GTCTATGA
3 OTU.1472 12C-Con.D3.R3_F20         0            86    CGTGAGTG    GTCTATGA
  Substrate Day Microcosm_replicate Fraction Buoyant_density Sample_type
1   12C-Con   3                   3       20        1.701269     unknown
2   12C-Con   3                   3       20        1.701269     unknown
3   12C-Con   3                   3       20        1.701269     unknown
         library Exp_type Sample_location Sample_date Sample_treatment
1 150727_V4_Lib5      SIP              NA          NA               NA
2 150727_V4_Lib5      SIP              NA          NA               NA
3 150727_V4_Lib5      SIP              NA          NA               NA
  Sample_subtreatment core_dataset
1                  NA         TRUE
2                  NA         TRUE
3                  NA         TRUE

Bulk soil data


In [25]:
%%R -i workDir
F = file.path(workDir, 'bulk-core_trm')
physeq.bulk.l = readRDS(F)
physeq.bulk.l %>% names


[1] "12C-Con.D3.R3_bulk"  "12C-Con.D6.R2_bulk"  "12C-Con.D30.R1_bulk"
[4] "12C-Con.D1.R2_bulk"  "12C-Con.D48.R3_bulk" "12C-Con.D14.R1_bulk"

In [26]:
%%R

otu2df = function(x){
    x %>% otu_table %>% as.data.frame
    }

otu.l = lapply(physeq.bulk.l, otu2df) 

samps = names(otu.l)
df.bulk.otu = otu.l[[samps[1]]]
df.bulk.otu$OTU = rownames(df.bulk.otu)
for (x in samps[2:length(samps)]){
    y = otu.l[[x]] 
    y$OTU = rownames(y)
    df.bulk.otu = left_join(df.bulk.otu, y, c('OTU' = 'OTU'))
}

otu.l = NULL
df.bulk.otu[is.na(df.bulk.otu)] = 0
df.bulk.otu = df.bulk.otu %>%    
    gather(sample, abundance, starts_with('12C-Con')) %>%
    mutate(Day = gsub('.+\\.D([0-9]+)\\.R.+', '\\1', sample)) %>%
    rename('preFrac_abund' = abundance,
           'preFrac_sample' = sample)

df.bulk.otu %>% head(n=3)


       OTU     preFrac_sample preFrac_abund Day
1 OTU.1130 12C-Con.D3.R3_bulk  9.923588e-05   3
2 OTU.9833 12C-Con.D3.R3_bulk  9.923588e-05   3
3 OTU.1472 12C-Con.D3.R3_bulk  9.923588e-05   3

BD range


In [27]:
%%R
# joining SIP & bulk
df.EMP.j = inner_join(df.EMP, df.bulk.otu, c('OTU' = 'OTU',
                                               'Day' = 'Day'))
df.EMP.j %>% head(n=3) %>% as.data.frame


       OTU            sample abundance primer_number fwd_barcode rev_barcode
1 OTU.1130 12C-Con.D3.R3_F20         0            86    CGTGAGTG    GTCTATGA
2 OTU.9833 12C-Con.D3.R3_F20         0            86    CGTGAGTG    GTCTATGA
3 OTU.1472 12C-Con.D3.R3_F20         0            86    CGTGAGTG    GTCTATGA
  Substrate Day Microcosm_replicate Fraction Buoyant_density Sample_type
1   12C-Con   3                   3       20        1.701269     unknown
2   12C-Con   3                   3       20        1.701269     unknown
3   12C-Con   3                   3       20        1.701269     unknown
         library Exp_type Sample_location Sample_date Sample_treatment
1 150727_V4_Lib5      SIP              NA          NA               NA
2 150727_V4_Lib5      SIP              NA          NA               NA
3 150727_V4_Lib5      SIP              NA          NA               NA
  Sample_subtreatment core_dataset     preFrac_sample preFrac_abund
1                  NA         TRUE 12C-Con.D3.R3_bulk  9.923588e-05
2                  NA         TRUE 12C-Con.D3.R3_bulk  9.923588e-05
3                  NA         TRUE 12C-Con.D3.R3_bulk  9.923588e-05

In [28]:
%%R

#max_BD_range = max(df.EMP.j$Buoyant_density) - min(df.EMP.j$Buoyant_density)

df.EMP.j.f = df.EMP.j %>%
    filter(abundance > 0) %>%
    group_by(Day) %>%
    mutate(max_BD_range = max(Buoyant_density) - min(Buoyant_density)) %>%
    ungroup() %>%
    group_by(OTU, Day) %>%
    summarize(mean_preFrac_abund = mean(preFrac_abund),
              min_BD = min(Buoyant_density),
              max_BD = max(Buoyant_density),
              BD_range = max_BD - min_BD,
              BD_range_perc = BD_range / first(max_BD_range) * 100) %>%
    ungroup() 

df.EMP.j.f %>% head


Source: local data frame [6 x 7]

    OTU   Day mean_preFrac_abund   min_BD   max_BD   BD_range BD_range_perc
  (chr) (chr)              (dbl)    (dbl)    (dbl)      (dbl)         (dbl)
1 OTU.1     1         0.03021430 1.676135 1.773391 0.09725564           100
2 OTU.1    14         0.03381978 1.677228 1.771206 0.09397736           100
3 OTU.1     3         0.09685422 1.675043 1.767927 0.09288460           100
4 OTU.1    30         0.02016188 1.675043 1.766835 0.09179184           100
5 OTU.1    48         0.01324705 1.675043 1.769020 0.09397736           100
6 OTU.1     6         0.07431079 1.676135 1.769020 0.09288460           100

In [29]:
%%R -h 400 -w 800

df.EMP.j.f$Day = reorder(df.EMP.j.f$Day, df.EMP.j.f$Day %>% as.numeric)

## plotting
ggplot(df.EMP.j.f, aes(mean_preFrac_abund, BD_range_perc, color=Day)) +
    geom_point(alpha=0.5, shape='O') +
    scale_x_log10() +
    scale_y_continuous() +
    facet_wrap(~ Day) +
    labs(x='Pre-fractionation abundance', y='% of total BD range') +
    theme_bw() +
    theme(
        text = element_text(size=16),
        panel.grid = element_blank()#,
        #legend.position = 'none'
        )



In [55]:
%%R -h 400 -w 750

df.EMP.j.f$Day = reorder(df.EMP.j.f$Day, df.EMP.j.f$Day %>% as.numeric)

## plotting
p.span.hex = ggplot(df.EMP.j.f, aes(mean_preFrac_abund, BD_range_perc)) +
    geom_hex() +
    scale_x_log10() +
    scale_y_continuous(limits=c(0,105)) +
    scale_fill_gradient(low='grey95', high='black') +
    facet_wrap(~ Day, ncol=3) +
    labs(x='Pre-fractionation abundance', y='% of total BD range') +
    theme_bw() +
    theme(
        text = element_text(size=16),
        panel.grid = element_blank(),
        legend.position = 'none'
        )
p.span.hex



In [32]:
%%R -h 350 -w 550
## plotting
ggplot(df.EMP.j.f, aes(mean_preFrac_abund, BD_range_perc)) +
    geom_hex() +
    scale_x_log10() +
    scale_y_continuous(limits=c(0,105)) +
    scale_fill_gradient(low='grey95', high='black') +
    labs(x='Pre-fractionation abundance', y='% of total BD range') +
    theme_bw() +
    theme(
        text = element_text(size=16),
        panel.grid = element_blank(),
        legend.position = 'none'
        )


Assessing diversity

Asigning zeros


In [33]:
%%R
# giving value to missing abundances
min.pos.val = df.EMP %>%
    filter(abundance > 0) %>%
    group_by() %>%
    mutate(min_abund = min(abundance)) %>%
    ungroup() %>%
    filter(abundance == min_abund)

min.pos.val = min.pos.val[1,'abundance'] %>% as.numeric
imp.val = min.pos.val / 10


# convert numbers
#df.EMP[df.EMP$abundance == 0, 'abundance'] = imp.val

# another closure operation
df.EMP = df.EMP %>%
    group_by(sample) %>%
    mutate(abundance = abundance / sum(abundance),
           Day = Day %>% as.character)

df.EMP$Day = reorder(df.EMP$Day, df.EMP$Day %>% as.numeric)

# status
cat('Below detection level abundances converted to: ', imp.val, '\n')


Below detection level abundances converted to:  4.037957e-06 

Shannon diversity


In [34]:
%%R
# shannon index
shannon_index_long = function(df, abundance_col, ...){
    # calculating shannon diversity index from a 'long' formated table
    ## community_col = name of column defining communities
    ## abundance_col = name of column defining taxon abundances
    ## ... = group_by columns
    df = df %>% as.data.frame
    cmd = paste0(abundance_col, '/sum(', abundance_col, ')')
    df.s = df %>%
        group_by_(...) %>%
        mutate_(REL_abundance = cmd) %>%
        mutate(pi__ln_pi = REL_abundance * log(REL_abundance),
               shannon = -sum(pi__ln_pi, na.rm=TRUE)) %>%
        ungroup() %>% 
        dplyr::select(-REL_abundance, -pi__ln_pi) %>%
        distinct_(...) 
    return(df.s)
}

In [35]:
%%R
# calulcating 
df.EMP.shan = shannon_index_long(df.EMP, 'abundance', 'sample') %>%
    filter(Buoyant_density >= min_BD, 
           Buoyant_density <= max_BD)
df.EMP.shan %>% head(n=3) %>% as.data.frame


       OTU            sample    abundance primer_number fwd_barcode rev_barcode
1 OTU.1130 12C-Con.D3.R3_F20 0.0000000000            86    CGTGAGTG    GTCTATGA
2 OTU.1130 12C-Con.D3.R3_F18 0.0008324084            84    CTGCGTGT    GTCTATGA
3 OTU.1130 12C-Con.D3.R3_F11 0.0000000000            77    TCATCGAG    GTCTGCTA
  Substrate Day Microcosm_replicate Fraction Buoyant_density Sample_type
1   12C-Con   3                   3       20        1.701269     unknown
2   12C-Con   3                   3       18        1.710011     unknown
3   12C-Con   3                   3       11        1.737330     unknown
         library Exp_type Sample_location Sample_date Sample_treatment
1 150727_V4_Lib5      SIP              NA          NA               NA
2 150727_V4_Lib5      SIP              NA          NA               NA
3 150727_V4_Lib5      SIP              NA          NA               NA
  Sample_subtreatment core_dataset  shannon
1                  NA         TRUE 5.209807
2                  NA         TRUE 5.420974
3                  NA         TRUE 4.759772

In [39]:
%%R -w 800 -h 300

x.lab = expression(paste('Buoyant density (g ml' ^ '-1', ')'))

# plotting
p.shan = ggplot(df.EMP.shan, aes(Buoyant_density, shannon, color=Day, group=Day)) +
    geom_point() +
    geom_line(alpha=0.3) +
    labs(x=x.lab, y='Shannon index') +
    theme_bw() +
    theme( 
        text = element_text(size=16)
    )
p.shan



In [40]:
%%R -w 800 -h 300

# summary plot
df.EMP.shan.s = df.EMP.shan %>%
    group_by(BD_bin = ntile(Buoyant_density, 24)) %>%
    summarize(mean_BD = mean(Buoyant_density), 
              mean_shannon = mean(shannon),
              sd_shannon = sd(shannon))

# plotting
p = ggplot(df.EMP.shan.s, aes(mean_BD, mean_shannon,
                              ymin=mean_shannon-sd_shannon, 
                              ymax=mean_shannon+sd_shannon)) +
    geom_pointrange() +
    labs(x='Buoyant density (binned; 24 bins)', y='Mean Shannon index (+/- stdev)') +
    theme_bw() +
    theme( 
        text = element_text(size=16)
    )
p


Plotting variance


In [41]:
%%R -w 800 -h 350
df.EMP.var = df.EMP %>%
    group_by(Day, sample) %>%
    mutate(variance = var(abundance)) %>%
    ungroup() %>%
    distinct(Day, sample) %>%
    select(Day, sample, variance, Buoyant_density)

ggplot(df.EMP.var, aes(Buoyant_density, variance, color=Day)) +
    geom_point() +
    geom_line() +
    theme_bw() +
    theme(
        text = element_text(size=16)
    )



In [42]:
%%R -w 800 -h 350
df.EMP.var.s = df.EMP.var %>%
    group_by(BD_bin = ntile(Buoyant_density, 24)) %>%
    summarize(mean_BD = mean(Buoyant_density),
              mean_var = mean(variance),
              sd_var = sd(variance))

ggplot(df.EMP.var.s, aes(mean_BD, mean_var,
                        ymin=mean_var-sd_var,
                        ymax=mean_var+sd_var)) +
    geom_pointrange() +
    labs(x='Buoyant density (binned; 24 bins)', y='Mean variance (+/- stdev)') +
    theme_bw() +
    theme(
        text = element_text(size=16)
    )


Correlograms

delta BDs


In [43]:
%%R
BD.diffs = function(BDs){
    df.BD = expand.grid(BDs, BDs)
    df.BD$diff = df.BD %>% apply(1, diff) %>% abs %>% as.vector    
    df.BD = df.BD %>%
        spread(Var1, diff) 
    rownames(df.BD) = df.BD$Var2
    df.BD$Var2 = NULL
    dist.BD = df.BD %>% as.matrix
    dist.BD[upper.tri(dist.BD, diag=TRUE)] = 0
    dist.BD %>% as.dist
    }


days = df.EMP$Day %>% unique
BDs.l = list()
for(d in days){
    tmp = df.EMP %>% filter(Day == d) 
    BDs.l[[d]] = tmp$Buoyant_density %>% unique
}

BD.dist.l = lapply(BDs.l, BD.diffs)
BD.dist.l %>% names


[1] "3"  "6"  "30" "1"  "48" "14"

Bray-curtis ~ BD_diff


In [44]:
%%R

vegdist.by = function(day, df.EMP, ...){
    df.EMP.w = df.EMP %>%
        filter(Day == day) %>%
        select(OTU, sample, abundance) %>%
        spread(OTU, abundance) %>%
        as.data.frame()
    
    rownames(df.EMP.w) = df.EMP.w$sample
    df.EMP.w$sample = NULL
    
    vegan::vegdist(df.EMP.w, ...)
}


days = df.EMP$Day %>% unique %>% sort

bray.l = list()
for (d in days){
    bray.l[[d]] = vegdist.by(d, df.EMP, method='bray')    
}
bray.l %>% names


[1] "1"  "3"  "6"  "14" "30" "48"

In [45]:
%%R -w 800 -h 650

cal.corr = function(d, BDs, nclass=24, ...){
    #d = vegdist(df, ...)
    d[is.nan(d)] = 0
    stopifnot((d %>% length) == (BDs %>% length))
    
    mantel.correlog(d, BDs, n.class=nclass)
}

days = df.EMP$Day %>% unique %>% sort
corr.l = list()
for(d in days){
    corr.l[[d]] = cal.corr(bray.l[[d]], BD.dist.l[[d]])
}

par(mfrow=c(3,3))
for (n in names(corr.l)){
    plot(corr.l[[n]])
    title(n)
}



In [46]:
%%R
# making df of mantel values
corr.l.mantel = list()
for (n in corr.l %>% names){
    corr.l.mantel[[n]] = corr.l[[n]]['mantel.res'][[1]] %>% as.data.frame
    corr.l.mantel[[n]]$Day = n
    colnames(corr.l.mantel[[n]]) = c('class.index', 'n.dist', 'Mantel.corr', 'Pr', 'Pr.corr', 'Day')
}
df.corr = do.call(rbind, corr.l.mantel) %>% as.data.frame
df.corr %>% head


         class.index n.dist Mantel.corr    Pr Pr.corr Day
1.D.cl.1 0.002561156     44  0.29395823 0.001   0.001   1
1.D.cl.2 0.006590709     28  0.08282747 0.108   0.108   1
1.D.cl.3 0.010620261     42  0.10761009 0.019   0.038   1
1.D.cl.4 0.014649814     52 -0.03945088 0.203   0.216   1
1.D.cl.5 0.018679366     38 -0.10945803 0.024   0.076   1
1.D.cl.6 0.022708919     36 -0.11564685 0.015   0.075   1

In [49]:
%%R -w 700 -h 250

df.corr.e = df.corr %>%
    filter(! is.na(Mantel.corr)) #%>%
#    mutate(Pr.corr = ifelse(is.na(Pr.corr), 1, Pr.corr) %>% as.numeric,
#           Pr.corr.bool = ifelse(Pr.corr < 0.05, '<0.05', '>0.05'))

df.corr.e$Day = reorder(df.corr.e$Day, df.corr.e$Day %>% as.numeric)

p.corr = ggplot(df.corr.e, aes(class.index, Mantel.corr, color=Day)) +
    geom_point(size=2) +
    geom_line(alpha=0.5) +
    labs(x='Class index', y='Mantel correlation') +
    theme_bw() +
    theme(
        text = element_text(size=16)
    )
p.corr



In [48]:
%%R -w 650 -h 300
# summarizing
df.corr.s = df.corr %>%
    filter(! is.na(Mantel.corr)) %>%
    group_by(bin = ntile(class.index, 12)) %>%
    summarize(n = n(),
              mean.class.index = mean(class.index),
              mean.Mantel.corr = mean(Mantel.corr, na.rm=TRUE),
              sd.Mantel.corr = sd(Mantel.corr, na.rm=TRUE),
              max.Pr.corr = max(Pr.corr, na.rm=TRUE)) %>%
    ungroup() %>%
    mutate(significant = ifelse(max.Pr.corr <= 0.05, TRUE, FALSE))


# plotting
ggplot(df.corr.s, aes(mean.class.index, mean.Mantel.corr, color=significant,
                      ymin=mean.Mantel.corr-sd.Mantel.corr,
                      ymax=mean.Mantel.corr+sd.Mantel.corr)) +
    geom_pointrange() +
    scale_color_discrete('P_corr < 0.05') +
    labs(x='Class index (binned; 12 bins)', y='Mean Mantel correlation\n(+/- stdev)') +
    theme_bw() +
    theme( 
        text = element_text(size=16)
    )


Combined plot


In [68]:
%%R -w 600 -h 700
p.comb = cowplot::ggdraw() +
    geom_rect(aes(xmin=0, ymin=0, xmax=1, ymax=1), fill='white') +
    cowplot::draw_plot(p.shan, 0.04, 0.75, 0.96, 0.25) +
    cowplot::draw_plot(p.corr, 0.04, 0.50, 0.96, 0.25) +
    cowplot::draw_plot(p.span.hex, 0.04, 0.0, 0.93, 0.50) +
    cowplot::draw_plot_label(c('A)', 'B)', 'C)'), c(0, 0, 0), c(1, 0.75, 0.5))
p.comb



In [69]:
%%R -i workDir
# saving plot
F = file.path(workDir, 'emp-data_summary.pdf') 
ggsave(F, p.comb, width=8.5, height=10)
cat('File written:', F, '\n')


File written: /home/nick/notebook/SIPSim/dev/fullCyc_trim//emp-data_summary.pdf 

Binary ~ BD_diff


In [116]:
%%R
binary.l = list()
for (d in days){
    binary.l[[d]] = vegdist.by(d, df.EMP, binary=TRUE)    
}
binary.l %>% names


[1] "1"  "3"  "6"  "14" "30" "48"

In [117]:
%%R -w 800 -h 650

days = df.EMP$Day %>% unique %>% sort
corr.l = list()
for(d in days){
    corr.l[[d]] = cal.corr(binary.l[[d]], BD.dist.l[[d]])
}

par(mfrow=c(3,3))
for (n in names(corr.l)){
    plot(corr.l[[n]])
    title(n)
}



In [118]:
%%R
# making df of mantel values
corr.l.mantel = list()
for (n in corr.l %>% names){
    corr.l.mantel[[n]] = corr.l[[n]]['mantel.res'][[1]] %>% as.data.frame
    corr.l.mantel[[n]]$Day = n
    colnames(corr.l.mantel[[n]]) = c('class.index', 'n.dist', 'Mantel.corr', 'Pr', 'Pr.corr', 'Day')
}
df.corr = do.call(rbind, corr.l.mantel) %>% as.data.frame
df.corr %>% head


         class.index n.dist Mantel.corr    Pr Pr.corr Day
1.D.cl.1 0.002561156     44  0.10364549 0.003   0.003   1
1.D.cl.2 0.006590709     28  0.03222861 0.319   0.319   1
1.D.cl.3 0.010620261     42  0.03690521 0.212   0.424   1
1.D.cl.4 0.014649814     52 -0.02904201 0.227   0.636   1
1.D.cl.5 0.018679366     38 -0.08017387 0.054   0.216   1
1.D.cl.6 0.022708919     36 -0.11775061 0.014   0.070   1

In [119]:
%%R -w 650 -h 300
# summarizing
df.corr.s = df.corr %>%
    filter(! is.na(Mantel.corr)) %>%
    group_by(bin = ntile(class.index, 12)) %>%
    summarize(n = n(),
              mean.class.index = mean(class.index),
              mean.Mantel.corr = mean(Mantel.corr, na.rm=TRUE),
              sd.Mantel.corr = sd(Mantel.corr, na.rm=TRUE),
              max.Pr.corr = max(Pr.corr, na.rm=TRUE)) %>%
    ungroup() %>%
    mutate(significant = ifelse(max.Pr.corr <= 0.05, TRUE, FALSE))


# plotting
ggplot(df.corr.s, aes(mean.class.index, mean.Mantel.corr, color=significant,
                      ymin=mean.Mantel.corr-sd.Mantel.corr,
                      ymax=mean.Mantel.corr+sd.Mantel.corr)) +
    geom_pointrange() +
    scale_color_discrete('P_corr < 0.05') +
    labs(x='Class index (binned; 12 bins)', y='Mean Mantel correlation\n(+/- stdev)') +
    theme_bw() +
    theme( 
        text = element_text(size=16)
    )


Plotting rank-abundance of top taxa for each day


In [161]:
%%R -w 800

df.EMP.f = df.EMP %>%
    filter(Buoyant_density > 1.72) %>%
    group_by(Day, sample) %>%
    mutate(rank = row_number(-abundance)) %>% 
    ungroup() %>%
    filter(rank <= 5) 

ggplot(df.EMP.f, aes(Buoyant_density, abundance, fill=OTU, color=OTU)) +
    geom_line(alpha=0.5, size=1.2) +
    labs(x='Buoyant density', y='Subsampled community\n(absolute abundance)', 
         title='Abundant taxa in the heavy fractions') +
    facet_grid(Day ~ .) +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        legend.position = 'none',
        axis.title.y = element_text(vjust=1),        
        axis.title.x = element_blank()
        )



In [162]:
%%R -w 800

df.EMP.f2 = df.EMP %>%
    filter(OTU %in% df.EMP.f$OTU)

ggplot(df.EMP.f2, aes(Buoyant_density, abundance, fill=OTU, color=OTU)) +
    geom_line(alpha=0.5, size=1.2) +
    labs(x='Buoyant density', y='Subsampled community\n(absolute abundance)',
                 title='Abundant taxa in the heavy fractions') +
    facet_grid(Day ~ .) +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        legend.position = 'none',
        axis.title.y = element_text(vjust=1),        
        axis.title.x = element_blank()
        )



In [163]:
%%R -w 800


ggplot(df.EMP, aes(Buoyant_density, abundance, fill=OTU, color=OTU)) +
    geom_line(alpha=0.5, size=1.2) +
    labs(x='Buoyant density', y='Subsampled community\n(absolute abundance)',
                 title='All taxa in the trimmed 12C-Con fullCyc dataset') +
    facet_grid(Day ~ .) +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        legend.position = 'none',
        axis.title.y = element_text(vjust=1),        
        axis.title.x = element_blank()
        )