Goal

Exploring potential methods of comparing HR-SIP gradient data (compositional data) that reflects differences in peak height/width/location in eudlidean space (absolute abundances)

Init



In [88]:

    
%load_ext rpy2.ipython









    



The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython



In [89]:

    
%%R
library(dplyr)
library(tidyr)
library(ggplot2)
library(compositions)
library(coenocliner)
library(vegan)
library(gridExtra)

Creating test communities

varying parameters:
- peak locations (species optima)
- peak widths (species tolerances)
- peak heights (max abundances)

Varying peak locations



In [90]:

    
%%R 
set.seed(2)
M = 1000                                      # number of species
ming = 1.67323                                # gradient minimum...
maxg = 1.76                                # ...and maximum
#meang = mean(c(ming, maxg))
meang = 1.71
locs = seq(ming, maxg, length = 30)        # gradient locations
opt  = rnorm(M, mean=meang, sd=0.005)          # runif(M, min = ming, max = maxg)   # species optima
tol  = rep(0.005, M)                       # species tolerances
h    = ceiling(rlnorm(M, meanlog = 11))    # max abundances
pars = cbind(opt = opt, tol = tol, h = h)  # put in a matrix



In [91]:

    
%%R -w 600 -h 350
opt1  = rnorm(M, mean=meang, sd=0.001)
pars = cbind(opt = opt1, tol = tol, h = h)  # put in a matrix

PL1 <- coenocline(locs, responseModel = "gaussian", params = pars,
                    countModel = "poisson") 

# community total abundance
apply(PL1 %>% as.matrix, 1, sum) %>% sum %>% print

matplot(locs, PL1, lty = "solid", type = "l", xlab = "BD", ylab = "Abundance")









    





[1] 420828095



In [92]:

    
%%R -w 600 -h 350
opt1  = rnorm(M, mean=meang, sd=0.005)
pars = cbind(opt = opt1, tol = tol, h = h)  # put in a matrix

PL2 <- coenocline(locs, responseModel = "gaussian", params = pars,
                    countModel = "poisson") 


matplot(locs, PL2, lty = "solid", type = "l", xlab = "BD", ylab = "Abundance")



In [93]:

    
%%R -w 600 -h 350
opt1  = rnorm(M, mean=meang, sd=0.01)
pars = cbind(opt = opt1, tol = tol, h = h)  # put in a matrix

PL3 <- coenocline(locs, responseModel = "gaussian", params = pars,
                    countModel = "poisson") 


matplot(locs, PL3, lty = "solid", type = "l", xlab = "BD", ylab = "Abundance")

Varying peak widths



In [94]:

    
%%R -w 600 -h 350
tol1  = rep(0.0001, M)                       # species tolerances
pars = cbind(opt = opt, tol = tol1, h = h)  # put in a matrix

PW1 <- coenocline(locs, responseModel = "gaussian", params = pars,
                    countModel = "poisson") 


matplot(locs, PW1, lty = "solid", type = "l", xlab = "BD", ylab = "Abundance")



In [95]:

    
%%R -w 600 -h 350
tol1  = rep(0.005, M)                       # species tolerances
pars = cbind(opt = opt, tol = tol1, h = h)  # put in a matrix

PW2 <- coenocline(locs, responseModel = "gaussian", params = pars,
                    countModel = "poisson") 


matplot(locs, PW2, lty = "solid", type = "l", xlab = "BD", ylab = "Abundance")



In [96]:

    
%%R -w 600 -h 350
tol1  = rep(0.01, M)                       # species tolerances
pars = cbind(opt = opt, tol = tol1, h = h)  # put in a matrix

PW3 <- coenocline(locs, responseModel = "gaussian", params = pars,
                    countModel = "poisson") 

matplot(locs, PW3, lty = "solid", type = "l", xlab = "BD", ylab = "Abundance")

Varying peak heights



In [97]:

    
%%R -w 600 -h 350
h1    = ceiling(rlnorm(M, meanlog = 9))    # max abundances
pars = cbind(opt = opt, tol = tol, h = h1)  # put in a matrix

PH1 <- coenocline(locs, responseModel = "gaussian", params = pars,
                    countModel = "poisson") 

matplot(locs, PH1, lty = "solid", type = "l", xlab = "BD", ylab = "Abundance")



In [98]:

    
%%R -w 600 -h 350
h1    = ceiling(rlnorm(M, meanlog = 11))    # max abundances
pars = cbind(opt = opt, tol = tol, h = h1)  # put in a matrix

PH2 <- coenocline(locs, responseModel = "gaussian", params = pars,
                    countModel = "poisson") 

matplot(locs, PH2, lty = "solid", type = "l", xlab = "BD", ylab = "Abundance")



In [99]:

    
%%R -w 600 -h 350
h1    = ceiling(rlnorm(M, meanlog = 13))    # max abundances
pars = cbind(opt = opt, tol = tol, h = h1)  # put in a matrix

PH3 <- coenocline(locs, responseModel = "gaussian", params = pars,
                    countModel = "poisson") 

matplot(locs, PH3, lty = "solid", type = "l", xlab = "BD", ylab = "Abundance")



In [100]:

    
%%R
# making a list
comm.l = list(
"PL1" = PL1,
"PL2" = PL2,
"PL3" = PL3,
"PW1" = PW1,
"PW2" = PW2,
"PW3" = PW3,
"PH1" = PH1,
"PH2" = PH2,
"PH3" = PH3
)
comm.l %>% names









    





[1] "PL1" "PL2" "PL3" "PW1" "PW2" "PW3" "PH1" "PH2" "PH3"

Adding tail-end abundances



In [69]:

    
%%R

add.tail.abund = function(mtx){
    nzero = mtx[mtx == 0] %>% length
    mtx[mtx == 0] = round(runif(nzero, 0, 1), 0)
    return(mtx)
    }

comm.l = lapply(comm.l, add.tail.abund)
comm.l = lapply(comm.l, add.tail.abund)
comm.l[[1]][1:5, 1:5]









    





     [,1] [,2] [,3] [,4] [,5]
[1,]    1    1    1    0    1
[2,]    0    1    1    1    1
[3,]    1    1    0    0    1
[4,]    0    1    1    1    0
[5,]    3    1    0    1    1

Converting to relative abundances



In [127]:

    
%%R
to.rel = function(x){
    y = sum(x)
    if(y == 0){
        return(x)
    } else {
        return(x / y)
    }
}

comm.l.abs = lapply(comm.l, function(x) x %>% apply(1, function(x) x) %>% t %>% as.data.frame)

df = do.call(rbind, comm.l.abs) %>% as.data.frame
df$BD = rep(locs, comm.l.abs %>% names %>% length)
df$dataset = rownames(df) 

df = df %>%
    gather(OTU, rel_abund, starts_with('V')) %>%
    mutate(dataset = gsub('\\.[0-9]+$', '', dataset))    
    
df %>% head









    





        BD dataset OTU rel_abund
1 1.673230     PL1  V1         0
2 1.676222     PL1  V1         0
3 1.679214     PL1  V1         0
4 1.682206     PL1  V1         0
5 1.685198     PL1  V1         3
6 1.688190     PL1  V1        39



In [130]:

    
%%R -w 800 -h 600

comm.l.abs = lapply(comm.l, function(x) x %>% apply(1, function(x) x) %>% t %>% as.data.frame)

df = do.call(rbind, comm.l.abs) %>% as.data.frame
df$BD = rep(locs, comm.l.abs %>% names %>% length)
df$dataset = rownames(df)
df = df %>%
    gather(OTU, abs_abund, starts_with('V')) %>%
    mutate(dataset = gsub('\\.[0-9]+$', '', dataset))

p1 = ggplot(df, aes(BD, abs_abund, group=OTU)) +
    geom_line(alpha=0.5) +
    labs(x='Buoyant density', y='Abundance') +
    facet_wrap(~dataset) +
    theme_bw() +
    theme(
        text = element_text(size=16)
    )
p1



In [103]:

    
%%R -w 800 -h 600

to.rel = function(x){
    y = sum(x)
    if(y == 0){
        return(x)
    } else {
        return(x / y)
    }
}

comm.l.rel = lapply(comm.l, function(x) x %>% apply(1, to.rel) %>% t %>% as.data.frame)
        
df.rel = do.call(rbind, comm.l.rel) %>% as.data.frame
df.rel$BD = rep(locs, comm.l.rel %>% names %>% length)
df.rel$dataset = rownames(df.rel)
df.rel = df.rel %>%
    gather(OTU, rel_abund, starts_with('V')) %>%
    mutate(dataset = gsub('\\.[0-9]+$', '', dataset))

p1 = ggplot(df.rel, aes(BD, rel_abund, group=OTU)) +
    geom_line() +
    labs(x='Buoyant density', y='Relative abundance') +
    facet_wrap(~dataset) +
    theme_bw() +
    theme(
        text = element_text(size=16)
    )
p1

Comparing communities

Shannon diversity



In [106]:

    
%%R -w 800
df.shan = lapply(comm.l.rel, function(x) diversity(x))
df.shan = do.call(rbind, df.shan) %>% t %>% as.data.frame
df.shan$BD = locs
df.shan = df.shan %>%
    gather(dataset, shannon, starts_with('P'))

p1 = ggplot(df.shan, aes(BD, shannon)) +
    geom_line() +
    facet_wrap(~dataset) +
    labs(x='Buoyant density', 'Shannon index') +
    theme_bw() +
    theme(
        text = element_text(size=16)
    )
p1



In [119]:

    
%%R -w 500 -h 400
# pairwise correlations for each dataset
df.shan.bin = df.shan %>%
    group_by(BD_bin = ntile(BD, 24))

calc.spearman = function(x){
    cor(x[,'shannon.x'], x['shannon.y'], method='spearman')[1,1]
}

calc.pearson = function(x){
    cor(x[,'shannon.x'], x['shannon.y'], method='pearson')[1,1]
}

df.shan.corr = inner_join(df.shan.bin, df.shan.bin, c('BD_bin' = 'BD_bin')) %>%
    group_by(dataset.x, dataset.y) %>%
    nest() %>%
    mutate(model = purrr::map(data, calc.pearson)) %>%
    unnest(pearson = model %>% purrr::map(function(x) x)) %>%
    ungroup() %>%
    select(-data, -model) %>%
    mutate(pearson_txt = round(pearson, 2))
        
 # plotting
ggplot(df.shan.corr, aes(dataset.x, dataset.y, fill=pearson)) +
    geom_tile() +
    geom_text(aes(label=pearson_txt), color='white') +
    scale_fill_gradient('Pearson', low='black', high='red') +
    labs(title='Shannon index') +
    theme(
        text = element_text(size=16),
        axis.title.x = element_blank(),
        axis.title.y = element_blank()      
    )

Compositional variance



In [73]:

    
%%R

total.acomp.var = function(x){
    x.acomp = acomp(x)
    variation(x.acomp) %>% lower.tri %>% sum
    }

lapply(comm.l.rel, total.acomp.var)









    





$PL1
[1] 499500

$PL2
[1] 499500

$PL3
[1] 499500

$PW1
[1] 499500

$PW2
[1] 499500

$PW3
[1] 499500

$PH1
[1] 499500

$PH2
[1] 499500

$PH3
[1] 499500

Variance by fraction



In [74]:

    
%%R -w 800

comm.l.rel.var = lapply(comm.l.rel, function(x) apply(x, 1, var))
df.var = do.call(rbind, comm.l.rel.var) %>% t %>% as.data.frame
df.var$BD = locs
df.var = df.var %>%
    gather(dataset, variance, starts_with('P'))

p1 = ggplot(df.var, aes(BD, variance)) +
    geom_line() +
    facet_wrap(~dataset) +
    theme_bw() +
    theme(
        text = element_text(size=16)
    )
p1



In [75]:

    
%%R -w 800
p1 + facet_wrap(~ dataset, scale='free_y')

Pairwise pearson correlations



In [120]:

    
%%R -w 500 -h 400
# pairwise correlations for each dataset
df.var.bin = df.var %>%
    group_by(BD_bin = ntile(BD, 24))

#calc.spearman = function(x){
#    cor(x[,'variance.x'], x['variance.y'], method='spearman')[1,1]
#}
calc.pearson = function(x){
    cor(x[,'variance.x'], x['variance.y'], method='pearson')[1,1]
}

df.var.corr = inner_join(df.var.bin, df.var.bin, c('BD_bin' = 'BD_bin')) %>%
    group_by(dataset.x, dataset.y) %>%
    nest() %>%
    mutate(model = purrr::map(data, calc.pearson)) %>%
    unnest(pearson = model %>% purrr::map(function(x) x)) %>%
    ungroup() %>%
    select(-data, -model) %>%
    mutate(pearson_txt = round(pearson, 2))

        
 # plotting
ggplot(df.var.corr, aes(dataset.x, dataset.y, fill=pearson)) +
    geom_tile() +
    geom_text(aes(label=pearson_txt), color='white') +        
    scale_fill_gradient(low='black', high='red') +
    theme(
        text = element_text(size=16)
    )

Comparing Shannon & variance spearman values



In [121]:

    
%%R
# plotting shannon vs variance spearman correlations

df.corr.j = inner_join(df.shan.corr, df.var.corr, c('dataset.x' = 'dataset.x',
                                                    'dataset.y' = 'dataset.y'))
                       
ggplot(df.corr.j, aes(pearson.x, pearson.y)) +
    geom_point() +
    labs(x='Pearson correlations of\nShannnon index values',
         y='Pearson correlations of\nvariance values') +
    theme_bw() +
    theme(
        text = element_text(size=16)
    )

Notes

Variance and Shannon index are measuring approx. the same thing (at least for these communities)!

Correlogram



In [122]:

    
%%R

BD.diffs = function(BDs){
    df.BD = expand.grid(BDs, BDs)
    df.BD$diff = df.BD %>% apply(1, diff) %>% abs %>% as.vector

    df.BD = df.BD %>%
        spread(Var1, diff) 
    rownames(df.BD) = df.BD$Var2
    df.BD$Var2 = NULL
    dist.BD = df.BD %>% as.matrix
    dist.BD[upper.tri(dist.BD, diag=TRUE)] = 0
    dist.BD %>% as.dist
    }

dist.BDs = BD.diffs(locs)
dist.BDs %>% length

Bray-Curtis



In [123]:

    
%%R -w 800 -h 650

cal.corr = function(df, BDs, nclass=24, ...){
    d = vegdist(df, ...)
    d[is.nan(d)] = 0
    stopifnot((d %>% length) == (BDs %>% length))
    
    mantel.correlog(d, BDs, n.class=nclass)
}

corr.l = lapply(comm.l.rel, cal.corr, BDs=dist.BDs)

par(mfrow=c(3,3))
for (n in names(corr.l)){
    plot(corr.l[[n]])
    title(n)
}

Notes:

The shift to negative correlations at an index of ~0.025 is likely due to the difference in community composition between the tails and center.
The small up-tick at the highest distance classes is due to making the comparison span both tails, but still negative due to including comparisons between tails and center of gradient.
The analysis seems most sensitive to euclidean peak width
- broader peaks = more autocorrelation (as expected)
Peak location seems to affect the strength of correlation
- more disperse peaks = higher correlation values
  - this is due to more autocorrelation across the entire gradient

Pairwise linear regressions for each dataset



In [131]:

    
%%R -w 500 -h 400
# converting to dataframe
tmp.l = list()
for(x in names(corr.l)){
    tmp = corr.l[[x]][1][['mantel.res']] %>% as.data.frame
    tmp$data = x
    tmp = tmp %>%
        filter(!is.na(Mantel.cor)) %>%
        group_by(class.index.bin = ntile(class.index, 12)) 
    tmp.l[[x]] = tmp
}
df.corr = do.call(rbind, tmp.l) %>%
    as.data.frame 

# pairwise linear regressions for each dataset
df.corr = inner_join(df.corr, df.corr, c('class.index.bin' = 'class.index.bin')) %>%
    group_by(data.x, data.y) %>%
    nest() %>%
    mutate(model = purrr::map(data, ~lm(Mantel.cor.y ~ Mantel.cor.x, data=.))) %>%
    unnest(model %>% purrr::map(broom::glance)) %>%
    ungroup() %>%
    select(-data, -model) %>%
    mutate(r.squared_txt = round(r.squared, 2))
    

# plotting
ggplot(df.corr, aes(data.x, data.y, fill=r.squared)) +
    geom_tile() +
    geom_text(aes(label=r.squared_txt), color='white', size=4) +
    labs(title='Beta diversity correlogram') +
    scale_fill_gradient(low='black', high='red') +
    theme(
        text = element_text(size=16)
    )

Binary



In [83]:

    
%%R -w 800 -h 650

corr.l = lapply(comm.l.rel, cal.corr, BDs=dist.BDs, binary=TRUE, method='jaccard')

par(mfrow=c(3,3))
for (n in names(corr.l)){
    plot(corr.l[[n]])
    title(n)
}

Notes

The peak correlations at ~0.025 is due to the simulated 'smearing' that I included.
- The smearing caused the tails to contain most of the taxa in the center of the gradient
- The approx. extact center of the gradient should have the most overlap of taxa and thus the most correlation with the tails.
The difference between these plots and the emperical data suggest that this simple smearing simulation is not adequate.
- I would probably need to make the smearing related to pre-fractionation taxon abundance

Pairwise linear regressions



In [84]:

    
%%R -w 500 -h 400
# converting to dataframe
tmp.l = list()
for(x in names(corr.l)){
    tmp = corr.l[[x]][1][['mantel.res']] %>% as.data.frame
    tmp$data = x
    tmp = tmp %>%
        filter(!is.na(Mantel.cor)) %>%
        group_by(class.index.bin = ntile(class.index, 12)) 
    tmp.l[[x]] = tmp
}
df.corr = do.call(rbind, tmp.l) %>%
    as.data.frame 

# pairwise linear regressions for each dataset
df.corr = inner_join(df.corr, df.corr, c('class.index.bin' = 'class.index.bin')) %>%
    group_by(data.x, data.y) %>%
    nest() %>%
    mutate(model = purrr::map(data, ~lm(Mantel.cor.y ~ Mantel.cor.x, data=.))) %>%
    unnest(model %>% purrr::map(broom::glance)) %>%
    ungroup() %>%
    select(-data, -model) %>%
    mutate(r.squared_txt = round(r.squared, 2))
    

# plotting
ggplot(df.corr, aes(data.x, data.y, fill=r.squared)) +
    geom_tile() +
    geom_text(aes(label=r.squared_txt), color='white', size=4) +
    scale_fill_gradient(low='black', high='red') +
    theme(
        text = element_text(size=16)
    )



In [ ]:



In [ ]: