In [1]:
%load_ext rpy2.ipython

In [173]:
%%R
library(coenocliner)
library(compositions)
library(dplyr)
library(tidyr)
library(ggplot2)


/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: 
Attaching package: ‘tidyr’


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following object is masked from ‘package:coenocliner’:

    expand


  res = super(Function, self).__call__(*new_args, **new_kwargs)

Simulate comunities


In [3]:
%%R 
set.seed(2)
M <- 3                                    # number of species
ming <- 3.5                                # gradient minimum...
maxg <- 7                                  # ...and maximum
meang = mean(c(ming, maxg))
locs <- seq(ming, maxg, length = 100)      # gradient locations
opt  <- rnorm(M, mean=meang, sd=0.5)      #runif(M, min = ming, max = maxg)   # species optima
tol  <- rep(0.25, M)                       # species tolerances
h    <- ceiling(rlnorm(M, meanlog = 10))    # max abundances
pars <- cbind(opt = opt, tol = tol, h = h) # put in a matrix

In [4]:
%%R
mu1 <- coenocline(locs, responseModel = "gaussian", params = pars,
                    countModel = "poisson") #, countParams = list(alpha = 0.5))
head(mu1)


     [,1] [,2] [,3]
[1,]    0    0    0
[2,]    0    0    0
[3,]    0    0    0
[4,]    0    0    0
[5,]    1    0    0
[6,]    0    0    0

In [5]:
%%R -w 600 -h 350
matplot(locs, mu1, lty = "solid", type = "p", pch = 1:10, cex = 0.8,
        xlab = "pH", ylab = "Abundance")



In [6]:
%%R 
set.seed(2)
M <- 2                                    # number of species
ming <- 3.5                                # gradient minimum...
maxg <- 7                                  # ...and maximum
meang = mean(c(ming, maxg)) + 0.3
locs <- seq(ming, maxg, length = 100)      # gradient locations
opt  <- rnorm(M, mean=meang, sd=0.5)   #runif(M, min = ming, max = maxg)   # species optima
tol  <- rep(0.25, M)                       # species tolerances
h    <- ceiling(rlnorm(M, meanlog = 9.5))    # max abundances
pars <- cbind(opt = opt, tol = tol, h = h) # put in a matrix

In [7]:
%%R

mu2 <- coenocline(locs, responseModel = "gaussian", params = pars,
                    countModel = "poisson") #, countParams = list(alpha = 0.5))
mu2 = cbind(mu1, mu2)
head(mu2)


     [,1] [,2] [,3] [,4] [,5]
[1,]    0    0    0    0    0
[2,]    0    0    0    0    0
[3,]    0    0    0    0    0
[4,]    0    0    0    0    0
[5,]    1    0    0    0    0
[6,]    0    0    0    0    0

In [8]:
%%R -w 600 -h 350

matplot(locs, mu2, lty = "solid", type = "p", pch = 1:10, cex = 0.8,
        xlab = "pH", ylab = "Abundance")


Relative abundances


In [9]:
%%R

to.rel = function(x){
    y = sum(x)
    if(y == 0){
        return(x)
    } else {
        return(x / y)
    }
}

mu1_rel = t(apply(mu1, 1, to.rel))
mu2_rel = t(apply(mu2, 1, to.rel))

# check
apply(mu1_rel, 1, sum)


  [1] 0 0 0 0 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

In [10]:
%%R -w 600 -h 350

matplot(locs, mu1_rel, lty = "solid", type = "p", pch = 1:10, cex = 0.8,
        xlab = "pH", ylab = "Abundance")



In [11]:
%%R -w 600 -h 350

matplot(locs, mu2_rel, lty = "solid", type = "p", pch = 1:10, cex = 0.8,
        xlab = "pH", ylab = "Abundance")


CLR transformation

With raw compositional data


In [12]:
%%R -w 600 -h 350

mu1_clr = clr(mu1_rel)
matplot(locs, mu1_clr, lty = "solid", type = "p", pch = 1:10, cex = 0.8,
        xlab = "pH", ylab = "Abundance")



In [13]:
%%R -w 600 -h 350

mu2_clr = clr(mu2_rel)
matplot(locs, mu2_clr, lty = "solid", type = "p", pch = 1:10, cex = 0.8,
        xlab = "pH", ylab = "Abundance")



In [14]:
%%R -w 600 -h 350

matplot(locs, mu2_clr[,c(3,5)], lty = "solid", type = "p", pch = 1:10, cex = 0.8,
        xlab = "pH", ylab = "Abundance")


ALR transformation (from most abundant taxon)


In [15]:
%%R
apply(mu2, 2, sum)


[1]  126235  360437  445578 1158892   76601

In [16]:
%%R -w 600 -h 350

mu2_alr = alr(mu2_rel, ivar=5)
matplot(locs, mu2_alr, lty = "solid", type = "p", pch = 1:10, cex = 0.8,
        xlab = "pH", ylab = "Abundance")


extrapolating beyond veil-line abundances


In [17]:
%%R -w 600 -h 350

mu1_rel_nZ = mu1_rel
mu1_rel_nZ[mu1_rel_nZ == 0] = min(mu1_rel[mu1_rel > 0]) / 2

mu1_clr = clr(mu1_rel_nZ)
matplot(locs, mu1_clr, lty = "solid", type = "p", pch = 1:10, cex = 0.8,
        xlab = "pH", ylab = "Abundance")



In [18]:
%%R -w 600 -h 350

mu2_rel_nZ = mu2_rel
mu2_rel_nZ[mu2_rel_nZ == 0] = min(mu2_rel[mu2_rel > 0]) / 2

mu2_clr = clr(mu2_rel_nZ)
matplot(locs, mu2_clr, lty = "solid", type = "p", pch = 1:10, cex = 0.8,
        xlab = "pH", ylab = "Abundance")



In [19]:
%%R -w 600 -h 350

mu2_alr = alr(mu2_rel_nZ, ivar=5)
matplot(locs, mu2_alr, lty = "solid", type = "p", pch = 1:10, cex = 0.8,
        xlab = "pH", ylab = "Abundance")


spline-fitting sandbox


In [20]:
%%R -w 600 -h 350

ss_fit = function(y, x){
    #ss = smooth.spline(x,y)
    #return(ss$y)
    ss = pspline::smooth.Pspline(x,y,method=3)
    #plot(ss)
    return(ss$ysmth)
}

mu2_clrSsInv = clrInv(apply(mu2_clr, 2, ss_fit, x=locs))

matplot(locs, mu2_clrSsInv, lty = "solid", type = "p", pch = 1:10, cex = 0.8,
        xlab = "pH", ylab = "Abundance")



In [21]:
%%R
center.mass = function(y,x){
    wm = weighted.mean(x, y)
    return(wm)    
}

apply(mu2, 2, center.mass, x=locs) %>% print
apply(mu2_rel, 2, center.mass, x=locs) %>% print 
apply(mu2_clr, 2, center.mass, x=locs) %>% print 
apply(mu2_clrSsInv, 2, center.mass, x=locs) %>% print


[1] 4.800725 5.342898 6.043901 5.101303 5.642460
[1] 4.186292 5.417059 6.373717 4.925863 5.720374
[1] 158.675512   5.761936  23.205569   3.359175   4.051528
[1] 4.103747 5.232884 6.308480 4.898518 5.185135

In [ ]:


In [22]:
%%R -w 600 -h 350

ss3 = smooth.spline(locs, mu2_clr[,3])
ss5 = smooth.spline(locs, mu2_clr[,5])

matplot(locs, mu2_clr[,c(3,5)], lty = "solid", type = "p", pch = 1:10, cex = 0.8,
        xlab = "pH", ylab = "Abundance")
lines(ss3, lty=2, col='black')
lines(ss5, lty=2, col='red')



In [23]:
%%R -w 600 -h 350

plot_clr_spline = function(col, mu, mu_rel, mu_clr, locs){
    ss = smooth.spline(locs, mu_clr[,col])
    plot(locs, mu[,col])
    plot(locs, mu_rel[,col])
    plot(locs, mu_clr[,col])
    plot(locs, ss$y)
    plot(locs, clrInv(ss$y))
    }

plot_clr_spline(1, mu2, mu2_rel, mu2_clr, locs)



In [24]:
%%R -w 600 -h 350

plot_clr_spline(2, mu2, mu2_rel, mu2_clr, locs)



In [25]:
%%R -w 600 -h 350

plot_clr_spline(3, mu2, mu2_rel, mu2_clr, locs)



In [26]:
%%R -w 600 -h 350

plot_clr_spline(4, mu2, mu2_rel, mu2_clr, locs)



In [27]:
%%R -w 600 -h 350

plot_clr_spline(5, mu2, mu2_rel, mu2_clr, locs)


Calculating center of mass


In [29]:
%%R

center.mass = function(locs, mu, col, ss=FALSE){
    if(ss==TRUE){
        y = smooth.spline(locs, mu[,col])
        y = clrInv(y$y)
    } else {
        y = mu[,col]
    }
    wm = weighted.mean(locs, y)
    print(wm)
    }

col = 1
center.mass(locs, mu2, col) 
center.mass(locs, mu2_rel, col)
center.mass(locs, mu2_clr, col, ss=TRUE)


[1] 4.800725
[1] 4.186292
[1] 3.93535

In [30]:
%%R

col = 2
center.mass(locs, mu2, col) 
center.mass(locs, mu2_rel, col)
center.mass(locs, mu2_clr, col, ss=TRUE)


[1] 5.342898
[1] 5.417059
[1] 5.203474

In [31]:
%%R

col = 3
center.mass(locs, mu2, col) 
center.mass(locs, mu2_rel, col)
center.mass(locs, mu2_clr, col, ss=TRUE)


[1] 6.043901
[1] 6.373717
[1] 6.778234

In [32]:
%%R

col = 4
center.mass(locs, mu2, col) 
center.mass(locs, mu2_rel, col)
center.mass(locs, mu2_clr, col, ss=TRUE)


[1] 5.101303
[1] 4.925863
[1] 4.45979

In [33]:
%%R

col = 5
center.mass(locs, mu2, col) 
center.mass(locs, mu2_rel, col)
center.mass(locs, mu2_clr, col, ss=TRUE)


[1] 5.64246
[1] 5.720374
[1] 6.072672

In [34]:
%%R

col = 6
center.mass(locs, mu2, col) 
center.mass(locs, mu2_rel, col)
center.mass(locs, mu2_clr, col, ss=TRUE)


Error in mu[, col] : subscript out of bounds
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Error in mu[, col] : subscript out of bounds

  res = super(Function, self).__call__(*new_args, **new_kwargs)

Calculating compositional stats


In [35]:
%%R
# mean
x = acomp(mu2_rel)
mean(x)


[1] 0.16276251 0.16075382 0.25461552 0.33122418 0.09064397
attr(,"class")
[1] acomp

In [36]:
%%R
mvar(x) %>% print
msd(x) %>% print


[1] 22.89199
[1] 2.29597

In [37]:
%%R
variation(x)


          [,1]      [,2]      [,3]      [,4]      [,5]
[1,]  0.000000  7.924724 37.709954  2.519249 18.803430
[2,]  7.924724  0.000000 11.364752  1.534914  2.329903
[3,] 37.709954 11.364752  0.000000 21.038942  3.592937
[4,]  2.519249  1.534914 21.038942  0.000000  7.641137
[5,] 18.803430  2.329903  3.592937  7.641137  0.000000

In [38]:
%%R -h 300
matplot(locs, mu2[,c(1,4)], lty = "solid", type = "p", pch = 1:10, cex = 0.8,
        xlab = "pH", ylab = "Abundance")
matplot(locs, mu2_rel[,c(1,4)], lty = "solid", type = "p", pch = 1:10, cex = 0.8,
        xlab = "pH", ylab = "Abundance")
matplot(locs, mu2_clr[,c(1,4)], lty = "solid", type = "p", pch = 1:10, cex = 0.8,
        xlab = "pH", ylab = "Abundance")



In [39]:
%%R -h 300
matplot(locs, mu2[,c(1,3)], lty = "solid", type = "p", pch = 1:10, cex = 0.8,
        xlab = "pH", ylab = "Abundance")
matplot(locs, mu2_rel[,c(1,3)], lty = "solid", type = "p", pch = 1:10, cex = 0.8,
        xlab = "pH", ylab = "Abundance")
matplot(locs, mu2_clr[,c(1,3)], lty = "solid", type = "p", pch = 1:10, cex = 0.8,
        xlab = "pH", ylab = "Abundance")


correlogram

  • bray curtis / jaccard distances vs BD difference

In [156]:
%%R
library(vegan)

In [210]:
%%R -w 800
locs.ap = expand.grid(locs, locs)
locs.ap$diff = locs.ap %>% apply(1, diff) %>% abs %>% as.vector

locs.ap = locs.ap %>%
    spread(Var1, diff) 
rownames(locs.ap) = locs.ap$Var2
locs.ap$Var2 = NULL
locs.dist = locs.ap %>% as.matrix
locs.dist[upper.tri(locs.dist, diag=TRUE)] = 0
locs.dist = locs.dist %>% as.dist
plot(hclust(locs.dist))



In [226]:
%%R
mu1.dist = vegdist(mu1)
mu1.dist[is.nan(mu1.dist)] = 0
stopifnot((mu2.dist %>% length) == (locs.dist %>% length))

mu1.mcorr = mantel.correlog(mu1.dist, locs.dist, n.class=24)
plot(mu1.mcorr)



In [227]:
%%R
mu2.dist = vegdist(mu2)
mu2.dist[is.nan(mu2.dist)] = 0
stopifnot((mu2.dist %>% length) == (locs.dist %>% length))

mu2.mcorr = mantel.correlog(mu2.dist, locs.dist, n.class=24)
plot(mu2.mcorr)



In [ ]:


In [ ]:


In [ ]:


In [ ]:

Aitchison distance sandbox


In [40]:
%%R
library(robCompositions)


/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: sROC 0.1-2 loaded

  res = super(Function, self).__call__(*new_args, **new_kwargs)

In [41]:
%%R
data(expenditures)
x = xOrig = expenditures
head(x)


  housing foodstuffs alcohol other services
1     640        328     147   169      196
2    1800        484     515  2291      912
3    2085        445     725  8373     1732
4     616        331     126   117      149
5     875        368     191   290      275
6     770        364     196   242      236

In [42]:
%%R

to.rel = function(x){
    x[is.na(x)] = 0
    y = sum(x)
    if(is.na(y) | y == 0){
        x = rep(0, length(x))
        return(x)
    } else {
        return(x / y)
    }
}

x_rel = apply(x, 1, function(x) to.rel(x))
x_rel = t(x_rel)
head(x_rel)


       housing foodstuffs    alcohol      other  services
[1,] 0.4324324 0.22162162 0.09932432 0.11418919 0.1324324
[2,] 0.2999000 0.08063979 0.08580473 0.38170610 0.1519494
[3,] 0.1560629 0.03330838 0.05426647 0.62672156 0.1296407
[4,] 0.4600448 0.24719940 0.09410007 0.08737864 0.1112771
[5,] 0.4377189 0.18409205 0.09554777 0.14507254 0.1375688
[6,] 0.4258850 0.20132743 0.10840708 0.13384956 0.1305310

In [43]:
%%R
## calculate the relative Aitchsion distance between xOrig and xImp:
aDist(x_rel[1,], x_rel[2,])


[1] 1.626736

Aitchison pairwise distance matrix


In [107]:
%%R

.aDist = function(x, df){
    i = x[1]
    ii = x[2]
    aDist(df[i,], df[ii,])
}

aDist.all = function(df, meta=NULL){
    # pairwise aDist on all rows
    i.ii = combn(1:nrow(df), m=2)
    vals = apply(i.ii, 2, .aDist, df=df)
    
    vals = cbind(i.ii %>% t, vals)
    vals = as.data.frame(vals)
    colnames(vals) = c('V1', 'V2', 'aDist')
    
    # adding metadata if provided
    if(! is.null(meta)){
        if(class(meta) == 'matrix'){
            meta = as.data.frame(meta)
        }
        if(class(meta) != 'data.frame'){
            meta = data.frame(1:length(meta), meta)
            colnames(meta)[1] = c('V1')
        }
        
        name1 = colnames(vals)[1]
        name2 = colnames(meta)[1]
        vals = inner_join(vals, meta, c(V1 = name2))
        vals = inner_join(vals, meta, c(V2 = name2))
    }
    
    return(vals)
}


test = mu2_rel_nZ[1:5,]
test.locs = data.frame(1:5, locs[1:5])
colnames(test.locs) = c('V1', 'pH')
aDist.all(test, test.locs)


   V1 V2    aDist     pH.x     pH.y
1   1  2  0.00000 3.500000 3.535354
2   1  3  0.00000 3.500000 3.570707
3   1  4  0.00000 3.500000 3.606061
4   1  5 10.47175 3.500000 3.641414
5   2  3  0.00000 3.535354 3.570707
6   2  4  0.00000 3.535354 3.606061
7   2  5 10.47175 3.535354 3.641414
8   3  4  0.00000 3.570707 3.606061
9   3  5 10.47175 3.570707 3.641414
10  4  5 10.47175 3.606061 3.641414

In [108]:
%%R
mu2_rel_nZ_aDist = aDist.all(mu2_rel_nZ)

ggplot(mu2_rel_nZ_aDist, aes(V1, V2, fill=aDist)) +
    geom_tile()



In [ ]:

variogram sandbox


In [142]:
%%R
library(gstat)
library(sp)

data(meuse)

coordinates(meuse) = ~x+y
lzn.vgm = variogram(log(zinc)~1, meuse)
lzn.fit = fit.variogram(lzn.vgm, model = vgm(1, "Sph", 900, 1))

plot(lzn.vgm, lzn.fit)



In [143]:
%%R
?vgram.sph


R Help on ‘vgram.sph’variograms            package:compositions             R Documentation

_V_a_r_i_o_g_r_a_m _f_u_n_c_t_i_o_n_s

_D_e_s_c_r_i_p_t_i_o_n:

     Valid scalar variogram model functions.

_U_s_a_g_e:

     vgram.sph( h , nugget = 0, sill = 1, range= 1,... )
     vgram.exp( h , nugget = 0, sill = 1, range= 1,... )
     vgram.gauss( h , nugget = 0, sill = 1, range= 1,... )
     vgram.lin( h , nugget = 0, sill = 1, range= 1,... )
     vgram.pow( h , nugget = 0, sill = 1, range= 1,... )
     vgram.nugget( h , nugget = 1,...,tol=1E-8 )
     
_A_r_g_u_m_e_n_t_s:

       h: a vector providing distances, a matrix of distance vectors in
          its rows or a data.frame of distance vectors.

  nugget: The size of the nugget effect (i.e. the limit to 0). At zero
          itself the value is always 0.

    sill: The sill (i.e. the limit to infinity)

   range: The range parameter. I.e. the distance in which sill is
          reached or if this does not exist, where the value is in some
          sense nearly the sill.

     ...: not used

     tol: The distance that is considered as nonzero.

_D_e_t_a_i_l_s:

     The univariate variograms are used in the CompLinCoReg as building
     blocks of multivariate variogram models.

        • sphSpherical variogram

        • expExponential variogram

        • gaussThe Gaussian variogram.

        • linLinear Variogram. Increases over the sill, which is
          reached at ‘range’.

        • powThe power variogram. Increases over the sill, which is
          reached at ‘range’.

        • nuggetThe pure nugget effect variogram.

_V_a_l_u_e:

     A vector of size NROW(h), giving the variogram values.

_A_u_t_h_o_r(_s):

     K.Gerald v.d. Boogaart <URL: http://www.stat.boogaart.de>

_R_e_f_e_r_e_n_c_e_s:

     Cressie, N.C. (1993) Spatial statistics

     Tolosana, van den Boogaart, Pawlowsky-Glahn (2009) Estimating and
     modeling variograms of compositional data with occasional missing
     variables in R, StatGis09

_S_e_e _A_l_s_o:

     ‘vgram2lrvgram’, ‘CompLinModCoReg’, ‘vgmFit’

_E_x_a_m_p_l_e_s:

     ## Not run:
     
     data(juraset)
     X <- with(juraset,cbind(X,Y))
     comp <- acomp(juraset,c("Cd","Cu","Pb","Co","Cr"))
     lrv <- logratioVariogram(comp,X,maxdist=1,nbins=10)
     plot(lrv)
     ## End(Not run)
     

Making aDist variogram


In [110]:
%%R
df.locs = data.frame(1:length(locs), locs)
colnames(df.locs) = c('V1', 'pH')

mu2_rel_nZ_aDist = aDist.all(mu2_rel_nZ, df.locs) %>%
    mutate(delta.pH = abs(pH.x - pH.y))

ggplot(mu2_rel_nZ_aDist, aes(delta.pH, aDist)) +
    geom_point()


Sandbox


In [146]:
%%R
data(SimulatedAmounts)
meanCol(sa.lognormals)
variation(acomp(sa.lognormals))


          Cu        Zn       Pb
Cu 0.0000000 0.4046994 2.938182
Zn 0.4046994 0.0000000 2.910539
Pb 2.9381816 2.9105389 0.000000