Setup

Check for dependencies


In [1]:
required_packages = c('reshape2',
                      'RColorBrewer',
                      'ggplot2',
                      'Hmisc'
                     )
packages_to_install = required_packages[! required_packages %in%
                                          rownames(installed.packages())]
packages_to_install



In [2]:
if (length(packages_to_install) >= 1L) {
  install.packages(packages_to_install,
                   repos='http://cloud.r-project.org/')
}

Load packages


In [3]:
require(ggplot2)
require(Hmisc)
require(MASS)
require(reshape2)


Loading required package: ggplot2
Loading required package: Hmisc
Loading required package: lattice
Loading required package: survival
Loading required package: Formula

Attaching package: ‘Hmisc’

The following objects are masked from ‘package:base’:

    format.pval, round.POSIXt, trunc.POSIXt, units

Loading required package: MASS
Loading required package: reshape2

In [4]:
source('stats_fns.R')

Clean up data


In [5]:
data = read.csv('DATASET_F_indica_complex_November_2015.csv',
                as.is=TRUE, header=TRUE)
names(data) = sub('..mm.', '', names(data))
names(data) = sub('Taxon', 'Species', names(data), fixed=TRUE)

Subset data set to remove unnecessary columns


In [6]:
data = data[names(data) != 'Collector.number']
numeric_data = data[, sapply(data, class) == 'numeric']

In [7]:
dim(data)
head(data)


  1. 123
  2. 12
SpeciesCulm.lengthLeaf.blade.lengthLeaf.blade.widthLeaf.sheath.lengthInvolucral.bract.lengthSpikelet.lengthSpikelet.widthGlume.length.Glume.widthNutlet.lengthNutlet.width
1F. argyropus250 160 1.5 35 30 5 3 4 2 2 1
2F. argyropus260 90 1 40 35 4 3 4 2.5 2 1.5
3F. argyropus330 150 1 45 30 5 3 4 2 2 1
4F. argyropus340 90 1 35 5 4 2 3 1 2 1
5F. elatior260 110 1 30 15 5 2 5 2 2 1
6F. elatior282.5 145 0.7 45 26 4.6 2 3.6 1 1.4 0.6

Univariate distributions


In [8]:
# num = sapply(data, class) == 'numeric'
# data[,num] = log(data[,num])
# data[,num] = scale(data[,num])
names(data)[2:ncol(data)] = paste(gsub('.', ' ', sub('\\.$', '', names(data)[2:ncol(data)]), fixed=TRUE), '(mm)')
# longdata = melt(data)
dropcols = names(data) %in% c("Glume length (mm)", "Glume width (mm)", "Leaf blade length (mm)")
longdata = melt(data[, !dropcols])
longdata$Species = factor(longdata$Species)
levels(longdata$Species) = substr(gsub('F. |Taxon ', '', levels(longdata$Species)), 1, 3)


Using Species as id variables

In [9]:
gp = ggplot(longdata) #+ scale_colour_brewer(palette='Set2')
gp = gp + facet_wrap(~ variable, nrow=3, scales='free')

In [10]:
# stripplot = gp + aes(y=Species, x=value) + geom_point(size=3, alpha=0.7) +
#   theme_bw()
# plot(stripplot)

In [11]:
vstripplot = gp + aes(x=Species, y=value) + ylab('') +
  geom_point(size=1.4, alpha=1, colour='grey20',
             position=position_jitter(width=0.1, height=0)) +
  stat_summary(fun.data='median_hilow', geom='crossbar', colour='black',
               width=0.6) +
  xlab('') +
  theme_bw(base_size=11)

In [12]:
if (!dir.exists('figures')) {
    dir.create('figures')
}

In [13]:
pdf('figures/univariate.pdf', width=7, height=7, useDingbats=FALSE)
# plot(stripplot)
plot(vstripplot)
dev.off()


pdf: 2

In [14]:
plot(vstripplot)


Summary statistics

No. of samples


In [15]:
apply(numeric_data, 2, function(x) tapply(x, data$Species, length))


Culm.lengthLeaf.blade.lengthLeaf.blade.widthLeaf.sheath.lengthInvolucral.bract.lengthSpikelet.lengthSpikelet.widthGlume.length.Glume.widthNutlet.lengthNutlet.width
F. argyropus44444444444
F. elatior99999999999
F. indica5959595959595959595959
F. laevis77777777777
Taxon A2121212121212121212121
Taxon B2323232323232323232323

Mean


In [16]:
apply(numeric_data, 2, function(x) tapply(x, data$Species, mean))


Culm.lengthLeaf.blade.lengthLeaf.blade.widthLeaf.sheath.lengthInvolucral.bract.lengthSpikelet.lengthSpikelet.widthGlume.length.Glume.widthNutlet.lengthNutlet.width
F. argyropus295.000122.500 1.125 38.750 25.000 4.500 2.750 3.750 1.875 2.000 1.125
F. elatior371.5777778144.2222222 1.0333333 45.0888889 40.9666667 4.7888889 2.0222222 3.9555556 1.4666667 1.5111111 0.7777778
F. indica365.484746118.844068 1.003390 51.200000 26.871186 4.267797 2.027119 3.605085 1.616949 1.747458 1.005085
F. laevis571.428571220.000000 2.428571 70.000000 26.285714 6.000000 3.000000 4.714286 2.214286 1.928571 1.071429
Taxon A73.933333345.1142857 0.657142917.938095211.7238095 3.5380952 1.6000000 3.1761905 1.4476190 1.1571429 0.7571429
Taxon B446.391304137.782609 1.008696 57.704348 34.486957 4.552174 2.208696 3.726087 1.595652 1.682609 1.017391

SD


In [17]:
apply(numeric_data, 2, function(x) tapply(x, data$Species, sd))


Culm.lengthLeaf.blade.lengthLeaf.blade.widthLeaf.sheath.lengthInvolucral.bract.lengthSpikelet.lengthSpikelet.widthGlume.length.Glume.widthNutlet.lengthNutlet.width
F. argyropus46.547466837.7491722 0.2500000 4.787135513.5400640 0.5773503 0.5000000 0.5000000 0.6291529 0.0000000 0.2500000
F. elatior115.6582982 42.3266104 0.4213075 21.7308792 26.3468689 1.0042963 0.7084804 0.5939510 0.3807887 0.3620927 0.1855921
F. indica145.3527431 54.9287384 0.4080932 24.1517116 14.7836589 1.0429469 0.6294033 0.9046668 0.4564301 0.5405476 0.3360130
F. laevis105.2661027 48.9897949 1.1700631 29.2973264 9.8440216 1.0000000 0.5773503 0.7559289 0.5669467 0.3450328 0.1889822
Taxon A45.766366820.8667747 0.302607710.6839354 6.1373369 1.1038461 0.5347897 0.8966074 0.3265257 0.3841503 0.3058945
Taxon B174.1218745 66.8241903 0.4378803 26.3525655 13.5540100 0.7353846 0.5783307 0.7466394 0.3843273 0.3626320 0.3054835

ANOVA


In [18]:
F_out = lapply(numeric_data, F_stats, data$Species)

In [19]:
t(sapply(F_out, function(x) x[1, 4:5]))


F valuePr(>F)
Culm.length2.409635e+011.330106e-16
Leaf.blade.length1.483507e+012.926582e-11
Leaf.blade.width1.530541e+011.470884e-11
Leaf.sheath.length9.850095e+007.051953e-08
Involucral.bract.length7.701819e+002.726092e-06
Spikelet.length7.372713e+004.849922e-06
Spikelet.width7.165786e+006.981943e-06
Glume.length.3.8924635240.002671494
Glume.width3.85271860.0028753
Nutlet.length6.544855e+002.103863e-05
Nutlet.width3.2583688760.008619414

In [20]:
for (i in 1:length(F_out)) {
    IRdisplay::display_html(names(F_out)[i])
    IRdisplay::display_html(repr::repr_html(F_out[[i]]))
}


Culm.length
DfSum SqMean SqF valuePr(>F)
g 5.000000e+002.177206e+064.354412e+052.409635e+011.330106e-16
Residuals 117.002114287.41 18070.83 NA NA
Leaf.blade.length
DfSum SqMean SqF valuePr(>F)
g 5.000000e+001.996723e+053.993446e+041.483507e+012.926582e-11
Residuals 117.000314951.820 2691.896 NA NA
Leaf.blade.width
DfSum SqMean SqF valuePr(>F)
g 5.000000e+001.669912e+013.339824e+001.530541e+011.470884e-11
Residuals 117.0000000 25.5307972 0.2182119 NA NA
Leaf.sheath.length
DfSum SqMean SqF valuePr(>F)
g 5.000000e+002.542053e+045.084105e+039.850095e+007.051953e-08
Residuals 117.000060389.2980 516.1478 NA NA
Involucral.bract.length
DfSum SqMean SqF valuePr(>F)
g 5.000000e+007.950632e+031.590126e+037.701819e+002.726092e-06
Residuals 117.000024155.9538 206.4611 NA NA
Spikelet.length
DfSum SqMean SqF valuePr(>F)
g 5.000000e+003.605213e+017.210426e+007.372713e+004.849922e-06
Residuals 117.0000000114.4246176 0.9779882 NA NA
Spikelet.width
DfSum SqMean SqF valuePr(>F)
g 5.000000e+001.311291e+012.622581e+007.165786e+006.981943e-06
Residuals 117.0000000 42.8204266 0.3659866 NA NA
Glume.length.
DfSum SqMean SqF valuePr(>F)
g 5.00000000013.775280579 2.755056116 3.892463524 0.002671494
Residuals 117.0000000 82.8117113 0.7077924 NA NA
Glume.width
DfSum SqMean SqF valuePr(>F)
g 5.00000003.57958200.71591643.85271860.0028753
Residuals 117.0000000 21.7410684 0.1858211 NA NA
Nutlet.length
DfSum SqMean SqF valuePr(>F)
g 5.000000e+006.867836e+001.373567e+006.544855e+002.103863e-05
Residuals 117.0000000 24.5547653 0.2098698 NA NA
Nutlet.width
DfSum SqMean SqF valuePr(>F)
g 5.0000000001.5526389330.3105277873.2583688760.008619414
Residuals 117.00000000 11.15028790 0.09530161 NA NA

Tukey's Honest Significant Differences test


In [21]:
hsd_out = lapply(numeric_data, HSD, data$Species)
names(hsd_out) = gsub('.', ' ', names(hsd_out), fixed=TRUE)
hsd_out = hsd_out[sapply(hsd_out, nrow) > 0]

In [22]:
for (i in 1:length(hsd_out)) {
#     cat('', names(hsd_out)[i], '', sep='\n')
    IRdisplay::display_html(names(hsd_out)[i])
    IRdisplay::display_html(repr::repr_html(hsd_out[[i]]))
}


Culm length
difflwruprp adj
F. laevis-F. argyropus276.43 32.29520.56 0.02
Taxon A-F. argyropus-221.07-433.56 -8.57 0.04
F. laevis-F. elatior199.85 3.56396.14 0.04
Taxon A-F. elatior-297.64-452.83-142.46 0.00
F. laevis-F. indica205.94 50.24361.65 0.00
Taxon A-F. indica-291.55-390.53-192.58 0.00
Taxon A-F. laevis-497.50-667.49-327.50 0.00
Taxon B-Taxon A372.46254.90490.02 0.00
Leaf blade length
difflwruprp adj
F. laevis-F. argyropus 97.50 3.27191.73 0.04
F. laevis-F. elatior 75.78 0.02151.54 0.05
Taxon A-F. elatior -99.11-159.00 -39.21 0.00
F. laevis-F. indica101.16 41.06161.25 0.00
Taxon A-F. indica -73.73-111.93 -35.53 0.00
Taxon A-F. laevis-174.89-240.50-109.28 0.00
Taxon B-F. laevis -82.22-147.11 -17.32 0.00
Taxon B-Taxon A 92.67 47.29138.04 0.00
Leaf blade width
difflwruprp adj
F. laevis-F. argyropus1.300.462.150.00
F. laevis-F. elatior1.400.712.080.00
F. laevis-F. indica1.430.881.970.00
Taxon A-F. indica-0.35-0.69 0.00 0.05
Taxon A-F. laevis-1.77-2.36-1.18 0.00
Taxon B-F. laevis-1.42-2.00-0.84 0.00
Leaf sheath length
difflwruprp adj
Taxon A-F. elatior-27.15-53.38 -0.92 0.04
Taxon A-F. indica-33.26-49.99-16.53 0.00
Taxon A-F. laevis-52.06-80.79-23.33 0.00
Taxon B-Taxon A39.7719.9059.63 0.00
Involucral bract length
difflwruprp adj
Taxon A-F. elatior-29.24-45.83-12.66 0.00
Taxon A-F. indica-15.15-25.73 -4.57 0.00
Taxon B-Taxon A22.7610.2035.33 0.00
Spikelet length
difflwruprp adj
Taxon A-F. elatior-1.25-2.39-0.11 0.02
F. laevis-F. indica1.730.592.880.00
Taxon A-F. indica-0.73-1.46 0.00 0.05
Taxon A-F. laevis-2.46-3.71-1.21 0.00
Taxon B-F. laevis-1.45-2.68-0.21 0.01
Taxon B-Taxon A1.010.151.880.01
Spikelet width
difflwruprp adj
Taxon A-F. argyropus-1.15-2.11-0.19 0.01
F. laevis-F. elatior0.980.091.860.02
F. laevis-F. indica0.970.271.670.00
Taxon A-F. laevis-1.40-2.17-0.63 0.00
Taxon B-F. laevis-0.79-1.55-0.03 0.03
Taxon B-Taxon A0.610.081.140.01
Glume length
difflwruprp adj
F. laevis-F. indica1.110.132.080.02
Taxon A-F. laevis-1.54-2.60-0.47 0.00
Glume width
difflwruprp adj
F. laevis-F. elatior0.750.121.380.01
F. laevis-F. indica0.600.101.100.01
Taxon A-F. laevis-0.77-1.31-0.22 0.00
Taxon B-F. laevis-0.62-1.16-0.08 0.01
Nutlet length
difflwruprp adj
Taxon A-F. argyropus-0.84-1.57-0.12 0.01
Taxon A-F. indica-0.59-0.93-0.25 0.00
Taxon A-F. laevis-0.77-1.35-0.19 0.00
Taxon B-Taxon A0.530.120.930.00
Nutlet width
difflwruprp adj
Taxon A-F. indica-0.25-0.48-0.02 0.02

DFA


In [23]:
spp = as.factor(data$Species)
# dat = numeric_data
dat = as.data.frame(scale(numeric_data))  # scaling has no effect except on the loadings
# dat = log(numeric_data)  # log-transforming would alter the meaning of the covariances and doesn't make sense

Regular MANOVA/LDA

MANOVA


In [24]:
maov = manova(as.matrix(dat) ~ spp)

In [25]:
summary(maov, test="Wilks")$stats


DfWilksapprox Fnum Dfden DfPr(>F)
spp5.000000e+001.664462e-014.291268e+005.500000e+014.988656e+027.689601e-19
Residuals117 NA NA NA NA NA

more robust to heteroscedasticity:


In [26]:
summary(maov, test="Pillai")$stats


DfPillaiapprox Fnum Dfden DfPr(>F)
spp5.000000e+001.328982e+003.653113e+005.500000e+015.550000e+024.213774e-15
Residuals117 NA NA NA NA NA

check that DFA groups are still significant if excluding Taxon A:


In [27]:
summary(manova(as.matrix(dat[spp!='Taxon A',]) ~ spp[spp!='Taxon A']),
        test="Pillai")$stats


DfPillaiapprox Fnum Dfden DfPr(>F)
spp[spp != "Taxon A"]4.000000e+009.137655e-012.422455e+004.400000e+013.600000e+024.538724e-06
Residuals97NANANANANA

yup

LDA


In [28]:
LDA = lda(dat, spp)
scores = predict(LDA, dat)$x

Loading of variables on each axis


In [29]:
round(LDA$scaling, 2)


LD1LD2LD3LD4LD5
Culm.length-1.38-0.38 0.32 0.52-0.49
Leaf.blade.length-0.34 0.01 0.26-0.21 0.19
Leaf.blade.width-0.37 0.97-0.07-0.02 0.24
Leaf.sheath.length 0.57 0.16-0.65 0.65 0.25
Involucral.bract.length 0.05-0.75 0.40-0.61 0.05
Spikelet.length-0.26 0.21 1.16 0.34 0.17
Spikelet.width-0.13 0.07-0.96-0.73-0.82
Glume.length.-0.01-0.09 0.40 0.02-0.23
Glume.width 0.14 0.27-0.22-0.04 0.21
Nutlet.length-0.32-0.53-0.86-0.76 1.18
Nutlet.width 0.30-0.18-0.16 0.49-0.88

In [30]:
proportions = LDA$svd ^ 2 / sum(LDA$svd ^ 2)
round(proportions * 100, 1)


  1. 61.6
  2. 21.1
  3. 12.8
  4. 3.1
  5. 1.4

Proportion correctly classified


In [31]:
classif.rate = mean(predict(LDA, dat)$class == spp)
round(classif.rate, 3)


0.699

Cross-validation correct classfication rate


In [32]:
classif.cv = mean(sapply(seq_len(nrow(dat)), function(j) {
  ifelse(predict(lda(dat[-j, ], spp[-j]),
                 dat[j, ])$class == spp[j],
         1, 0)}))
round(classif.cv, 3)


0.585

Testing if taxa are distinct in LDA


In [33]:
for (taxon in c('Taxon A', 'Taxon B', 'F. indica')) {
  cat('-------', '', taxon, '',
      test_lda_predict(dat, spp, taxon, nreps=10000),
      '', sep='\n')
}


-------

Taxon A

20 / 21 correctly identified by LDA
P = 0.000 based on 10000 randomizations

-------

Taxon B

6 / 23 correctly identified by LDA
P = 0.006 based on 10000 randomizations

-------

F. indica

50 / 59 correctly identified by LDA
P = 0.197 based on 10000 randomizations


In [34]:
df = as.data.frame(scores)
names(df) = sub('LD', 'DF axis ', names(df))
df$Species = spp

In [35]:
gp = ggplot(df) + scale_colour_brewer(palette='Set1')
sp = gp + aes(colour=Species) + geom_point(size=3, alpha=1)

In [36]:
sp12 = sp + aes(x=`DF axis 1`, y=`DF axis 2`)
sp34 = sp + aes(x=`DF axis 3`, y=`DF axis 4`)

In [37]:
spbw = gp + aes(shape=Species) + geom_point(size=3) + theme_bw()
spbw12 = spbw + aes(x=`DF axis 1`, y=`DF axis 2`)
spbw34 = spbw + aes(x=`DF axis 3`, y=`DF axis 4`)

In [38]:
plot(sp12)



In [39]:
plot(sp34)



In [40]:
pdf('figures/DFA_plot.pdf', height=5)
plot(sp12)
d = dev.off()

In [41]:
pdf('figures/DFA_plot_bw.pdf', height=5)
plot(spbw12)
# plot(spbw34)
d = dev.off()

In [42]:
sessionInfo()


R version 3.3.1 (2016-06-21)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 16.04.1 LTS

locale:
 [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_GB.UTF-8   
 [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_ZA.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] reshape2_1.4.1  MASS_7.3-45     Hmisc_3.17-4    Formula_1.2-1  
[5] survival_2.39-4 lattice_0.20-33 ggplot2_2.1.0  

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.5         RColorBrewer_1.1-2  plyr_1.8.4         
 [4] base64enc_0.1-3     tools_3.3.1         rpart_4.1-10       
 [7] digest_0.6.9        uuid_0.1-2          jsonlite_0.9.22    
[10] evaluate_0.9        gtable_0.2.0        Matrix_1.2-6       
[13] IRdisplay_0.3       IRkernel_0.6        gridExtra_2.2.1    
[16] repr_0.7            stringr_1.0.0       cluster_2.0.4      
[19] grid_3.3.1          nnet_7.3-12         data.table_1.9.6   
[22] R6_2.1.2            foreign_0.8-66      pbdZMQ_0.2-3       
[25] latticeExtra_0.6-28 magrittr_1.5        scales_0.4.0       
[28] splines_3.3.1       colorspace_1.2-6    labeling_0.3       
[31] stringi_1.1.1       acepack_1.3-3.3     munsell_0.4.3      
[34] chron_2.3-47