In [1]:
required_packages = c('reshape2',
'RColorBrewer',
'ggplot2',
'Hmisc'
)
packages_to_install = required_packages[! required_packages %in%
rownames(installed.packages())]
packages_to_install
In [2]:
if (length(packages_to_install) >= 1L) {
install.packages(packages_to_install,
repos='http://cloud.r-project.org/')
}
In [3]:
require(ggplot2)
require(Hmisc)
require(MASS)
require(reshape2)
In [4]:
source('stats_fns.R')
In [5]:
data = read.csv('DATASET_F_indica_complex_November_2015.csv',
as.is=TRUE, header=TRUE)
names(data) = sub('..mm.', '', names(data))
names(data) = sub('Taxon', 'Species', names(data), fixed=TRUE)
In [6]:
data = data[names(data) != 'Collector.number']
numeric_data = data[, sapply(data, class) == 'numeric']
In [7]:
dim(data)
head(data)
In [8]:
# num = sapply(data, class) == 'numeric'
# data[,num] = log(data[,num])
# data[,num] = scale(data[,num])
names(data)[2:ncol(data)] = paste(gsub('.', ' ', sub('\\.$', '', names(data)[2:ncol(data)]), fixed=TRUE), '(mm)')
# longdata = melt(data)
dropcols = names(data) %in% c("Glume length (mm)", "Glume width (mm)", "Leaf blade length (mm)")
longdata = melt(data[, !dropcols])
longdata$Species = factor(longdata$Species)
levels(longdata$Species) = substr(gsub('F. |Taxon ', '', levels(longdata$Species)), 1, 3)
In [9]:
gp = ggplot(longdata) #+ scale_colour_brewer(palette='Set2')
gp = gp + facet_wrap(~ variable, nrow=3, scales='free')
In [10]:
# stripplot = gp + aes(y=Species, x=value) + geom_point(size=3, alpha=0.7) +
# theme_bw()
# plot(stripplot)
In [11]:
vstripplot = gp + aes(x=Species, y=value) + ylab('') +
geom_point(size=1.4, alpha=1, colour='grey20',
position=position_jitter(width=0.1, height=0)) +
stat_summary(fun.data='median_hilow', geom='crossbar', colour='black',
width=0.6) +
xlab('') +
theme_bw(base_size=11)
In [12]:
if (!dir.exists('figures')) {
dir.create('figures')
}
In [13]:
pdf('figures/univariate.pdf', width=7, height=7, useDingbats=FALSE)
# plot(stripplot)
plot(vstripplot)
dev.off()
In [14]:
plot(vstripplot)
In [15]:
apply(numeric_data, 2, function(x) tapply(x, data$Species, length))
In [16]:
apply(numeric_data, 2, function(x) tapply(x, data$Species, mean))
In [17]:
apply(numeric_data, 2, function(x) tapply(x, data$Species, sd))
In [18]:
F_out = lapply(numeric_data, F_stats, data$Species)
In [19]:
t(sapply(F_out, function(x) x[1, 4:5]))
In [20]:
for (i in 1:length(F_out)) {
IRdisplay::display_html(names(F_out)[i])
IRdisplay::display_html(repr::repr_html(F_out[[i]]))
}
In [21]:
hsd_out = lapply(numeric_data, HSD, data$Species)
names(hsd_out) = gsub('.', ' ', names(hsd_out), fixed=TRUE)
hsd_out = hsd_out[sapply(hsd_out, nrow) > 0]
In [22]:
for (i in 1:length(hsd_out)) {
# cat('', names(hsd_out)[i], '', sep='\n')
IRdisplay::display_html(names(hsd_out)[i])
IRdisplay::display_html(repr::repr_html(hsd_out[[i]]))
}
In [23]:
spp = as.factor(data$Species)
# dat = numeric_data
dat = as.data.frame(scale(numeric_data)) # scaling has no effect except on the loadings
# dat = log(numeric_data) # log-transforming would alter the meaning of the covariances and doesn't make sense
In [24]:
maov = manova(as.matrix(dat) ~ spp)
In [25]:
summary(maov, test="Wilks")$stats
more robust to heteroscedasticity:
In [26]:
summary(maov, test="Pillai")$stats
check that DFA groups are still significant if excluding Taxon A:
In [27]:
summary(manova(as.matrix(dat[spp!='Taxon A',]) ~ spp[spp!='Taxon A']),
test="Pillai")$stats
yup
In [28]:
LDA = lda(dat, spp)
scores = predict(LDA, dat)$x
In [29]:
round(LDA$scaling, 2)
In [30]:
proportions = LDA$svd ^ 2 / sum(LDA$svd ^ 2)
round(proportions * 100, 1)
In [31]:
classif.rate = mean(predict(LDA, dat)$class == spp)
round(classif.rate, 3)
In [32]:
classif.cv = mean(sapply(seq_len(nrow(dat)), function(j) {
ifelse(predict(lda(dat[-j, ], spp[-j]),
dat[j, ])$class == spp[j],
1, 0)}))
round(classif.cv, 3)
In [33]:
for (taxon in c('Taxon A', 'Taxon B', 'F. indica')) {
cat('-------', '', taxon, '',
test_lda_predict(dat, spp, taxon, nreps=10000),
'', sep='\n')
}
In [34]:
df = as.data.frame(scores)
names(df) = sub('LD', 'DF axis ', names(df))
df$Species = spp
In [35]:
gp = ggplot(df) + scale_colour_brewer(palette='Set1')
sp = gp + aes(colour=Species) + geom_point(size=3, alpha=1)
In [36]:
sp12 = sp + aes(x=`DF axis 1`, y=`DF axis 2`)
sp34 = sp + aes(x=`DF axis 3`, y=`DF axis 4`)
In [37]:
spbw = gp + aes(shape=Species) + geom_point(size=3) + theme_bw()
spbw12 = spbw + aes(x=`DF axis 1`, y=`DF axis 2`)
spbw34 = spbw + aes(x=`DF axis 3`, y=`DF axis 4`)
In [38]:
plot(sp12)
In [39]:
plot(sp34)
In [40]:
pdf('figures/DFA_plot.pdf', height=5)
plot(sp12)
d = dev.off()
In [41]:
pdf('figures/DFA_plot_bw.pdf', height=5)
plot(spbw12)
# plot(spbw34)
d = dev.off()
In [42]:
sessionInfo()