In [ ]:
library(tidyr)
library(dplyr, warn.conflicts=F, quietly=T)
library(ggplot2)

In [ ]:
# Load the tab-separated stats table. The first line is treated as data
# (no header row), so the six column names are supplied explicitly.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas `FALSE` is a reserved word.
results = read.delim('2017-01-08_stats.tab', header=FALSE,
                     col.names=c("seed", "metric", "sketchsize", "cov", "var", "rho"))
# Inspect column types and per-column summaries of what was read.
str(results)
summary(results)

Take the best settings

For mash and kwip, these are sketch sizes of 1e6 and 1e8, respectively.


In [ ]:
# Keep the table as it stood before this cell under `results.all`, then
# narrow `results` to the rows whose sketch size is 1e6 or 1e8.
results.all = results
results = filter(results.all, sketchsize %in% c(1e6, 1e8))

In [ ]:
# Row counts per sketch size remaining in `results`.
table(results[["sketchsize"]])

All data for the top sketch sizes


In [ ]:
# Scatter of `var` against `cov` for every row of `results`,
# with both axes on a log10 scale.
ggplot(data=results) +
    geom_point(mapping=aes(x=cov, y=var)) +
    theme_bw() +
    scale_y_log10() +
    scale_x_log10()

In [ ]:
# Rows with cov == 16, reduced to the four columns used below.
# `var.f` is a factor copy of `var` (appended as the last column) and
# `seed` is converted to a factor in place.
dat = results %>%
        filter(cov==16) %>%
        select(rho, metric, var, seed) %>%
        mutate(var.f = as.factor(var),
               seed = as.factor(seed))

str(dat)

In [ ]:
# Boxplots of rho for each level of `var.f`, one box per metric.
# The fill mapping is declared once on the plot and inherited by the layer.
ggplot(data=dat, mapping=aes(x=var.f, y=rho, fill=metric)) +
    geom_boxplot()

In [ ]:
# Rows where var is 0.01, reduced to the columns used below.
# NOTE: this reassigns `dat`, replacing the value set by the earlier cell.
# `near()` is used instead of `==` because `var` is a floating-point column
# parsed from text; a tolerance-based comparison does not depend on the
# parsed value being bit-identical to the literal 0.01.
dat = results %>%
        filter(near(var, 0.01)) %>%
        select(rho, metric, cov, seed)
# Mean and standard deviation of rho over all rows of each
# (cov, metric) combination.
dat.summ = dat %>%
        group_by(cov, metric) %>%
        summarise(rho_av=mean(rho), rho_err=sd(rho))
summary(dat)
summary(dat.summ)

In [ ]:
# Mean rho against cov (log10 x axis), one line per metric, with a
# translucent band spanning mean - sd to mean + sd.
ggplot(data=dat.summ, mapping=aes(x=cov, y=rho_av)) +
    geom_line(mapping=aes(linetype=metric)) +
    geom_ribbon(mapping=aes(ymin=rho_av-rho_err,
                            ymax=rho_av+rho_err,
                            fill=metric),
                alpha=0.2) +
    theme_bw() +
    scale_x_log10()

In [ ]:
# Show the structure of `dat`, then draw rho against cov (log10 x axis)
# with colour mapped to seed (as a factor) and line type mapped to metric.
str(dat)
ggplot(data=dat) +
    geom_line(mapping=aes(x=cov, y=rho, colour=as.factor(seed), linetype=metric)) +
    scale_x_log10()

$\pi$ vs performance


In [ ]:
# Mean and standard deviation of rho for each (var, metric) combination.
# Only the grouping columns and the two summaries appear in the result,
# so no column selection is needed beforehand.
summ = results %>%
           group_by(var, metric) %>%
           summarise(rho_av=mean(rho), rho_sd=sd(rho))

str(summ)

In [ ]:
# Mean rho against var (log10 x axis), one line per metric, with a
# translucent band spanning mean - sd to mean + sd. The plot object is
# stored in `p` and then printed.
p = ggplot(summ, aes(x=var, y=rho_av, group=metric)) +
    geom_line(aes(linetype=metric)) +
    geom_ribbon(aes(ymin=rho_av-rho_sd, ymax=rho_av+rho_sd, fill=metric),
                alpha=0.2) +
    scale_x_log10() +
    labs(x=expression(paste('Mean pairwise variation (', pi, ')')),
         y=expression(paste("Spearman's ", rho, " +- SD"))) +
    theme_bw()

print(p)

In [ ]: