In [ ]:
library(tidyr)
library(dplyr, warn.conflicts=F, quietly=T)
library(ggplot2)

In [ ]:
if (!dir.exists("plots/")) { dir.create("plots")}

In [ ]:
results.all = read.delim('2017-01-23.tsv', header=F,
                     col.names=c("seed", "metric", "sketchsize", "cov", "var", "rho"))
str(results.all)

In [ ]:
table(results.all$sketchsize)

In [ ]:
# Take the best sketch sizes
results =  results.all %>%
    filter(sketchsize == 1e4 & metric %in% c("mashec", "mash")
           | sketchsize==1e7 & metric %in% c("ip", "wip"))
table(results$sketchsize)

In [ ]:
# Remove redundant mash datapoints
results = results %>% filter(cov > 8 | (cov <= 8 & metric != "mashec"))

Parameter ranges


In [ ]:
ggplot(results, aes(x=cov, y=var)) +
    geom_point() +
    scale_x_log10() +
    scale_y_log10() +
    theme_bw()

In [ ]:
ggplot(results, aes(x=cov, y=var, color=metric, shape=metric)) +
    geom_point(position = position_dodge(width=0.15), size=2) +
    scale_x_log10() +
    scale_y_log10() +
    theme_bw()

PI

Performance across $\pi$ at both 8x and 32x


In [ ]:
dat = results %>%
        filter(cov==32 | cov==8) %>%
        select(rho, metric, var, cov, seed) 
dat$metric = factor(dat$metric, labels=c("IP", "Mash (without EC)", "Mash (with EC)", "WIP"))

dat$cov.f = as.factor(dat$cov)
dat$var.f = as.factor(dat$var)
dat$seed = as.factor(dat$seed)
str(dat)

In [ ]:
p = ggplot(dat, aes(x=var.f, y=rho, fill=metric)) +
    geom_boxplot(aes(fill=metric)) +
    xlab(expression(paste('Mean pairwise nucleotide divergence (', pi, ')'))) +
    ylab(expression(paste("Spearman's ", rho, " +- SD"))) +
    facet_wrap(~cov.f ) + 
    ylim(0, 1) +
    theme_classic() + 
    theme(axis.text.x=element_text(angle = 45, hjust = 1, vjust=1),
          legend.position = "bottom")

pdf("plots/pi_both_box.pdf", width=8, height = 4)
print(p)
dev.off()
print(p)

In [ ]:
dat = dat %>%
        filter(cov==8)

p = ggplot(dat, aes(x=var.f, y=rho, fill=metric)) +
    geom_boxplot(aes(fill=metric)) +
    xlab(expression(paste('Mean pairwise nucleotide divergence (', pi, ')'))) +
    ylab(expression(paste("Spearman's ", rho, " +- SD"))) +
    ylim(0, 1) +
    theme_classic() + 
    theme(axis.text.x=element_text(angle = 45, hjust = 1, vjust=1))

pdf("plots/pi_8x_box.pdf", width=4, height = 3)
print(p)
dev.off()
print(p)

Coverage


In [ ]:
dat = results %>%
        filter(var==0.01 | var==0.005 | var==0.002) %>%
        select(rho, metric, cov, var, seed)

dat$metric = factor(dat$metric, labels=c("IP", "Mash (without EC)", "Mash (with EC)", "WIP"))
dat$cov.f = as.factor(dat$cov)
dat$var.f = as.factor(dat$var)

In [ ]:
p = ggplot(dat, aes(x=cov.f, y=rho, fill=metric)) +
    geom_boxplot(aes(fill=metric)) +
    scale_fill_discrete(guide = guide_legend(title = "Metric")) +
    xlab(expression(paste('Mean sequencing depth'))) +
    ylab(expression(paste("Spearman's ", rho, " +- SD"))) +
    facet_wrap(~var.f ) + 
    ylim(0, 1) +
    theme_classic() +
    theme(legend.position = "bottom")
print(p)
pdf("plots/cov_all_box.pdf", width=12, height = 4)
print(p)
dev.off()

In [ ]:
dat = dat %>%
        filter(var==0.005)

p = ggplot(dat, aes(x=cov.f, y=rho, fill=metric)) +
    geom_boxplot(aes(fill=metric)) +
    scale_fill_discrete(guide = guide_legend(title = "Metric")) +
    xlab(expression(paste('Mean sequencing depth'))) +
    ylab(expression(paste("Spearman's ", rho, " +- SD"))) +
    ylim(0, 1) +
    theme_classic() + 
    theme(axis.text.x=element_text(angle = 45, hjust = 1, vjust=1)) +
    theme(legend.position = "bottom")
print(p)
pdf("plots/cov_200_box.pdf", width=8, height = 3)
print(p)
dev.off()

In [ ]:
dat.summ = results %>%
        filter(var==0.005) %>%
        select(rho, metric, cov, seed) %>%
        group_by(cov, metric) %>%
        summarise(rho_av=mean(rho), rho_err=sd(rho))
dat.summ$metric = factor(dat.summ$metric, labels=c("IP", "Mash (without EC)", "Mash (with EC)", "WIP"))

In [ ]:
p = ggplot(dat.summ, aes(x=cov, y=rho_av)) +
    geom_line(aes(linetype=metric)) +
    geom_ribbon(aes(fill=metric, ymin=rho_av-rho_err, ymax=rho_av+rho_err), alpha=0.2) +
    scale_x_log10() +
    xlab(expression(paste('Mean sequencing depth'))) +
    ylab(expression(paste("Spearman's ", rho, " +- SD"))) +
    ylim(0, 1) +
    theme_bw()

print(p)
pdf("plots/coverage.pdf", width=4, height = 3)
print(p)
dev.off()

In [ ]:
str(dat)
ggplot(dat, aes(x=cov, y=rho, colour=as.factor(seed), linetype=metric)) +
    geom_line() +
    scale_x_log10()

$\pi$ vs performance

summ = results %>% filter(cov==8, metric!="mashec") %>% select(metric, rho, var, seed) %>% group_by(var, metric) %>% summarise(rho_av=mean(rho), rho_sd=sd(rho)) summ$metric = factor(as.character(summ$metric), levels="IP", "Mash", "WIP")
p = ggplot(summ, aes(x=var, y=rho_av, ymin=rho_av-rho_sd, ymax=rho_av+rho_sd, group=metric)) + geom_line(aes(linetype=metric)) + geom_ribbon(aes(fill=metric), alpha=0.2) + xlab(expression(paste('Mean pairwise variation (', pi, ')'))) + ylab(expression(paste("Spearman's ", rho, " +- SD"))) + scale_x_log10()+ theme_bw() print(p) pdf("plots/pi.pdf", width=5, height = 4) print(p) dev.off()