In [ ]:
library(tidyr)
library(dplyr, warn.conflicts=F, quietly=T)
library(ggplot2)
library(Cairo)

In [ ]:
unlink("plots/", recursive = T)
dir.create("plots")

In [ ]:
results.all = read.delim('2017-01-29.tab', header=F,
                     col.names=c("seed", "metric", "sketchsize", "cov", "var", "rho"))
str(results.all)

In [ ]:
# Take the best sketch sizes
results =  results.all %>%
    filter(sketchsize == 1e5 & metric %in% c("mashec", "mash")
           | sketchsize==1e7 & metric %in% c("ip", "wip"))
table(results$sketchsize)

In [ ]:
# Remove redundant mash datapoints
#results = results %>% filter(cov > 8 | (cov <= 8 & metric != "mashec"))
results$metric = factor(results$metric, labels=c("IP", "Mash (min. abund. 1)", "Mash (min. abund. 2)", "WIP"))

Parameter ranges


In [ ]:
ggplot(results, aes(x=cov, y=var)) +
    geom_point() +
    scale_x_log10() +
    scale_y_log10() +
    theme_bw()

PI

Performance across $\pi$ at both 8x and 32x


In [ ]:
pi.dat = results %>%
        filter(cov==32 | cov==8) %>%
        select(rho, metric, var, cov, seed)  %>%
        mutate(cov.f = as.factor(cov),
               var.f = as.factor(var),
               seed = as.factor(seed))

pi.dat.summ = pi.dat %>%
        group_by(cov, metric, var) %>%
        summarise(rho_av=mean(rho),
                  rho_sd=sd(rho),
                  rho_med=median(rho),
                  rho_25=quantile(rho, p=c(1/4)),
                  rho_75=quantile(rho, p=c(3/4))) %>%
        mutate(cov.f = as.factor(cov),
               var.f = as.factor(var))

In [ ]:
str(pi.dat)
summary(pi.dat)

In [ ]:
str(pi.dat.summ)
summary(pi.dat.summ)

In [ ]:
p = ggplot(pi.dat, aes(x=var.f, y=rho, fill=metric)) +
    geom_boxplot(aes(fill=metric)) +
    xlab(expression(paste('Mean pairwise nucleotide divergence (', pi, ')'))) +
    ylab(expression(paste("Spearman's ", rho, " +- SD"))) +
    facet_wrap(~cov.f ) + 
    ylim(0, 1) +
    theme_classic() + 
    theme(axis.text.x=element_text(angle = 45, hjust = 1, vjust=1),
          legend.position = "bottom")

pdf("plots/pi_both_box.pdf", width=7, height = 3)
print(p)
dev.off()
svg("plots/pi_both_box.svg", width=7, height = 3)
print(p)
dev.off()

# print(p)

In [ ]:
p = ggplot(pi.dat.summ, aes(x=var, y=rho_med)) +
    geom_line(aes(linetype=metric)) +
    geom_ribbon(aes(fill=metric, ymin=rho_25, ymax=rho_75), alpha=0.2) +
    scale_x_log10() +
    facet_wrap(~cov) + 
    xlab(expression(paste('Mean pairwise nucleotide divergence (', pi, ')'))) +
    ylab(expression(paste("spearman's ", rho))) +
    ylim(0, 1) +
    theme_bw()

pdf("plots/pi_both_quartline.pdf", width=7, height = 3)
print(p)
dev.off()
svg("plots/pi_both_quartline.svg", width=7, height = 3)
print(p)
dev.off()
# print(p)

In [ ]:
p = ggplot(filter(pi.dat, cov==8), aes(x=var.f, y=rho, fill=metric)) +
    geom_boxplot(aes(fill=metric)) +
    xlab(expression(paste('Mean pairwise nucleotide divergence (', pi, ')'))) +
    ylab(expression(paste("Spearman's ", rho, " +- SD"))) +
    ylim(0, 1) +
    theme_bw() + 
    theme(axis.text.x=element_text(angle = 45, hjust = 1, vjust=1))

pdf("plots/pi_8x_box.pdf", width=4, height = 3)
print(p)
dev.off()
svg("plots/pi_8x_box.svg", width=4, height = 3)
print(p)
dev.off()

# print(p)

In [ ]:
p = ggplot(filter(pi.dat.summ, cov==8), aes(x=var, y=rho_av, fill=metric)) +
    geom_line(aes(linetype=metric)) +
    geom_ribbon(aes(fill=metric, ymin=rho_av - rho_sd, ymax=rho_av + rho_sd), alpha=0.2) +
    xlab(expression(paste('Mean pairwise nucleotide divergence (', pi, ')'))) +
    ylab(expression(paste("Spearman's ", rho, " +- SD"))) +
    ylim(0, 1) +
    scale_x_log10() +
    theme_bw()# + 
    #theme(axis.text.x=element_text(angle = 45, hjust = 1, vjust=1))

pdf("plots/pi_8x_avgsd.pdf", width=6, height = 3)
print(p)
dev.off()
svg("plots/pi_8x_avgsd.svg", width=6, height = 3)
print(p)
dev.off()
# print(p)

Coverage


In [ ]:
cov.dat = results %>%
        filter(var %in% c(0.002, 0.005, 0.01)) %>%
        select(rho, metric, cov, var, seed) %>% 
        mutate(cov.f = as.factor(cov),
               var.f = as.factor(var))

In [ ]:
cov.dat.summ = cov.dat %>%
        group_by(cov, metric, var) %>%
        summarise(rho_av=mean(rho),
                  rho_sd=sd(rho),
                  rho_med=median(rho),
                  rho_25=quantile(rho, p=c(1/4)),
                  rho_75=quantile(rho, p=c(3/4))) %>%
        mutate(cov.f = as.factor(cov),
               var.f = as.factor(var))

In [ ]:
p = ggplot(cov.dat, aes(x=cov.f, y=rho, fill=metric)) +
    geom_boxplot(aes(fill=metric)) +
    scale_fill_discrete(guide = guide_legend(title = "Metric")) +
    xlab(expression(paste('Mean sequencing depth'))) +
    ylab(expression(paste("Spearman's ", rho, " +- SD"))) +
    facet_wrap(~var.f ) + 
    ylim(0, 1) +
    theme_classic() +
    theme(legend.position = "bottom")

pdf("plots/cov_all_box.pdf", width=7, height = 3)
print(p)
dev.off()
# print(p)

In [ ]:
p = ggplot(filter(cov.dat, var==0.002), aes(x=cov.f, y=rho, fill=metric)) +
    geom_boxplot(aes(fill=metric)) +
    scale_fill_discrete(guide = guide_legend(title = "Metric")) +
    xlab(expression(paste('Mean sequencing depth'))) +
    ylab(expression(paste("Spearman's ", rho, " +- SD"))) +
    ylim(0, 1) +
    theme_classic() + 
    theme(axis.text.x=element_text(angle = 45, hjust = 1, vjust=1)) +
    theme(legend.position = "bottom")

pdf("plots/cov_500_box.pdf", width=7, height = 3)
print(p)
dev.off()
# print(p)

In [ ]:
p = ggplot(cov.dat.summ, aes(x=cov, y=rho_med)) +
    geom_line(aes(linetype=metric)) +
    geom_ribbon(aes(fill=metric, ymin=rho_25, ymax=rho_75), alpha=0.2) +
    scale_x_log10() +
    facet_wrap(~var) + 
    xlab(expression(paste('Mean sequencing depth'))) +
    ylab(expression(paste("Spearman's ", rho))) +
    ylim(0, 1) +
    theme_bw()

pdf("plots/cov_all_avgsd.pdf", width=7, height = 3)
print(p)
dev.off()

# print(p)

In [ ]:
p = ggplot(filter(cov.dat.summ, var==0.002), aes(x=cov, y=rho_av)) +
    geom_line(aes(linetype=metric)) +
    geom_ribbon(aes(fill=metric, ymin=rho_av-rho_sd, ymax=rho_av+rho_sd), alpha=0.2) +
    scale_x_log10() +
    xlab(expression(paste('Mean sequencing depth'))) +
    ylab(expression(paste("Spearman's ", rho, " +- SD"))) +
    ylim(0, 1) +
    theme_bw()

pdf("plots/cov_500_avgsd.pdf", width=6, height = 3)
print(p)
dev.off()
svg("plots/cov_500_avgsd.svg", width=6, height = 3)
print(p)
dev.off()

# print(p)

In [ ]: