In [ ]:
library(tidyr)
library(dplyr, warn.conflicts=F, quietly=T)
library(ggplot2)
library(scales)
library(Cairo)

In [ ]:
unlink("plots/", recursive = T)
dir.create("plots")

In [ ]:
results.all = read.delim('2017-03-30_finalsims.tsv', header=F,
                     col.names=c("seed", "metric", "sketchsize", "cov", "var", "rho"))
str(results.all)
table(results.all$sketchsize)

In [ ]:
# Take the best sketch sizes
results =  results.all %>%
    filter(sketchsize == 1e4 & metric %in% c("mashec", "mash")
           | sketchsize==1e7 & metric %in% c("ip", "wip"))
table(results$sketchsize)

In [ ]:
# Remove redundant mash datapoints
#results = results %>% filter(cov > 8 | (cov <= 8 & metric != "mashec"))
results$metric = factor(results$metric, labels=c("IP", "Mash", "Mash (AF)", "WIP"))
results$metric = relevel(results$metric, ref="WIP")

Parameter ranges


In [ ]:
ggplot(results, aes(x=cov, y=var)) +
    geom_point() +
    scale_x_log10() +
    scale_y_log10() +
    theme_bw()

PI

Performance across $\pi$ at both 8x and 32x


In [ ]:
pi.dat = results %>%
        filter(cov==32 | cov==8) %>%
        select(rho, metric, var, cov, seed)  %>%
        mutate(cov.f = as.factor(cov),
               var.f = as.factor(var),
               seed = as.factor(seed))

pi.dat.summ = pi.dat %>%
        group_by(cov, metric, var) %>%
        summarise(rho_av=mean(rho),
                  rho_sd=sd(rho),
                  rho_med=median(rho),
                  rho_25=quantile(rho, p=c(1/4)),
                  rho_75=quantile(rho, p=c(3/4))) %>%
        mutate(cov.f = as.factor(cov),
               var.f = as.factor(var))

In [ ]:
str(pi.dat)
summary(pi.dat)

In [ ]:
#str(pi.dat.summ)
#summary(pi.dat.summ)

In [ ]:
p = ggplot(pi.dat, aes(x=var.f, y=rho, fill=metric)) +
    geom_boxplot(aes(fill=metric)) +
    xlab(expression(paste('Mean pairwise nucleotide divergence (', pi, ')'))) +
    ylab(expression(paste("Spearman's ", rho, " +- SD"))) +
    facet_wrap(~cov.f ) + 
    ylim(0, 1) +
    theme_classic() + 
    theme(axis.text.x=element_text(angle = 45, hjust = 1, vjust=1),
          legend.position = "bottom")

pdf("plots/pi_both_box.pdf", width=7, height = 3)
print(p)
dev.off()
svg("plots/pi_both_box.svg", width=7, height = 3)
print(p)
dev.off()

#print(p)

In [ ]:
p = ggplot(pi.dat.summ, aes(x=var, y=rho_med)) +
    geom_line(aes(linetype=metric)) +
    geom_ribbon(aes(fill=metric, ymin=rho_av - rho_sd, ymax=rho_av + rho_sd), alpha=0.2) +
    scale_x_log10() +
    facet_wrap(~cov) + 
    xlab(expression(paste('Mean pairwise nucleotide divergence (', pi, ')'))) +
    ylab(expression(paste("spearman's ", rho))) +
    ylim(0, 1) +
    theme_bw()

pdf("plots/pi_both_avgsd.pdf", width=7, height = 3)
print(p)
dev.off()
svg("plots/pi_both_avgsd.svg", width=7, height = 3)
print(p)
dev.off()
#print(p)

In [ ]:
p = ggplot(filter(pi.dat, cov==8), aes(x=var.f, y=rho, fill=metric)) +
    geom_boxplot(aes(fill=metric)) +
    xlab(expression(paste('Mean pairwise nucleotide divergence (', pi, ')'))) +
    ylab(expression(paste("Spearman's ", rho, " +- SD"))) +
    ylim(0, 1) +
    theme_bw() + 
    theme(axis.text.x=element_text(angle = 45, hjust = 1, vjust=1))

pdf("plots/pi_8x_box.pdf", width=4, height = 3)
print(p)
dev.off()
svg("plots/pi_8x_box.svg", width=4, height = 3)
print(p)
dev.off()

# print(p)

In [ ]:
p = ggplot(filter(pi.dat.summ, cov==8), aes(x=var, y=rho_av, fill=metric)) +
    geom_line(aes(linetype=metric)) +
    geom_ribbon(aes(fill=metric, ymin=rho_av - rho_sd, ymax=rho_av + rho_sd), alpha=0.2) +
    scale_fill_discrete(guide = guide_legend(title = "Metric")) +
    scale_linetype_manual(guide = guide_legend(title = "Metric"),
                          values=c("solid", "dotdash", "dotted", "dashed")) +
    xlab(expression(paste('Mean pairwise nucleotide divergence (', pi, ')'))) +
    ylab(expression(paste("Spearman's ", rho, " +- SD"))) +
    ylim(0, 1) +
    scale_x_continuous(trans="log10",
                       breaks=c(0.001, 0.005, 0.01, 0.05, 0.1),
                       #minor_breaks=c(0.001, 0.002, 0.005, 0.01,0.02, 0.05, 0.1, 0.2),
                       #labels = trans_format("log10", math_format(10^.x)) 
                       ) +
    theme_bw() + 
    theme(
          #  panel.grid.minor = element_blank()
          #, axis.text.x=element_text(angle = 30, hjust = 1, vjust=1)
          )

pdf("plots/pi_8x_avgsd.pdf", width=4.5, height = 3)
print(p)
dev.off()
svg("plots/pi_8x_avgsd.svg", width=4.5, height = 3)
print(p)
dev.off()
print(p)

Coverage


In [ ]:
whichcov = 0.005
cov.dat = results %>%
        filter(var %in% c(0.001, 0.005, 0.01)) %>%
        select(rho, metric, cov, var, seed) %>% 
        mutate(cov.f = as.factor(cov),
               var.f = as.factor(var))

In [ ]:
cov.dat.summ = cov.dat %>%
        group_by(cov, metric, var) %>%
        summarise(rho_av=mean(rho),
                  rho_sd=sd(rho),
                  rho_med=median(rho),
                  rho_25=quantile(rho, p=c(1/4)),
                  rho_75=quantile(rho, p=c(3/4))) %>%
        mutate(cov.f = as.factor(cov),
               var.f = as.factor(var))

In [ ]:
p = ggplot(cov.dat, aes(x=cov.f, y=rho, fill=metric)) +
    geom_boxplot(aes(fill=metric)) +
    scale_fill_discrete(guide = guide_legend(title = "Metric")) +
    xlab(expression(paste('Mean sequencing depth'))) +
    ylab(expression(paste("Spearman's ", rho, " +- SD"))) +
    facet_wrap(~var.f ) + 
    ylim(0, 1) +
    theme_classic() +
    theme(legend.position = "bottom")

pdf("plots/cov_all_box.pdf", width=7, height = 3)
print(p)
dev.off()
#kprint(p)

In [ ]:
p = ggplot(filter(cov.dat, var==whichcov), aes(x=cov.f, y=rho, fill=metric)) +
    geom_boxplot(aes(fill=metric)) +
    scale_fill_discrete(guide = guide_legend(title = "Metric")) +
    xlab(expression(paste('Mean sequencing depth'))) +
    ylab(expression(paste("Spearman's ", rho, plusminus,  " +- SD"))) +
    ylim(0, 1) +
    theme_classic() + 
    theme(axis.text.x=element_text(angle = 45, hjust = 1, vjust=1)) +
    theme(legend.position = "bottom")

pdf("plots/cov_200_box.pdf", width=7, height = 3)
print(p)
dev.off()
#print(p)

In [ ]:
p = ggplot(cov.dat.summ, aes(x=cov, y=rho_med)) +
    geom_line(aes(linetype=metric)) +
    geom_ribbon(aes(fill=metric, ymin=rho_25, ymax=rho_75), alpha=0.2) +
    scale_fill_discrete(guide = guide_legend(title = "Metric")) +
    scale_linetype_manual(guide = guide_legend(title = "Metric"),
                          values=c("solid", "dotdash", "dotted", "dashed")) +
    scale_x_log10() +
    facet_wrap(~var) + 
    xlab(expression(paste('Mean sequencing depth'))) +
    ylab(expression(paste("Spearman's ", rho))) +
    ylim(0, 1) +
    theme_bw()

pdf("plots/cov_all_avgsd.pdf", width=10, height = 3)
print(p)
dev.off()

#print(p)

In [ ]:
p = ggplot(filter(cov.dat.summ, var==whichcov), aes(x=cov, y=rho_av)) +
    geom_line(aes(linetype=metric)) +
    geom_ribbon(aes(fill=metric, ymin=rho_av-rho_sd, ymax=rho_av+rho_sd), alpha=0.2) +
    scale_fill_discrete(guide = guide_legend(title = "Metric")) +
    scale_linetype_manual(guide = guide_legend(title = "Metric"),
                          values=c("solid", "dotdash", "dotted", "dashed")) +
    scale_x_log10(breaks=c(1,5,10,50,100)) +
    xlab(expression(paste('Mean sequencing depth'))) +
    ylab(expression(paste("Spearman's ", rho, " +- SD"))) +
    ylim(0, 1) +
    theme_bw() +
    theme(
        #panel.grid.minor=element_blank()
    )

pdf("plots/cov_200_avgsd.pdf", width=4.5, height = 3)
print(p)
dev.off()
svg("plots/cov_200_avgsd.svg", width=4.5, height = 3)
print(p)
dev.off()

print(p)

In [ ]: