In [ ]:
library(tidyr)
library(dplyr, warn.conflicts=F, quietly=T)
library(ggplot2)
In [ ]:
if (!dir.exists("plots/")) { dir.create("plots")}
In [ ]:
results.all = read.delim('2017-01-23.tsv', header=F,
col.names=c("seed", "metric", "sketchsize", "cov", "var", "rho"))
str(results.all)
In [ ]:
table(results.all$sketchsize)
In [ ]:
# Take the best sketch sizes
results = results.all %>%
filter(sketchsize == 1e4 & metric %in% c("mashec", "mash")
| sketchsize==1e7 & metric %in% c("ip", "wip"))
table(results$sketchsize)
In [ ]:
# Remove redundant mash datapoints
results = results %>% filter(cov > 8 | (cov <= 8 & metric != "mashec"))
In [ ]:
ggplot(results, aes(x=cov, y=var)) +
geom_point() +
scale_x_log10() +
scale_y_log10() +
theme_bw()
In [ ]:
ggplot(results, aes(x=cov, y=var, color=metric, shape=metric)) +
geom_point(position = position_dodge(width=0.15), size=2) +
scale_x_log10() +
scale_y_log10() +
theme_bw()
In [ ]:
dat = results %>%
filter(cov==32 | cov==8) %>%
select(rho, metric, var, cov, seed)
dat$metric = factor(dat$metric, labels=c("IP", "Mash (without EC)", "Mash (with EC)", "WIP"))
dat$cov.f = as.factor(dat$cov)
dat$var.f = as.factor(dat$var)
dat$seed = as.factor(dat$seed)
str(dat)
In [ ]:
p = ggplot(dat, aes(x=var.f, y=rho, fill=metric)) +
geom_boxplot(aes(fill=metric)) +
xlab(expression(paste('Mean pairwise nucleotide divergence (', pi, ')'))) +
ylab(expression(paste("Spearman's ", rho, " +- SD"))) +
facet_wrap(~cov.f ) +
ylim(0, 1) +
theme_classic() +
theme(axis.text.x=element_text(angle = 45, hjust = 1, vjust=1),
legend.position = "bottom")
pdf("plots/pi_both_box.pdf", width=8, height = 4)
print(p)
dev.off()
print(p)
In [ ]:
dat = dat %>%
filter(cov==8)
p = ggplot(dat, aes(x=var.f, y=rho, fill=metric)) +
geom_boxplot(aes(fill=metric)) +
xlab(expression(paste('Mean pairwise nucleotide divergence (', pi, ')'))) +
ylab(expression(paste("Spearman's ", rho, " +- SD"))) +
ylim(0, 1) +
theme_classic() +
theme(axis.text.x=element_text(angle = 45, hjust = 1, vjust=1))
pdf("plots/pi_8x_box.pdf", width=4, height = 3)
print(p)
dev.off()
print(p)
In [ ]:
dat = results %>%
filter(var==0.01 | var==0.005 | var==0.002) %>%
select(rho, metric, cov, var, seed)
dat$metric = factor(dat$metric, labels=c("IP", "Mash (without EC)", "Mash (with EC)", "WIP"))
dat$cov.f = as.factor(dat$cov)
dat$var.f = as.factor(dat$var)
In [ ]:
p = ggplot(dat, aes(x=cov.f, y=rho, fill=metric)) +
geom_boxplot(aes(fill=metric)) +
scale_fill_discrete(guide = guide_legend(title = "Metric")) +
xlab(expression(paste('Mean sequencing depth'))) +
ylab(expression(paste("Spearman's ", rho, " +- SD"))) +
facet_wrap(~var.f ) +
ylim(0, 1) +
theme_classic() +
theme(legend.position = "bottom")
print(p)
pdf("plots/cov_all_box.pdf", width=12, height = 4)
print(p)
dev.off()
In [ ]:
dat = dat %>%
filter(var==0.005)
p = ggplot(dat, aes(x=cov.f, y=rho, fill=metric)) +
geom_boxplot(aes(fill=metric)) +
scale_fill_discrete(guide = guide_legend(title = "Metric")) +
xlab(expression(paste('Mean sequencing depth'))) +
ylab(expression(paste("Spearman's ", rho, " +- SD"))) +
ylim(0, 1) +
theme_classic() +
theme(axis.text.x=element_text(angle = 45, hjust = 1, vjust=1)) +
theme(legend.position = "bottom")
print(p)
pdf("plots/cov_200_box.pdf", width=8, height = 3)
print(p)
dev.off()
In [ ]:
dat.summ = results %>%
filter(var==0.005) %>%
select(rho, metric, cov, seed) %>%
group_by(cov, metric) %>%
summarise(rho_av=mean(rho), rho_err=sd(rho))
dat.summ$metric = factor(dat.summ$metric, labels=c("IP", "Mash (without EC)", "Mash (with EC)", "WIP"))
In [ ]:
p = ggplot(dat.summ, aes(x=cov, y=rho_av)) +
geom_line(aes(linetype=metric)) +
geom_ribbon(aes(fill=metric, ymin=rho_av-rho_err, ymax=rho_av+rho_err), alpha=0.2) +
scale_x_log10() +
xlab(expression(paste('Mean sequencing depth'))) +
ylab(expression(paste("Spearman's ", rho, " +- SD"))) +
ylim(0, 1) +
theme_bw()
print(p)
pdf("plots/coverage.pdf", width=4, height = 3)
print(p)
dev.off()
In [ ]:
str(dat)
ggplot(dat, aes(x=cov, y=rho, colour=as.factor(seed), linetype=metric)) +
geom_line() +
scale_x_log10()