In [ ]:
library(tidyr)
library(dplyr, warn.conflicts=F, quietly=T)
library(ggplot2)
library(scales)
library(Cairo)
In [ ]:
unlink("plots/", recursive = T)
dir.create("plots")
In [ ]:
results.all = read.delim('2017-03-30_finalsims.tsv', header=F,
col.names=c("seed", "metric", "sketchsize", "cov", "var", "rho"))
str(results.all)
table(results.all$sketchsize)
In [ ]:
# Take the best sketch sizes
results = results.all %>%
filter(sketchsize == 1e4 & metric %in% c("mashec", "mash")
| sketchsize==1e7 & metric %in% c("ip", "wip"))
table(results$sketchsize)
In [ ]:
# Remove redundant mash datapoints
#results = results %>% filter(cov > 8 | (cov <= 8 & metric != "mashec"))
results$metric = factor(results$metric, labels=c("IP", "Mash", "Mash (AF)", "WIP"))
results$metric = relevel(results$metric, ref="WIP")
In [ ]:
ggplot(results, aes(x=cov, y=var)) +
geom_point() +
scale_x_log10() +
scale_y_log10() +
theme_bw()
In [ ]:
pi.dat = results %>%
filter(cov==32 | cov==8) %>%
select(rho, metric, var, cov, seed) %>%
mutate(cov.f = as.factor(cov),
var.f = as.factor(var),
seed = as.factor(seed))
pi.dat.summ = pi.dat %>%
group_by(cov, metric, var) %>%
summarise(rho_av=mean(rho),
rho_sd=sd(rho),
rho_med=median(rho),
rho_25=quantile(rho, p=c(1/4)),
rho_75=quantile(rho, p=c(3/4))) %>%
mutate(cov.f = as.factor(cov),
var.f = as.factor(var))
In [ ]:
str(pi.dat)
summary(pi.dat)
In [ ]:
#str(pi.dat.summ)
#summary(pi.dat.summ)
In [ ]:
p = ggplot(pi.dat, aes(x=var.f, y=rho, fill=metric)) +
geom_boxplot(aes(fill=metric)) +
xlab(expression(paste('Mean pairwise nucleotide divergence (', pi, ')'))) +
ylab(expression(paste("Spearman's ", rho, " +- SD"))) +
facet_wrap(~cov.f ) +
ylim(0, 1) +
theme_classic() +
theme(axis.text.x=element_text(angle = 45, hjust = 1, vjust=1),
legend.position = "bottom")
pdf("plots/pi_both_box.pdf", width=7, height = 3)
print(p)
dev.off()
svg("plots/pi_both_box.svg", width=7, height = 3)
print(p)
dev.off()
#print(p)
In [ ]:
p = ggplot(pi.dat.summ, aes(x=var, y=rho_med)) +
geom_line(aes(linetype=metric)) +
geom_ribbon(aes(fill=metric, ymin=rho_av - rho_sd, ymax=rho_av + rho_sd), alpha=0.2) +
scale_x_log10() +
facet_wrap(~cov) +
xlab(expression(paste('Mean pairwise nucleotide divergence (', pi, ')'))) +
ylab(expression(paste("spearman's ", rho))) +
ylim(0, 1) +
theme_bw()
pdf("plots/pi_both_avgsd.pdf", width=7, height = 3)
print(p)
dev.off()
svg("plots/pi_both_avgsd.svg", width=7, height = 3)
print(p)
dev.off()
#print(p)
In [ ]:
p = ggplot(filter(pi.dat, cov==8), aes(x=var.f, y=rho, fill=metric)) +
geom_boxplot(aes(fill=metric)) +
xlab(expression(paste('Mean pairwise nucleotide divergence (', pi, ')'))) +
ylab(expression(paste("Spearman's ", rho, " +- SD"))) +
ylim(0, 1) +
theme_bw() +
theme(axis.text.x=element_text(angle = 45, hjust = 1, vjust=1))
pdf("plots/pi_8x_box.pdf", width=4, height = 3)
print(p)
dev.off()
svg("plots/pi_8x_box.svg", width=4, height = 3)
print(p)
dev.off()
# print(p)
In [ ]:
p = ggplot(filter(pi.dat.summ, cov==8), aes(x=var, y=rho_av, fill=metric)) +
geom_line(aes(linetype=metric)) +
geom_ribbon(aes(fill=metric, ymin=rho_av - rho_sd, ymax=rho_av + rho_sd), alpha=0.2) +
scale_fill_discrete(guide = guide_legend(title = "Metric")) +
scale_linetype_manual(guide = guide_legend(title = "Metric"),
values=c("solid", "dotdash", "dotted", "dashed")) +
xlab(expression(paste('Mean pairwise nucleotide divergence (', pi, ')'))) +
ylab(expression(paste("Spearman's ", rho, " +- SD"))) +
ylim(0, 1) +
scale_x_continuous(trans="log10",
breaks=c(0.001, 0.005, 0.01, 0.05, 0.1),
#minor_breaks=c(0.001, 0.002, 0.005, 0.01,0.02, 0.05, 0.1, 0.2),
#labels = trans_format("log10", math_format(10^.x))
) +
theme_bw() +
theme(
# panel.grid.minor = element_blank()
#, axis.text.x=element_text(angle = 30, hjust = 1, vjust=1)
)
pdf("plots/pi_8x_avgsd.pdf", width=4.5, height = 3)
print(p)
dev.off()
svg("plots/pi_8x_avgsd.svg", width=4.5, height = 3)
print(p)
dev.off()
print(p)
In [ ]:
whichcov = 0.005
cov.dat = results %>%
filter(var %in% c(0.001, 0.005, 0.01)) %>%
select(rho, metric, cov, var, seed) %>%
mutate(cov.f = as.factor(cov),
var.f = as.factor(var))
In [ ]:
cov.dat.summ = cov.dat %>%
group_by(cov, metric, var) %>%
summarise(rho_av=mean(rho),
rho_sd=sd(rho),
rho_med=median(rho),
rho_25=quantile(rho, p=c(1/4)),
rho_75=quantile(rho, p=c(3/4))) %>%
mutate(cov.f = as.factor(cov),
var.f = as.factor(var))
In [ ]:
p = ggplot(cov.dat, aes(x=cov.f, y=rho, fill=metric)) +
geom_boxplot(aes(fill=metric)) +
scale_fill_discrete(guide = guide_legend(title = "Metric")) +
xlab(expression(paste('Mean sequencing depth'))) +
ylab(expression(paste("Spearman's ", rho, " +- SD"))) +
facet_wrap(~var.f ) +
ylim(0, 1) +
theme_classic() +
theme(legend.position = "bottom")
pdf("plots/cov_all_box.pdf", width=7, height = 3)
print(p)
dev.off()
#kprint(p)
In [ ]:
p = ggplot(filter(cov.dat, var==whichcov), aes(x=cov.f, y=rho, fill=metric)) +
geom_boxplot(aes(fill=metric)) +
scale_fill_discrete(guide = guide_legend(title = "Metric")) +
xlab(expression(paste('Mean sequencing depth'))) +
ylab(expression(paste("Spearman's ", rho, plusminus, " +- SD"))) +
ylim(0, 1) +
theme_classic() +
theme(axis.text.x=element_text(angle = 45, hjust = 1, vjust=1)) +
theme(legend.position = "bottom")
pdf("plots/cov_200_box.pdf", width=7, height = 3)
print(p)
dev.off()
#print(p)
In [ ]:
p = ggplot(cov.dat.summ, aes(x=cov, y=rho_med)) +
geom_line(aes(linetype=metric)) +
geom_ribbon(aes(fill=metric, ymin=rho_25, ymax=rho_75), alpha=0.2) +
scale_fill_discrete(guide = guide_legend(title = "Metric")) +
scale_linetype_manual(guide = guide_legend(title = "Metric"),
values=c("solid", "dotdash", "dotted", "dashed")) +
scale_x_log10() +
facet_wrap(~var) +
xlab(expression(paste('Mean sequencing depth'))) +
ylab(expression(paste("Spearman's ", rho))) +
ylim(0, 1) +
theme_bw()
pdf("plots/cov_all_avgsd.pdf", width=10, height = 3)
print(p)
dev.off()
#print(p)
In [ ]:
p = ggplot(filter(cov.dat.summ, var==whichcov), aes(x=cov, y=rho_av)) +
geom_line(aes(linetype=metric)) +
geom_ribbon(aes(fill=metric, ymin=rho_av-rho_sd, ymax=rho_av+rho_sd), alpha=0.2) +
scale_fill_discrete(guide = guide_legend(title = "Metric")) +
scale_linetype_manual(guide = guide_legend(title = "Metric"),
values=c("solid", "dotdash", "dotted", "dashed")) +
scale_x_log10(breaks=c(1,5,10,50,100)) +
xlab(expression(paste('Mean sequencing depth'))) +
ylab(expression(paste("Spearman's ", rho, " +- SD"))) +
ylim(0, 1) +
theme_bw() +
theme(
#panel.grid.minor=element_blank()
)
pdf("plots/cov_200_avgsd.pdf", width=4.5, height = 3)
print(p)
dev.off()
svg("plots/cov_200_avgsd.svg", width=4.5, height = 3)
print(p)
dev.off()
print(p)
In [ ]: