In [ ]:
from glob import glob
from os import path
import re
from skbio import DistanceMatrix
import pandas as pd
import numpy as np
from kwipexpt import *
%matplotlib inline
%load_ext rpy2.ipython
In [ ]:
%%R
library(tidyr)
library(dplyr, warn.conflicts=F, quietly=T)
library(ggplot2)
In [ ]:
expts = list(map(lambda fp: path.basename(fp.rstrip('/')), glob('data/*/')))
print("Expts:", *expts[:10], "...")
In [ ]:
def process_expt(expt):
expt_results = []
def extract_info(filename):
return re.search(r'kwip/(\d\.?\d*)x-(0\.\d+)-(wip|ip).dist', filename).groups()
# dict of scale: distance matrix, populated as we go
truths = {}
for distfile in glob("data/{}/kwip/*.dist".format(expt)):
cov, scale, metric = extract_info(distfile)
if scale not in truths:
genome_dist_path = 'data/{ex}/all_genomes-{sc}.dist'.format(ex=expt, sc=scale)
truths[scale] = load_sample_matrix_to_runs(genome_dist_path)
exptmat = DistanceMatrix.read(distfile)
rho = spearmans_rho_distmats(exptmat, truths[scale])
expt_results.append({
"coverage": cov,
"scale": scale,
"metric": metric,
"rho": rho,
"seed": expt,
})
return expt_results
#process_expt('3662')
In [ ]:
results = []
for res in map(process_expt, expts):
results.extend(res)
results = pd.DataFrame(results)
In [ ]:
%%R -i results
results$coverage = as.numeric(as.character(results$coverage))
results$scale = as.numeric(as.character(results$scale))
print(summary(results))
str(results)
Below we see the design of the experiment in terms of the two major variables.
We have a series (vertically) that, at 30x coverage, looks at the effect of genetic variation on performance. There is a second series that examines the effect of coverage at an average pairwise genetic distance of 0.001.
There are 100 replicates for each data point, performed as a separate bootstrap across the random creation of the tree and sampling of reads etc.
In [ ]:
%%R
ggplot(results, aes(x=coverage, y=scale)) +
geom_point() +
scale_x_log10() +
scale_y_log10() +
theme_bw()
In [ ]:
%%R
dat = results %>%
filter(scale==0.001, coverage<=30) %>%
select(rho, metric, coverage)
dat$coverage = as.factor(dat$coverage)
ggplot(dat, aes(x=coverage, y=rho, fill=metric)) +
geom_boxplot(aes(fill=metric))
In [ ]:
%%R
# AND AGAIN WITHOUT SUBSETTING
dat = results %>%
filter(scale==0.001) %>%
select(rho, metric, coverage)
dat$coverage = as.factor(dat$coverage)
ggplot(dat, aes(x=coverage, y=rho, fill=metric)) +
geom_boxplot(aes(fill=metric))
In [ ]:
%%R
dat = subset(results, scale==0.001 & coverage <=15, select=-scale)
ggplot(dat, aes(x=coverage, y=rho, colour=seed, linetype=metric)) +
geom_line()
In [ ]:
%%R
summ = results %>%
filter(scale==0.001, coverage <= 50) %>%
select(-scale) %>%
group_by(coverage, metric) %>%
summarise(rho_av=mean(rho), rho_err=sd(rho))
ggplot(summ, aes(x=coverage, y=rho_av, ymin=rho_av-rho_err, ymax=rho_av+rho_err, group=metric)) +
geom_line(aes(linetype=metric)) +
geom_ribbon(aes(fill=metric), alpha=0.2) +
xlab('Genome Coverage') +
ylab(expression(paste("Spearman's ", rho, " +- SD"))) +
scale_x_log10()+
ggtitle("Performance of WIP & IP") +
theme_bw()
In [ ]:
%%R
sem <- function(x) sqrt(var(x,na.rm=TRUE)/length(na.omit(x)))
summ = results %>%
filter(scale==0.001) %>%
select(-scale) %>%
group_by(coverage, metric) %>%
summarise(rho_av=mean(rho), rho_err=sem(rho))
ggplot(summ, aes(x=coverage, y=rho_av, ymin=rho_av-rho_err, ymax=rho_av+rho_err, group=metric)) +
geom_line(aes(linetype=metric)) +
geom_ribbon(aes(fill=metric), alpha=0.2) +
xlab('Genome Coverage') +
ylab(expression(paste("Spearman's ", rho))) +
scale_x_log10()+
theme_bw()
In [ ]:
%%R
cov_diff = results %>%
filter(scale==0.001) %>%
select(rho, metric, coverage, seed) %>%
spread(metric, rho) %>%
mutate(diff=wip-ip) %>%
select(coverage, seed, diff)
print(summary(cov_diff))
p = ggplot(cov_diff, aes(x=coverage, y=diff, colour=seed)) +
geom_line() +
scale_x_log10() +
ggtitle("Per expt difference in performance (wip - ip)")
print(p)
summ = cov_diff %>%
group_by(coverage) %>%
summarise(diff_av=mean(diff), diff_sd=sd(diff))
ggplot(summ, aes(x=coverage, y=diff_av, ymin=diff_av-diff_sd, ymax=diff_av+diff_sd)) +
geom_line() +
geom_ribbon(alpha=0.2) +
xlab('Genome Coverage') +
ylab(expression(paste("Improvment in Spearman's ", rho, " (wip - IP)"))) +
scale_x_log10() +
theme_bw()
In [ ]:
%%R
var = results %>%
filter(coverage == 30) %>%
select(-coverage)
var$scale = as.factor(var$scale)
ggplot(var, aes(x=scale, y=rho, fill=metric)) +
geom_boxplot() +
xlab('Mean pairwise variation') +
ylab(expression(paste("Spearman's ", rho))) +
#scale_x_log10()+
theme_bw()
In [ ]:
%%R
summ = results %>%
filter(coverage == 30) %>%
select(-coverage) %>%
group_by(scale, metric) %>%
summarise(rho_av=mean(rho), rho_sd=sd(rho))
ggplot(summ, aes(x=scale, y=rho_av, ymin=rho_av-rho_sd, ymax=rho_av+rho_sd, group=metric)) +
geom_line(aes(linetype=metric)) +
geom_ribbon(aes(fill=metric), alpha=0.2) +
xlab('Mean pairwise variation') +
ylab(expression(paste("Spearman's ", rho))) +
scale_x_log10()+
theme_bw()
In [ ]: