In [ ]:
from glob import glob
from os import path
import re
from skbio import DistanceMatrix
import pandas as pd
import numpy as np
import scipy as sp
from kwipexpt import *
%matplotlib inline
%load_ext rpy2.ipython
In [ ]:
%%R
library(tidyr)
library(dplyr, warn.conflicts=F, quietly=T)
library(ggplot2)
library(reshape2)
In [ ]:
expts = list(map(lambda fp: path.basename(fp.rstrip('/')), glob('data/*/')))
print("Number of replicate experiments:", len(expts))
In [ ]:
def process_expt(expt):
expt_results = []
def extract_info(filename):
return re.search(r'kwip/(\d\.?\d*)x-(0\.\d+)-(wip|ip).dist', filename).groups()
def r_sqrt(truth, dist):
return sp.stats.pearsonr(truth, np.sqrt(dist))[0]
def rho_sqrt(truth, dist):
return sp.stats.spearmanr(truth, np.sqrt(dist)).correlation
# dict of scale: distance matrix, populated as we go
truths = {}
truth_points = []
sim_points = []
for distfile in glob("data/{}/kwip/*.dist".format(expt)):
cov, scale, metric = extract_info(distfile)
if scale not in truths:
genome_dist_path = 'data/{ex}/all_genomes-{sc}.dist'.format(ex=expt, sc=scale)
truths[scale] = load_sample_matrix_to_runs(genome_dist_path)
exptmat = DistanceMatrix.read(distfile)
rho = distmat_corr(truths[scale], exptmat, stats.spearmanr).correlation
rho2 = distmat_corr(truths[scale], exptmat, rho_sqrt)
r = distmat_corr(truths[scale], exptmat, stats.pearsonr)[0]
r2 = distmat_corr(truths[scale], exptmat, r_sqrt)
if cov == "100" and scale == "0.001" and metric == "wip":
truth_points.append(truths[scale].condensed_form())
sim_points.append(exptmat.condensed_form())
expt_results.append({
"coverage": cov,
"scale": scale,
"metric": metric,
"rho": rho,
"rhosqrt": rho2,
"r": r,
"rsqrt": r2,
"seed": expt,
})
return expt_results, (truth_points, sim_points)
#process_expt('3662')
In [ ]:
results = []
truepoints = []
simpoints = []
for res in map(process_expt, expts):
results.extend(res[0])
truepoints.extend(res[1][0])
simpoints.extend(res[1][1])
results = pd.DataFrame(results)
In [ ]:
truepoints = np.concatenate(truepoints)
simpoints = np.concatenate(simpoints)
In [ ]:
%%R -i truepoints -i simpoints
plot(truepoints, sqrt(simpoints), pch=".")
In [ ]:
%%R -i results
results$coverage = as.numeric(as.character(results$coverage))
results$scale = as.numeric(as.character(results$scale))
print(summary(results))
str(results)
In [ ]:
%%R
# AND AGAIN WITHOUT SUBSETTING
dat = results %>%
filter(scale==0.001, metric=="wip") %>%
select(coverage, rho, r, rsqrt, rhosqrt)
mdat = melt(dat, id.vars="coverage", variable.name="measure", value.name="corr")
mdat$coverage = as.factor(mdat$coverage)
ggplot(mdat, aes(x=coverage, y=corr)) +
geom_boxplot() +
facet_wrap(~measure) +
theme_bw()
In [ ]: