In [ ]:
from glob import glob
from os import path
import re
from skbio import DistanceMatrix
import pandas as pd
import numpy as np
import scipy as sp

from kwipexpt import *
%matplotlib inline
%load_ext rpy2.ipython

In [ ]:
%%R
library(tidyr)
library(dplyr, warn.conflicts=F, quietly=T)
library(ggplot2)
library(reshape2)

Calculate performance of kWIP

The next bit of python code calculates the performance of kWIP against the distance between samples calulcated from the alignments of their genomes.

This code caluclates spearman's $\rho$ between the off-diagonal elements of the triagnular distance matrices.


In [ ]:
expts = list(map(lambda fp: path.basename(fp.rstrip('/')), glob('data/*/')))
print("Number of replicate experiments:", len(expts))

In [ ]:
def process_expt(expt):
    expt_results = []
    
    def extract_info(filename):
        return re.search(r'kwip/(\d\.?\d*)x-(0\.\d+)-(wip|ip).dist', filename).groups()
    def r_sqrt(truth, dist):
        return sp.stats.pearsonr(truth, np.sqrt(dist))[0]
    def rho_sqrt(truth, dist):
        return sp.stats.spearmanr(truth, np.sqrt(dist)).correlation
        
    
    # dict of scale: distance matrix, populated as we go
    truths = {}
    
    truth_points = []
    sim_points = []
    for distfile in glob("data/{}/kwip/*.dist".format(expt)):
        cov, scale, metric = extract_info(distfile)
        if scale not in truths:
            genome_dist_path = 'data/{ex}/all_genomes-{sc}.dist'.format(ex=expt, sc=scale)
            truths[scale] = load_sample_matrix_to_runs(genome_dist_path)
        exptmat = DistanceMatrix.read(distfile)
        rho = distmat_corr(truths[scale], exptmat, stats.spearmanr).correlation
        rho2 = distmat_corr(truths[scale], exptmat, rho_sqrt)
        r = distmat_corr(truths[scale], exptmat, stats.pearsonr)[0]
        r2 = distmat_corr(truths[scale], exptmat, r_sqrt)
        if cov == "100" and scale == "0.001" and metric == "wip":
            truth_points.append(truths[scale].condensed_form())
            sim_points.append(exptmat.condensed_form())
        expt_results.append({
            "coverage": cov,
            "scale": scale,
            "metric": metric,
            "rho": rho,
            "rhosqrt": rho2,
            "r": r,
            "rsqrt": r2,
            "seed": expt,
        })
    return expt_results, (truth_points, sim_points)

#process_expt('3662')

In [ ]:
results = []
truepoints = []
simpoints = []
for res in map(process_expt, expts):
    results.extend(res[0])
    truepoints.extend(res[1][0])
    simpoints.extend(res[1][1])
results = pd.DataFrame(results)

In [ ]:
truepoints = np.concatenate(truepoints)
simpoints = np.concatenate(simpoints)

In [ ]:
%%R -i truepoints -i simpoints

plot(truepoints, sqrt(simpoints), pch=".")

Visualisation

Below we see a summary and structure of the data


In [ ]:
%%R -i results

results$coverage = as.numeric(as.character(results$coverage))
results$scale = as.numeric(as.character(results$scale))

print(summary(results))
str(results)

In [ ]:
%%R

# AND AGAIN WITHOUT SUBSETTING
dat = results %>%
      filter(scale==0.001, metric=="wip") %>%
      select(coverage, rho, r, rsqrt, rhosqrt)
mdat = melt(dat, id.vars="coverage", variable.name="measure", value.name="corr")
mdat$coverage = as.factor(mdat$coverage)

ggplot(mdat, aes(x=coverage, y=corr)) +
    geom_boxplot() +
    facet_wrap(~measure) +
    theme_bw()

In [ ]: