In [ ]:
from glob import glob
from os import path
import re
from skbio import DistanceMatrix
import pandas as pd
import numpy as np
from scipy import stats

%matplotlib inline
%load_ext rpy2.ipython

In [ ]:
%%R
library(tidyr)
library(dplyr)
library(ggplot2)
library(Cairo)

In [ ]:
expt_results = []

truths = {
    "wip": DistanceMatrix.read("kwip/full_wip.dist"),
    "ip": DistanceMatrix.read("kwip/full_ip.dist"),
}

for distfile in glob("kwip/*.dist"):
    try:
        cov, metric = re.search(r'kwip/(\d\.?\d*)x_(wip|ip).dist', distfile).groups()
    except AttributeError:
        cov, metric = re.search(r'kwip/(full)_(wip|ip).dist', distfile).groups()
        
    dm = DistanceMatrix.read(distfile)
    
    truth = truths[metric]
    truth = truths['wip']
    r = stats.pearsonr(dm.condensed_form(), truth.condensed_form())[0]
    rho = stats.spearmanr(dm.condensed_form(), truth.condensed_form())[0]
    expt_results.append({
        "coverage": cov,
        "metric": metric,
        "correlation": r,
        "rho": rho
    })
    
results = pd.DataFrame(expt_results)

In [ ]:
%%R -i results


results$coverage = as.character(results$coverage)
results$coverage[results$coverage == "full"] = 230
results$coverage = as.numeric(results$coverage)

print(summary(results))
str(results)

In [ ]:
%%R
results = results[results$metric == "wip", ]

p = ggplot(results, aes(x=coverage, y=rho)) +
    geom_line() +
    ylim(0, 1) +
    labs(x="Genome Coverage", y=expression(paste("Spearman's ", rho))) +
    theme_bw()

svg('aggregation_fullcov.svg', width=, height=4)
print(p)
dev.off()
print(p)


svg('aggregation_100.svg', width=3.5, height=3.5)
print(p + xlim(0,100))
dev.off()
print(p + xlim(0,100))

svg('aggregation_log.svg', width=4, height=4)
print(p + scale_x_log10())
dev.off()
print(p + scale_x_log10())

In [ ]:
%%R

x =theme_bw()
x

In [ ]: