In [ ]:
from glob import glob
from os import path
import re
from skbio import DistanceMatrix
import pandas as pd
import numpy as np
from scipy import stats
%matplotlib inline
%load_ext rpy2.ipython
In [ ]:
%%R
library(tidyr)
library(dplyr)
library(ggplot2)
library(Cairo)
In [ ]:
expt_results = []
truths = {
"wip": DistanceMatrix.read("kwip/full_wip.dist"),
"ip": DistanceMatrix.read("kwip/full_ip.dist"),
}
for distfile in glob("kwip/*.dist"):
try:
cov, metric = re.search(r'kwip/(\d\.?\d*)x_(wip|ip).dist', distfile).groups()
except AttributeError:
cov, metric = re.search(r'kwip/(full)_(wip|ip).dist', distfile).groups()
dm = DistanceMatrix.read(distfile)
truth = truths[metric]
truth = truths['wip']
r = stats.pearsonr(dm.condensed_form(), truth.condensed_form())[0]
rho = stats.spearmanr(dm.condensed_form(), truth.condensed_form())[0]
expt_results.append({
"coverage": cov,
"metric": metric,
"correlation": r,
"rho": rho
})
results = pd.DataFrame(expt_results)
In [ ]:
%%R -i results
results$coverage = as.character(results$coverage)
results$coverage[results$coverage == "full"] = 230
results$coverage = as.numeric(results$coverage)
print(summary(results))
str(results)
In [ ]:
%%R
results = results[results$metric == "wip", ]
p = ggplot(results, aes(x=coverage, y=rho)) +
geom_line() +
ylim(0, 1) +
labs(x="Genome Coverage", y=expression(paste("Spearman's ", rho))) +
theme_bw()
svg('aggregation_fullcov.svg', width=, height=4)
print(p)
dev.off()
print(p)
svg('aggregation_100.svg', width=3.5, height=3.5)
print(p + xlim(0,100))
dev.off()
print(p + xlim(0,100))
svg('aggregation_log.svg', width=4, height=4)
print(p + scale_x_log10())
dev.off()
print(p + scale_x_log10())
In [ ]:
%%R
x =theme_bw()
x
In [ ]: