In [ ]:
import skbio
#from skbio import TreeNode, DistanceMatrix, Alignment
from skbio import TreeNode, DistanceMatrix, TabularMSA, DNA
#import statsmodels.api as sm
import numpy as np
from matplotlib import pyplot as plt
from scipy.spatial.distance import hamming

%matplotlib inline

In [ ]:
aln = TabularMSA.read("../data/all_genomes-0.01.fasta", constructor=DNA)
aln.reassign_index(minter="id")

In [ ]:
dist = DistanceMatrix.from_iterable(
    [seq.values for seq in aln], metric=hamming, keys=aln.index)

In [ ]:
dist

In [ ]:
1/dist['G']["ABCDEFGHIJKL".index("F")]

In [ ]:
samp = skbio.read("../data/sample.nwk", into=TreeNode)

In [ ]:
tdist = samp.tip_tip_distances()

In [ ]:
num_runs = 3
runs = DistanceMatrix(
    np.repeat(np.repeat(dist.data,num_runs, axis=1), num_runs, axis=0))
runs.ids = ['{}-{}'.format(g, i) for g in dist.ids for i in range(num_runs)]
runs.write("../data/runs.dist")

In [ ]:
truth = runs.condensed_form()

In [ ]:
f=runs.plot(title="Truth")
f.set_size_inches(9, 6)
f.savefig("Truth-mat.png")

In [ ]:
kwip_mat = DistanceMatrix.read("../data/kwip/30x-0.01-wip.dist")

In [ ]:
f=kwip_mat.plot(title="WIP")
f.set_size_inches(9, 6)
f.savefig("WIP-mat.png")

In [ ]:
ip_mat = DistanceMatrix.read("../data/kwip/30x-0.01-ip.dist")

In [ ]:
f=ip_mat.plot(title="IP")
f.set_size_inches(9,6)
f.savefig("IP-mat.png")

In [ ]:
wip = kwip_mat.condensed_form()
ip = ip_mat.condensed_form()

In [ ]:
fig, ax = plt.subplots()
ax.scatter(wip, ip, marker='x')
ax.set_ylabel('IP')
ax.set_xlabel('WIP')

In [ ]:
fig, ax = plt.subplots(figsize=(9,6))
w = ax.scatter(truth, wip, marker='x', color='r', label='WIP')
i = ax.scatter(truth, ip, marker='o', color='b', label='IP')
ax.set_xlabel("True distance")
ax.set_ylabel("kWIP")
ax.set_xlim(-0.001,0.023)
ax.legend(handles=[w, i], loc='upper left')
fig.tight_layout()

In [ ]:
fig.savefig("WIPvsIPvsTruth.png")

In [ ]:
import scipy as sp

In [ ]:
sp.stats.spearmanr(truth, wip)

In [ ]:
sp.stats.spearmanr(truth, ip)

In [ ]:
X = sm.add_constant(ip)
o = sm.OLS(wip, X)
f = o.fit()
print(f.rsquared_adj, f.ssr)

In [ ]:
def regress(y):
    X = sm.add_constant(truth)
    o = sm.OLS(y, X)
    f = o.fit()
    return f

In [ ]:
wip_fit = regress(wip)
print(wip_fit.rsquared_adj, wip_fit.ssr)
_ =sm.graphics.plot_ccpr_grid(wip_fit)

In [ ]:
ip_fit = regress(ip)
print(ip_fit.rsquared_adj, ip_fit.ssr)
_ =sm.graphics.plot_ccpr_grid(ip_fit)