In [ ]:
import sklearn
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster import hierarchy
import ete3
from ete3 import Tree
import skbio
from skbio import DistanceMatrix, TreeNode
import numpy as np
from collections import Counter
from glob import glob
%matplotlib inline
In [ ]:
# Load two families of distance matrices from dmat/ and pair them positionally
# by sorted filename order.
wip_paths = sorted(glob("dmat/*d2ent-*.dist"))
ip_paths = sorted(glob("dmat/*d2-*.dist"))
# zip() would silently drop the unmatched tail and hide a mis-pairing, so fail
# loudly when the two patterns match different numbers of files.
if len(wip_paths) != len(ip_paths):
    raise ValueError(
        "mismatched distance-matrix counts: "
        f"{len(wip_paths)} '*d2ent-*' files vs {len(ip_paths)} '*d2-*' files"
    )
# Materialise as lists so `wip` and `ip` stay reusable; as bare map() objects
# they were exhausted by the zip() below.
wip = [DistanceMatrix.read(path) for path in wip_paths]
ip = [DistanceMatrix.read(path) for path in ip_paths]
pairs = list(zip(wip, ip))
# First pair, kept around for quick interactive inspection.
wip_0, ip_0 = pairs[0]
In [ ]:
# Half-open (start, stop) index ranges used to slice the distance-matrix ids.
# `samples`: sixteen consecutive blocks of six indices covering 0..96.
samples = list(zip(range(0, 96, 6), range(6, 102, 6)))
# `indjap`: the same 0..96 span split into two halves of 48 indices.
# NOTE(review): presumably two sample groups, going by the name -- confirm.
indjap = [(lo, lo + 48) for lo in (0, 48)]
In [ ]:
# Complete-linkage agglomerative clustering into 16 clusters, fed a
# precomputed distance matrix rather than raw features.
# scikit-learn 1.2 renamed `affinity` to `metric` and 1.4 removed `affinity`,
# so try the new keyword first and fall back for older releases.
_cl_kwargs = dict(n_clusters=16, compute_full_tree=True, linkage='complete')
try:
    cl = AgglomerativeClustering(metric='precomputed', **_cl_kwargs)
except TypeError:
    # Older scikit-learn: `metric` is not an accepted constructor argument.
    cl = AgglomerativeClustering(affinity='precomputed', **_cl_kwargs)
In [ ]:
# Cluster the first matrix of the first pair and display each cluster label
# alongside the number of entries assigned to it.
cluster_labels = cl.fit_predict(wip_0.data)
np.unique(cluster_labels, return_counts=True)
In [ ]:
def does_cluster(distmat, ranges):
    """Check whether index ranges of ``distmat.ids`` form clades in a dendrogram.

    A complete-linkage hierarchical clustering of ``distmat`` is converted to
    a tree (via scikit-bio, then ete3), and each ``(start, stop)`` range of
    ids is tested with ete3's ``check_monophyly`` on the unrooted tree.

    Args:
        distmat: object exposing ``condensed_form()`` and ``ids``
            (a scikit-bio ``DistanceMatrix`` in this notebook).
        ranges: iterable of ``(start, stop)`` pairs used to slice ``ids``.

    Returns:
        ``(mono, counts)``: ``mono`` is the number of ranges for which
        ``check_monophyly`` returned a truthy first element; ``counts`` is a
        ``Counter`` keyed by the second element it returned (the clade type).
    """
    labels = distmat.ids
    linkage = hierarchy.complete(distmat.condensed_form())
    # scikit-bio builds the tree from the linkage matrix; ete3 supplies the
    # monophyly test, hence the conversion.
    dendrogram = ete3.Tree.from_skbio(
        skbio.TreeNode.from_linkage_matrix(linkage, labels)
    )

    n_monophyletic = 0
    clade_types = Counter()
    for lo, hi in ranges:
        is_mono, clade_type, _ = dendrogram.check_monophyly(
            labels[lo:hi], target_attr="name", unrooted=True
        )
        if is_mono:
            n_monophyletic += 1
        clade_types[clade_type] += 1
    return n_monophyletic, clade_types
In [ ]:
def count_clustering(groups):
    """Tally clade types for ``groups`` across every matrix pair in ``pairs``.

    For each ``(wip, ip)`` pair in the module-level ``pairs`` list,
    ``does_cluster`` is run on both matrices and its per-clade-type counter is
    accumulated separately for the first and second members of the pairs.

    Args:
        groups: iterable of ``(start, stop)`` id ranges, passed straight
            through to ``does_cluster``.

    Returns:
        ``(wipc, ipc)``: two ``Counter`` objects holding the accumulated
        clade-type counts for the first and second matrix of each pair.

    Side effects:
        Prints the total number of counted groups for the first matrices.
    """
    wip_totals, ip_totals = Counter(), Counter()
    for wip_mat, ip_mat in pairs:
        # Only the clade-type counter (second return value) is needed here.
        wip_totals.update(does_cluster(wip_mat, groups)[1])
        ip_totals.update(does_cluster(ip_mat, groups)[1])
    print(sum(wip_totals.values()), "groups")
    return wip_totals, ip_totals
# Report the clade-type tallies for the two-way split first, then for the
# sixteen six-id blocks.
for groups in (indjap, samples):
    print(count_clustering(groups))
In [ ]:
# `wipc` and `ipc` exist only as locals inside count_clustering, so bind its
# return value here; otherwise this cell raises NameError in a fresh kernel.
# NOTE(review): `samples` is assumed to be the intended grouping -- confirm.
wipc, ipc = count_clustering(samples)
print(sum(wipc.values()))
print(sum(ipc.values()))
In [ ]: