In [ ]:
import sklearn
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster import hierarchy
import ete3
from ete3 import Tree
import skbio
from skbio import DistanceMatrix, TreeNode
import numpy as np

from collections import Counter
from glob import glob
%matplotlib inline

In [ ]:
# Weighted (WIP, *d2ent*) and unweighted (IP, *d2*) distance matrices,
# read in matching sorted order and paired up.
wip = map(DistanceMatrix.read, sorted(glob("dmat/*d2ent-*.dist")))
ip  = map(DistanceMatrix.read, sorted(glob("dmat/*d2-*.dist")))
pairs = list(zip(wip, ip))

wip_0, ip_0 = pairs[0]
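
A quick sanity check before relying on fixed index ranges: each pair should cover the same 96 runs in the same order (the ranges defined below assume this).

In [ ]:
# Assumes 96 runs per matrix, as the index ranges below imply
for w, i in pairs:
    assert w.shape == i.shape == (96, 96)
    assert w.ids == i.ids
print(len(pairs), "matrix pairs loaded")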

In [ ]:
samples = [(x, x+6) for x in range(0, 96, 6)]  # 16 samples, 6 runs each
indjap = [(0, 48), (48, 96)]                   # the two halves of the 96 runs

sklearn Agglomerative

I can't get this to work properly. The (label, count) arrays returned by np.unique(cl.fit_predict(...), return_counts=True) below should essentially be

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
 [6, 6, 6, 6, 6, 6, 6, 6, 6, 6,  6,  6,  6,  6,  6,  6]]

i.e. 16 clusters, one per sample (0-15), each containing its 6 runs.


In [ ]:
# 16 clusters, complete linkage on a precomputed distance matrix
cl = AgglomerativeClustering(n_clusters=16, compute_full_tree=True, affinity='precomputed', linkage='complete')

In [ ]:
np.unique(cl.fit_predict(wip_0.data), return_counts=True)
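
A rough way to quantify how far off the labelling is: score it against the ideal per-sample grouping (runs 0-5 = sample 0, runs 6-11 = sample 1, ...) with the adjusted Rand index, which is 1.0 only for a perfect match up to label permutation. This is just a sketch assuming that run ordering.

In [ ]:
from sklearn.metrics import adjusted_rand_score

# Ideal labelling: 16 samples x 6 runs each (assumes runs are ordered by sample)
expected = np.repeat(np.arange(16), 6)
predicted = cl.fit_predict(wip_0.data)
adjusted_rand_score(expected, predicted)  # 1.0 would mean the expected clustering is recovered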

Scipy Hclust


In [ ]:
def does_cluster(distmat, ranges):
    """Count how many (start, stop) ranges of distmat's IDs form monophyletic
    clades in a complete-linkage tree built from the distance matrix."""
    # Complete-linkage hierarchical clustering on the condensed distance matrix
    hcl = hierarchy.complete(distmat.condensed_form())
    ids = distmat.ids
    tree = skbio.TreeNode.from_linkage_matrix(hcl, ids)
    # Convert the skbio tree to an ete3 Tree via its Newick representation
    tree = ete3.Tree(str(tree), format=1)
    mono = 0
    counts = Counter()
    for start, stop in ranges:
        m, phy, _ = tree.check_monophyly(ids[start:stop], target_attr="name", unrooted=True)
        mono += 1 if m else 0
        #print(start, '-', stop, "---", phy)
        counts[phy] += 1
    return mono, counts
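
A quick spot check of this function on the first WIP matrix and the per-sample ranges defined above:

In [ ]:
# How many of the 16 samples form monophyletic clades in the first WIP tree?
mono, clades = does_cluster(wip_0, samples)
print(mono, "of", len(samples), "samples monophyletic")
print(clades)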

In [ ]:
def count_clustering(groups):
    """Tally monophyly/clade status of each group across all WIP/IP matrix pairs."""
    wipc = Counter()
    ipc = Counter()
    for wip, ip in pairs:
        _, c = does_cluster(wip, groups)
        wipc.update(c)
        _, c = does_cluster(ip, groups)
        ipc.update(c)
    print(sum(wipc.values()), "groups")
    return wipc, ipc

wipc, ipc = count_clustering(indjap)
print(wipc, ipc)
wipc, ipc = count_clustering(samples)
print(wipc, ipc)

In [ ]:
# Totals from the last call above (the per-sample grouping)
print(sum(wipc.values()))
print(sum(ipc.values()))
