In [ ]:

    
import sklearn
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster import  hierarchy
import ete3
from ete3 import Tree
import skbio
from skbio import DistanceMatrix, TreeNode
import numpy as np

from collections import Counter
from glob import glob
%matplotlib inline



In [ ]:

    
# Load both families of distance matrices eagerly. Lists (rather than lazy
# ``map`` objects) stay usable after ``zip`` has walked over them.
# NOTE(review): pairing is purely positional after sorting each glob, so it
# assumes the d2ent and d2 file names sort into the same order — confirm.
wip = [DistanceMatrix.read(path) for path in sorted(glob("dmat/*d2ent-*.dist"))]
ip = [DistanceMatrix.read(path) for path in sorted(glob("dmat/*d2-*.dist"))]

# zip() silently truncates to the shorter input, which would hide a missing
# file and could pair up unrelated matrices; fail loudly instead.
if len(wip) != len(ip):
    raise ValueError(
        "expected equally many d2ent and d2 matrices, found %d and %d"
        % (len(wip), len(ip))
    )
if not wip:
    raise FileNotFoundError("no distance matrices matched dmat/*d2ent-*.dist")
pairs = list(zip(wip, ip))

# First (d2ent, d2) pair, kept for the single-matrix experiments.
wip_0, ip_0 = pairs[0]



In [ ]:

    
# Half-open (start, stop) index ranges used to slice a distance matrix's ids.
# 16 consecutive windows of 6 ids each: (0, 6), (6, 12), ..., (90, 96).
samples = list(zip(range(0, 96, 6), range(6, 102, 6)))
# The two halves of the 96 ids: indices 0-47 and 48-95.
indjap = [
    (0, 48),
    (48, 96),
]

sklearn Agglomerative

I can't get this to work properly. The `(labels, counts)` pair returned by `np.unique(cl.fit_predict(...), return_counts=True)` below should essentially be

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
 [6, 6, 6, 6, 6, 6, 6, 6, 6, 6,  6,  6,  6,  6,  6,  6]]

i.e. there should be 16 clusters, one per sample (labels 0 to 15), each containing that sample's 6 runs



In [ ]:

    
# Complete-linkage agglomerative clustering into 16 clusters, fed a
# precomputed distance matrix instead of raw feature vectors.
# scikit-learn 1.2 renamed ``affinity`` to ``metric`` and 1.4 removed
# ``affinity``, so try the current keyword first and fall back for old releases.
_agglo_kwargs = dict(n_clusters=16, compute_full_tree=True, linkage='complete')
try:
    cl = AgglomerativeClustering(metric='precomputed', **_agglo_kwargs)
except TypeError:
    # scikit-learn < 1.2 rejects the unknown ``metric`` keyword.
    cl = AgglomerativeClustering(affinity='precomputed', **_agglo_kwargs)



In [ ]:

    
# Cluster the first d2ent distance matrix, then summarise the assignment as
# (sorted unique cluster labels, number of members per label).
labels = cl.fit_predict(wip_0.data)
np.unique(labels, return_counts=True)

Scipy Hclust



In [ ]:

    
def does_cluster(distmat, ranges):
    """Check whether slices of a distance matrix's ids form clades in its tree.

    A complete-linkage hierarchy is built from ``distmat`` with scipy,
    converted to a scikit-bio tree labelled with ``distmat.ids``, and then to
    an ete3 tree. Each ``(start, stop)`` slice of the ids is passed to ete3's
    ``check_monophyly`` on the unrooted tree, matching leaves by name.

    Parameters
    ----------
    distmat
        Object providing ``condensed_form()`` and ``ids`` (a scikit-bio
        ``DistanceMatrix`` in this notebook).
    ranges
        Iterable of ``(start, stop)`` pairs used to slice ``distmat.ids``.

    Returns
    -------
    tuple
        ``(n_monophyletic, clade_types)``: the number of slices for which
        ``check_monophyly`` reported a truthy first value, and a ``Counter``
        keyed by its second return value (the clade type, per the ete3
        documentation).
    """
    linkage = hierarchy.complete(distmat.condensed_form())
    leaf_ids = distmat.ids
    sk_tree = skbio.TreeNode.from_linkage_matrix(linkage, leaf_ids)
    ete_tree = ete3.Tree.from_skbio(sk_tree)

    clade_types = Counter()
    n_monophyletic = 0
    for lo, hi in ranges:
        is_mono, clade_type, _ = ete_tree.check_monophyly(
            leaf_ids[lo:hi],
            target_attr="name",
            unrooted=True,
        )
        if is_mono:
            n_monophyletic += 1
        clade_types[clade_type] += 1
    return n_monophyletic, clade_types



In [ ]:

    
def count_clustering(groups):
    """Accumulate clade-type tallies for ``groups`` over every matrix pair.

    Iterates over the notebook-level ``pairs`` list, runs ``does_cluster`` on
    both matrices of each pair, and sums the per-matrix counters separately
    for the first and second member of the pairs. Prints the total of the
    first tally followed by the word "groups".

    Parameters
    ----------
    groups
        Iterable of ``(start, stop)`` id ranges, forwarded to ``does_cluster``.

    Returns
    -------
    tuple of Counter
        ``(first_member_tally, second_member_tally)``.
    """
    wip_total, ip_total = Counter(), Counter()
    for wip_mat, ip_mat in pairs:
        for total, distmat in ((wip_total, wip_mat), (ip_total, ip_mat)):
            # Only the per-type counter is needed; the monophyly count is dropped.
            _, clade_counts = does_cluster(distmat, groups)
            total.update(clade_counts)
    print(sum(wip_total.values()), "groups")
    return wip_total, ip_total
# Tally clade types for the two 48-id halves first, then for the sixteen
# 6-id windows, printing each (first, second) counter pair.
for grouping in (indjap, samples):
    print(count_clustering(grouping))



In [ ]:

    
# ``wipc`` and ``ipc`` exist only as locals inside count_clustering, so on a
# fresh kernel the bare names raise NameError; bind the returned counters here.
# NOTE(review): the `samples` grouping is assumed because it is the last one
# tallied in the preceding cell — confirm which grouping was intended.
wipc, ipc = count_clustering(samples)
print(sum(wipc.values()))
print(sum(ipc.values()))



In [ ]: