Comparing front-end and back-end clustering


In [3]:
import numpy as np
import pandas as pd
import scipy.cluster.hierarchy as hier
from scipy.spatial.distance import pdist, squareform

In [4]:
# Read the tab-separated, header-less matrix file into a DataFrame.
df = pd.read_csv(
    'output_1.13819231887',
    sep='\t',
    header=None,
)

In [5]:
# Work on the underlying NumPy array rather than the DataFrame wrapper.
inst_dm = df.values
# Print (rows, cols) of the loaded matrix.
print(np.shape(inst_dm))


(38, 38)

In [6]:
# Show the matrix; it should be symmetric with a zero diagonal, since it is
# passed to squareform() in a later cell.
inst_dm


Out[6]:
array([[ 0.        ,  1.13819232,  1.0733403 , ...,  0.94640097,
         0.94702352,  0.95850345],
       [ 1.13819232,  0.        ,  0.93692049, ...,  1.06063839,
         0.81090475,  1.03730952],
       [ 1.0733403 ,  0.93692049,  0.        , ...,  1.03371267,
         1.05430428,  1.09115233],
       ..., 
       [ 0.94640097,  1.06063839,  1.03371267, ...,  0.        ,
         1.08551295,  0.92644981],
       [ 0.94702352,  0.81090475,  1.05430428, ...,  1.08551295,
         0.        ,  1.14962644],
       [ 0.95850345,  1.03730952,  1.09115233, ...,  0.92644981,
         1.14962644,  0.        ]])

In [7]:
linkage_type = 'average'

# squareform() toggles between the square and the condensed form, so only
# convert while inst_dm is still the 2-D matrix. Without this guard, running
# the cell a second time would flip the data back to a square matrix and
# linkage() would then treat its rows as observation vectors.
if inst_dm.ndim == 2:
  inst_dm = squareform(inst_dm)

# Y: linkage matrix from hierarchical clustering of the condensed distances.
Y = hier.linkage(inst_dm, method=linkage_type)
# Z: dendrogram layout only (no_plot=True); its 'leaves' entry is read later.
Z = hier.dendrogram(Y, no_plot=True)

In [8]:
def group_cutoffs():
  """Return the eleven cutoff fractions 0.0, 0.1, ..., 1.0 as a list of floats."""
  return [float(step) / 10 for step in range(11)]

In [9]:
# Cutoff fractions; a later cell scales them by the maximum distance.
all_dist = group_cutoffs()
# 'leaves' entry of the dictionary returned by hier.dendrogram().
inst_clust_order = Z['leaves']

In [10]:
# group_cutoffs() and all_dist come from the earlier cells (In [8], In [9]),
# so the function is not re-defined here.

# Maps a cutoff key ('00', '01', ..., '10') to the list of flat cluster labels.
groups = {}

# Largest pairwise distance; each fractional cutoff is scaled by it.
max_dist = inst_dm.max()

for inst_dist in all_dist:
  # e.g. 0.1 -> '01', 1.0 -> '10'
  inst_key = str(inst_dist).replace('.', '')
  cutoff_dist = inst_dist * max_dist

  # Cut the tree at this absolute distance and keep the labels as a plain list.
  groups[inst_key] = hier.fcluster(Y, cutoff_dist, 'distance').tolist()

  print(cutoff_dist)
  print(groups[inst_key])


In [11]:
# Flat cluster labels when cutting the tree at distance 2.5.
hier.fcluster(Y, t=2.5, criterion='distance')


Out[11]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

In [12]:
# Flat cluster labels when cutting the tree at distance 0.91322.
hier.fcluster(Y, t=0.91322, criterion='distance')


Out[12]:
array([3, 5, 6, 6, 3, 8, 3, 8, 7, 8, 1, 8, 1, 3, 6, 3, 8, 4, 3, 2, 6, 7, 5,
       1, 4, 2, 7, 4, 8, 3, 6, 4, 1, 2, 1, 6, 1, 7], dtype=int32)

In [13]:
# Flat cluster labels when cutting the tree at distance 1.1.
hier.fcluster(Y, t=1.1, criterion='distance')


Out[13]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

Cutoff at which the tree first splits into two groups

1.07794 still gives one group; 1.07793 gives two groups


In [14]:
# Distinct flat-cluster labels when cutting the tree at distance 1.07794.
sorted(set(hier.fcluster(Y, t=1.07794, criterion='distance')))


Out[14]:
[1]

In [15]:
# Distinct flat-cluster labels when cutting the tree at distance 1.07793.
sorted(set(hier.fcluster(Y, t=1.07793, criterion='distance')))


Out[15]:
[1, 2]

1.06 gives two groups


In [16]:
# Distinct flat-cluster labels when cutting the tree at distance 1.06.
sorted(set(hier.fcluster(Y, t=1.06, criterion='distance')))


Out[16]:
[1, 2]

1.0517 gives two groups


In [17]:
# Distinct flat-cluster labels when cutting the tree at distance 1.0517.
sorted(set(hier.fcluster(Y, t=1.0517, criterion='distance')))


Out[17]:
[1, 2]

1.05 gives three groups


In [18]:
# Distinct flat-cluster labels when cutting the tree at distance 1.05.
sorted(set(hier.fcluster(Y, t=1.05, criterion='distance')))


Out[18]:
[1, 2, 3]

1.04 gives 4 groups


In [19]:
# Distinct flat-cluster labels when cutting the tree at distance 1.04.
sorted(set(hier.fcluster(Y, t=1.04, criterion='distance')))


Out[19]:
[1, 2, 3, 4]

1.03 gives 5 groups


In [20]:
# Distinct flat-cluster labels when cutting the tree at distance 1.03.
sorted(set(hier.fcluster(Y, t=1.03, criterion='distance')))


Out[20]:
[1, 2, 3, 4, 5]

In [ ]:

0.07 gives 37 groups


In [21]:
# Distinct flat-cluster labels when cutting the tree at distance 0.07.
sorted(set(hier.fcluster(Y, t=0.07, criterion='distance')))


Out[21]:
[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37]

0.70 gives 22 groups


In [22]:
# Distinct flat-cluster labels when cutting the tree at distance 0.70.
sorted(set(hier.fcluster(Y, t=0.70, criterion='distance')))


Out[22]:
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]

0.01 gives 38 groups


In [23]:
# Distinct flat-cluster labels when cutting the tree at distance 0.01.
sorted(set(hier.fcluster(Y, t=0.01, criterion='distance')))


Out[23]:
[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38]

In [ ]:


In [ ]: