Hierarchical clustering (scipy.cluster.hierarchy)

linkage(y[, method, metric]) Performs hierarchical/agglomerative clustering on the condensed distance matrix y.

http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html#module-scipy.cluster.hierarchy


In [5]:
import numpy as np
import scipy.cluster.hierarchy as hac
import matplotlib.pyplot as plt
from consolidation.db import connect_db
from consolidation.position import Position, Cluster
from consolidation.algorithms.canopy import canopy

In [9]:
# Connecting to database
cur= connect_db("bahia")
recurso = "tetra:12082781"
limit = 10
cmd = "SELECT latitud, longitud, UNIX_TIMESTAMP(fecha) FROM posicionesgps WHERE latitud <> 0 and longitud <> 0 and recurso=\"{0}\";".format(recurso)
cmd


Out[9]:
'SELECT latitud, longitud, UNIX_TIMESTAMP(fecha) FROM posicionesgps WHERE latitud <> 0 and longitud <> 0 and recurso="tetra:12082781";'

In [10]:
cur.execute(cmd)


Out[10]:
39839L

In [11]:
a = []

In [12]:
for pos in cur.fetchall():
    a.append(pos)

In [13]:
len(a)


Out[13]:
39839

In [14]:
a = np.array(a)
z = hac.linkage(a, method=method)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-14-7b1725440b7c> in <module>()
      1 a = np.array(a)
----> 2 z = hac.linkage(a, method=method)

NameError: name 'method' is not defined

In [21]:
# Linkage

fig, axes23 = plt.subplots(2, 3)

for method, axes in zip(['single', 'complete'], axes23):
    z = hac.linkage(a, method=method)

    # Plotting
    axes[0].plot(range(1, len(z)+1), z[::-1, 2])
    knee = np.diff(z[::-1, 2], 2)
    axes[0].plot(range(2, len(z)), knee)

    num_clust1 = knee.argmax() + 2
    knee[knee.argmax()] = 0
    num_clust2 = knee.argmax() + 2

    axes[0].text(num_clust1, z[::-1, 2][num_clust1-1], 'possible\n<- knee point')

    part1 = hac.fcluster(z, num_clust1, 'maxclust')
    part2 = hac.fcluster(z, num_clust2, 'maxclust')

    clr = ['#2200CC' ,'#D9007E' ,'#FF6600' ,'#FFCC00' ,'#ACE600' ,'#0099CC' ,
    '#8900CC' ,'#FF0000' ,'#FF9900' ,'#FFFF00' ,'#00CC01' ,'#0055CC']

    for part, ax in zip([part1, part2], axes[1:]):
        for cluster in set(part):
            ax.scatter(a[part == cluster, 0], a[part == cluster, 1], 
                       color=clr[cluster])

    m = '\n(method: {})'.format(method)
    plt.setp(axes[0], title='Screeplot{}'.format(m), xlabel='partition',
             ylabel='{}\ncluster distance'.format(m))
    plt.setp(axes[1], title='{} Clusters'.format(num_clust1))
    plt.setp(axes[2], title='{} Clusters'.format(num_clust2))

plt.tight_layout()
plt.show()

#


In [ ]: