In [13]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist
%matplotlib inline
np.set_printoptions(precision=5, suppress=True)  # suppress scientific float notation
from matplotlib import pyplot as plt

In [2]:
with open('DM2016_org.txt') as f:
    headers = f.readline().split()          # header row with the column names
    values = [line.split() for line in f]   # one list of fields per sample
    d = {}
    for i in range(len(values[0])):
        d[i] = [v[i] for v in values]
data = pd.DataFrame(d)
# column 0 holds the sample identifiers; the remaining columns are the numeric features
npdata = data[data.columns[1:]].to_numpy()
npdata = np.array(npdata, dtype=int)
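
A quick sanity check on the loaded data (assuming, as the code above implies, that the first column of DM2016_org.txt holds sample identifiers and the remaining columns are numeric features):

In [ ]:
print(data.shape)    # samples x columns, identifier column included
print(npdata.shape)  # numeric feature matrix that is actually clustered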

Using the K-Means Algorithm


In [41]:
random_state = 170
nc = 30  # number of clusters
# precompute_distances has been removed from recent scikit-learn, so the defaults are used
y_pred = KMeans(n_clusters=nc, random_state=random_state).fit_predict(npdata)
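
A quick look at the resulting cluster sizes (a small check using the y_pred labels from the cell above):

In [ ]:
# number of samples assigned to each of the nc clusters
print(np.bincount(y_pred, minlength=nc))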

In [42]:
output = []
for i in range(nc):
    print(i)
    # collect the identifiers (column 0 of the data) of the samples assigned to cluster i
    members = data[data.columns[0]].iloc[np.where(y_pred == i)[0]]
    output.extend(members.values)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29

In [44]:
with open('Kmeans_30.txt', 'w') as f:
    for element in output:
        f.write('\n{}'.format(element))

Hierarchical Clustering Using SciPy


In [3]:
# build the hierarchical clustering linkage matrix with Ward's minimum-variance criterion
z = linkage(npdata, 'ward')

In [45]:
# cophenetic correlation: how faithfully the dendrogram preserves the original pairwise distances
c, coph_dists = cophenet(z, pdist(npdata))
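
The cophenetic correlation coefficient c can be printed directly; values close to 1 indicate that the Ward linkage preserves the original pairwise distances well (a minimal check):

In [ ]:
print(c)  # closer to 1.0 means the dendrogram distances track the raw pairwise distances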

In [42]:
z[0:10]


Out[42]:
array([[  85.     ,  810.     ,   66.89544,    2.     ],
       [ 456.     ,  989.     ,   67.02238,    2.     ],
       [ 143.     ,  930.     ,   67.97794,    2.     ],
       [ 789.     ,  956.     ,   68.08818,    2.     ],
       [ 813.     ,  945.     ,   68.70226,    2.     ],
       [ 671.     ,  852.     ,   68.89122,    2.     ],
       [ 282.     ,  977.     ,   69.07966,    2.     ],
       [ 286.     ,  805.     ,   69.16647,    2.     ],
       [ 168.     ,  348.     ,   69.46222,    2.     ],
       [ 758.     ,  779.     ,   69.88562,    2.     ]])

In [44]:
z[-4:, 2]  # distances of the four final merges


Out[44]:
array([ 22103.69068,  28154.02222,  29079.19447,  41237.43501])
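
The distances of the final merges grow steeply, which suggests only a few well-separated clusters near the top of the tree. A common heuristic, sketched here with the z computed above, is to look at the acceleration of the merge distances and take the k just before the largest jump:

In [ ]:
last = z[-10:, 2]                    # distances of the last 10 merges
acceleration = np.diff(last, 2)      # second derivative of the merge distances
k = acceleration[::-1].argmax() + 2  # number of clusters just before the largest jump
print(k)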

In [14]:
# calculate full dendrogram
plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')
dendrogram(
    z,
    leaf_rotation=90.,  # rotates the x axis labels
    leaf_font_size=8.,  # font size for the x axis labels
)
plt.show()



In [15]:
plt.title('Hierarchical Clustering Dendrogram (truncated)')
plt.xlabel('sample index or (cluster size)')
plt.ylabel('distance')
dendrogram(
    z,
    truncate_mode='lastp',  # show only the last p merged clusters
    p=12,                   # p: how many of the last merged clusters to show
    leaf_rotation=90.,
    leaf_font_size=12.,
    show_contracted=True,  # to get a distribution impression in truncated branches
)
plt.show()



In [16]:
def fancy_dendrogram(*args, **kwargs):
    max_d = kwargs.pop('max_d', None)
    if max_d and 'color_threshold' not in kwargs:
        kwargs['color_threshold'] = max_d
    annotate_above = kwargs.pop('annotate_above', 0)

    ddata = dendrogram(*args, **kwargs)

    if not kwargs.get('no_plot', False):
        plt.title('Hierarchical Clustering Dendrogram (truncated)')
        plt.xlabel('sample index or (cluster size)')
        plt.ylabel('distance')
        for i, d, c in zip(ddata['icoord'], ddata['dcoord'], ddata['color_list']):
            x = 0.5 * sum(i[1:3])
            y = d[1]
            if y > annotate_above:
                plt.plot(x, y, 'o', c=c)
                plt.annotate("%.3g" % y, (x, y), xytext=(0, -5),
                             textcoords='offset points',
                             va='top', ha='center')
        if max_d:
            plt.axhline(y=max_d, c='k')
    return ddata

In [17]:
fancy_dendrogram(
    z,
    truncate_mode='lastp',
    p=12,
    leaf_rotation=90.,
    leaf_font_size=12.,
    show_contracted=True,
    annotate_above=10,
    max_d=30000,  # plot a horizontal cut-off line
)
plt.show()
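
The max_d = 30000 line above only marks the cut-off visually; to obtain flat cluster labels at that height, SciPy's fcluster can be applied to the same linkage matrix (a sketch using the z and cut-off from above):

In [ ]:
from scipy.cluster.hierarchy import fcluster

max_d = 30000
# every pair of samples merged below max_d ends up in the same flat cluster
clusters = fcluster(z, max_d, criterion='distance')
print(np.unique(clusters).size)  # number of flat clusters at this cut-off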



In [ ]: