In [13]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist
%matplotlib inline
np.set_printoptions(precision=5, suppress=True) # suppress scientific float notation
from matplotlib import pyplot as plt
In [2]:
# Parse the whitespace-delimited table: the first line is the header row,
# every following non-blank line is one record.
with open('DM2016_org.txt') as f:
    # split() with no argument collapses runs of whitespace and drops the
    # trailing newline that split(' ') left attached to the last header.
    headers = f.readline().split()
    # A list comprehension (rather than map) keeps the rows indexable on both
    # Python 2 and Python 3; blank lines are skipped instead of raising an
    # IndexError further down.
    values = [line.split() for line in f if line.strip()]

if not values:
    raise ValueError("'DM2016_org.txt' contains no data rows")

# Column-oriented dict keyed by column position, so the DataFrame's column
# labels are the integers 0..k-1 and every cell is still a string.
d = {i: [v[i] for v in values] for i in range(len(values[0]))}
data = pd.DataFrame(d)

# Every column except the first is converted to an integer matrix.
# NOTE(review): column 0 is presumably a record identifier - confirm against
# the data file.
# .values replaces DataFrame.as_matrix(), which newer pandas no longer has.
npdata = np.array(data[data.columns[1:]].values, dtype=int)
In [41]:
# Fixed seed so the cluster assignment is reproducible between runs.
random_state = 170
# Number of clusters requested from k-means.
nc = 30
# precompute_distances was only a speed/memory switch and has since been
# removed from scikit-learn, so it is no longer passed. n_init is pinned to
# the historical default of 10 because newer releases changed the default
# number of restarts.
y_pred = KMeans(
    n_clusters=nc,
    n_init=10,
    random_state=random_state,
).fit_predict(npdata)
In [42]:
# Gather the first-column value of every row, cluster by cluster: all members
# of cluster 0 first, then cluster 1, and so on up to nc - 1.
first_column = data[data.columns[0]].values
output = []
for i in range(nc):
    # Progress indicator; the parenthesised form prints the same text on
    # Python 2 and Python 3.
    print(i)
    # Boolean-mask selection replaces the nested map/lambda calls, which were
    # evaluated lazily (i.e. never) on Python 3.
    output.extend(first_column[y_pred == i])
In [44]:
# Persist the cluster-ordered entries. Each entry is written with a leading
# newline, so the file begins with an empty line and has no trailing newline;
# that format is kept unchanged.
# NOTE(review): the file name hard-codes 30 - keep it in sync with nc.
# The with-block guarantees the handle is closed even if a write fails.
with open('Kmeans_30.txt', 'w') as f:
    for element in output:
        f.write('\n{}'.format(element))
In [3]:
# Agglomerative (hierarchical) clustering of the feature rows using the
# 'ward' linkage method; z is the resulting linkage matrix.
z = linkage(npdata, method='ward')
In [45]:
# Compare the hierarchy in z with the original pairwise distances of the
# rows: c is the cophenetic correlation coefficient, coph_dists the
# cophenetic distances in condensed form.
c, coph_dists = cophenet(z, Y=pdist(npdata))
In [42]:
# Show the first ten rows of the linkage matrix (the earliest merges).
z[:10]
Out[42]:
In [44]:
# Show column 2 of the last four rows of the linkage matrix, i.e. the
# distances at which the final four merges happened.
z[-4:, 2]
Out[44]:
In [14]:
# Draw the complete dendrogram on a wide figure.
plt.figure(figsize=(25, 10))
plt.ylabel('distance')
plt.xlabel('sample index')
plt.title('Hierarchical Clustering Dendrogram')
# Leaf labels on the x axis are rotated by 90 degrees and set in font size 8.
dendrogram(z, leaf_font_size=8.0, leaf_rotation=90.0)
plt.show()
In [15]:
# Condensed dendrogram: only the last 12 merged clusters are drawn.
plt.ylabel('distance')
plt.xlabel('sample index or (cluster size)')
plt.title('Hierarchical Clustering Dendrogram (truncated)')
dendrogram(
    z,
    p=12,  # number of merged clusters kept by truncate_mode='lastp'
    truncate_mode='lastp',
    show_contracted=True,  # gives an impression of the distribution inside truncated branches
    leaf_font_size=12.0,
    leaf_rotation=90.0,
)
plt.show()
In [16]:
def fancy_dendrogram(*args, **kwargs):
    """Draw a scipy dendrogram and annotate its merge heights.

    All positional and keyword arguments are forwarded unchanged to
    ``scipy.cluster.hierarchy.dendrogram``, except for two extra keywords
    that are consumed here:

    max_d : number, optional
        Height of a horizontal cut-off line. When given (and truthy) it is
        also used as ``color_threshold`` unless the caller supplied one.
    annotate_above : number, optional (default 0)
        Only merges whose height is strictly greater than this value get a
        marker and a text label.

    Returns
    -------
    dict
        The dictionary returned by ``dendrogram``. With ``no_plot=True``
        nothing is drawn and matplotlib is not touched by this function.
    """
    max_d = kwargs.pop('max_d', None)
    if max_d and 'color_threshold' not in kwargs:
        kwargs['color_threshold'] = max_d
    annotate_above = kwargs.pop('annotate_above', 0)

    ddata = dendrogram(*args, **kwargs)

    if kwargs.get('no_plot', False):
        return ddata

    # Decorate the axes the dendrogram was drawn on: the caller's ``ax`` if
    # one was forwarded, otherwise the current pyplot axes (the previous
    # behaviour, which ignored ``ax`` and always used the current axes).
    ax = kwargs.get('ax')
    if ax is None:
        ax = plt.gca()

    ax.set_title('Hierarchical Clustering Dendrogram (truncated)')
    ax.set_xlabel('sample index or (cluster size)')
    ax.set_ylabel('distance')

    for xs, ys, color in zip(ddata['icoord'], ddata['dcoord'], ddata['color_list']):
        # Each link is described by four points; the middle two form its
        # horizontal bar, so their mean x and shared y mark the merge.
        x = 0.5 * sum(xs[1:3])
        y = ys[1]
        if y > annotate_above:
            ax.plot(x, y, 'o', c=color)
            ax.annotate("%.3g" % y, (x, y), xytext=(0, -5),
                        textcoords='offset points',
                        va='top', ha='center')

    if max_d:
        ax.axhline(y=max_d, c='k')
    return ddata
In [17]:
# Truncated dendrogram (last 12 merged clusters) with annotated merge heights
# and a horizontal cut-off line.
fancy_dendrogram(
    z,
    p=12,
    truncate_mode='lastp',
    show_contracted=True,
    leaf_font_size=12.0,
    leaf_rotation=90.0,
    max_d=30000,  # height of the horizontal cut-off line
    annotate_above=10,  # only merges higher than this get a label
)
plt.show()
In [ ]: