Given a set $X$ of $n$ observations, $X = \{x_1, x_2, \cdots, x_n\}$, where each observation $x_i$ is a vector of $d$ measurements ($x_i \in \mathbb{R}^d$), K-means clustering aims to partition the observations into a set of $k$ clusters $C = \{c_1, c_2, \cdots, c_k\}$ that minimizes the following expression, where $\boldsymbol\mu_i$ is the mean of the observations assigned to cluster $c_i$:
$ \displaystyle \underset{C}{\operatorname{arg\,min}} \sum_{i=1}^{k} \sum_{\mathbf x \in c_i} \left\| \mathbf x - \boldsymbol\mu_i \right\|^2 $
In [1]:
%pylab inline
import matplotlib
#matplotlib.rc('xtick', labelsize=20)
#matplotlib.rc('ytick', labelsize=20)
from scipy.spatial import distance
Consider the following data set, consisting of the scores of two variables on each of 17 experiments:
In [2]:
# Read the observations from data.txt; text following '//' is treated as a comment.
x = np.loadtxt("data.txt", comments='//')
# Report the dimensions of the loaded array.
print(x.shape)
In [3]:
# Scatter the raw observations: column 0 of x against column 1.
plt.scatter(x[:, 0], x[:, 1], color='black')
# Label the axes and use the same fixed window on both of them.
plt.xlabel('Measurement 1')
plt.ylabel('Measurement 2')
plt.xlim(-0.5, 10)
plt.ylim(-0.5, 10);
This data set is to be grouped into two clusters. As a first step in finding a sensible initial partition, let the values of measurements 1 and 2 of two individual experiments (here the first and the fourth) define the initial centroids of the two clusters:
In [4]:
# Seed the two centroids with the first and the fourth experiment.
centroid1, centroid2 = x[0], x[3]
print(centroid1, centroid2)
In [5]:
# Euclidean distance from every observation to centroid 1, in row order.
sample1 = [distance.euclidean(point, centroid1) for point in x]
# Print one "index -> distance" line per observation.
for row, d1 in enumerate(sample1):
    print("%2d -> %6f" %(row, d1))
In [6]:
# Euclidean distance from every observation to centroid 2, in row order.
sample2 = [distance.euclidean(point, centroid2) for point in x]
# Print one "index -> distance" line per observation.
for row, d2 in enumerate(sample2):
    print("%2d -> %6f" %(row, d2))
In [7]:
# Cluster 1: observations at least as close to centroid 1 as to centroid 2
# (ties are assigned here).
group1 = x[np.less_equal(sample1, sample2)]
print(group1)
In [8]:
# Cluster 2: observations strictly closer to centroid 2 than to centroid 1.
group2 = x[np.less(sample2, sample1)]
print(group2)
In [9]:
# Plot the current partition: observations colored by cluster
# (magenta = group1, cyan = group2) and the two centroids as stars.
# Each cluster is drawn with a single vectorized call instead of one
# plt.plot call (and one line artist) per observation.
plt.plot(group1[:, 0], group1[:, 1], 'o', color='magenta')
plt.plot(group2[:, 0], group2[:, 1], 'o', color='cyan')
plt.plot(centroid1[0], centroid1[1], '*', color='magenta', ms=12)
plt.plot(centroid2[0], centroid2[1], '*', color='cyan', ms=12)
# Same fixed window and axis labels as the plot of the raw observations.
plt.xlim(-0.5, 10)
plt.ylim(-0.5, 10)
plt.xlabel('Measurement 1')
plt.ylabel('Measurement 2');
In [10]:
# Update step: each centroid becomes the column-wise mean of its cluster.
centroid1 = np.mean(group1, axis=0)
centroid2 = np.mean(group2, axis=0)
print(centroid1, centroid2)
In [11]:
# Euclidean distance from every observation to the updated centroid 1.
sample1 = [distance.euclidean(point, centroid1) for point in x]
# Print one "index -> distance" line per observation.
for row, d1 in enumerate(sample1):
    print("%2d -> %6f" %(row, d1))
In [12]:
# Euclidean distance from every observation to the updated centroid 2.
sample2 = [distance.euclidean(point, centroid2) for point in x]
# Print one "index -> distance" line per observation.
for row, d2 in enumerate(sample2):
    print("%2d -> %6f" %(row, d2))
In [13]:
# Reassign cluster 1: observations at least as close to centroid 1 as to
# centroid 2 (ties are assigned here).
group1 = x[np.less_equal(sample1, sample2)]
print(group1)
In [14]:
# Reassign cluster 2: observations strictly closer to centroid 2 than to centroid 1.
group2 = x[np.less(sample2, sample1)]
print(group2)
In [15]:
# Plot the reassigned partition together with the current centroids:
# observations colored by cluster (magenta = group1, cyan = group2),
# centroids drawn as stars in the matching color.
# Each cluster is drawn with a single vectorized call instead of one
# plt.plot call (and one line artist) per observation.
plt.plot(group1[:, 0], group1[:, 1], 'o', color='magenta')
plt.plot(group2[:, 0], group2[:, 1], 'o', color='cyan')
plt.plot(centroid1[0], centroid1[1], '*', color='magenta', ms=12)
plt.plot(centroid2[0], centroid2[1], '*', color='cyan', ms=12)
# Fixed window so successive iterations can be compared visually.
plt.xlim(-0.5, 10)
plt.ylim(-0.5, 10)
plt.xlabel('Measurement 1')
plt.ylabel('Measurement 2');
In [16]:
# Update step: each centroid becomes the column-wise mean of its cluster.
centroid1 = np.mean(group1, axis=0)
centroid2 = np.mean(group2, axis=0)
print(centroid1, centroid2)
In [17]:
# Assignment step, done in one cell: measure every observation against both
# current centroids ...
sample1 = [distance.euclidean(point, centroid1) for point in x]
sample2 = [distance.euclidean(point, centroid2) for point in x]
# ... and hand each observation to the nearer centroid. Ties go to group1;
# group2 takes only the observations strictly closer to centroid 2.
group1 = x[np.less_equal(sample1, sample2)]
group2 = x[np.less(sample2, sample1)]
In [18]:
# Plot the latest partition together with the current centroids:
# observations colored by cluster (magenta = group1, cyan = group2),
# centroids drawn as stars in the matching color.
# Each cluster is drawn with a single vectorized call instead of one
# plt.plot call (and one line artist) per observation.
plt.plot(group1[:, 0], group1[:, 1], 'o', color='magenta')
plt.plot(group2[:, 0], group2[:, 1], 'o', color='cyan')
plt.plot(centroid1[0], centroid1[1], '*', color='magenta', ms=12)
plt.plot(centroid2[0], centroid2[1], '*', color='cyan', ms=12)
# Fixed window so successive iterations can be compared visually.
plt.xlim(-0.5, 10)
plt.ylim(-0.5, 10)
plt.xlabel('Measurement 1')
plt.ylabel('Measurement 2');
In [19]:
# Update step: each centroid becomes the column-wise mean of its cluster.
centroid1 = np.mean(group1, axis=0)
centroid2 = np.mean(group2, axis=0)
print(centroid1, centroid2)