k-means algorithm

How to use Kmeans?

Kmeans(number_of_points, dimentions)

Parameters:

- number_of_points - the number of points to be generated randomally.

- dimentions [optional] - the number of dimentions for each point (2=2d, 3=3d)

Attributes:

- Kmeans.cluster(number_of_clusters)

    number_of_clusters - the number of clusters to look for

Examples

# instantiate the class with 200 random points
clusters = Kmeans(200)

# find 5 clusters
clusters.cluster(5)

In [29]:
import numpy as np
import matplotlib.pyplot as plt

class Kmeans:

    def __init__(self, number_of_points, dimentions=2):
        self.number_of_points = number_of_points
        self.dimentions = dimentions
        
        self.points = np.zeros((number_of_points, dimentions))
        for i in range(number_of_points):
            self.points[i] = (np.random.randint(100, size=1)[0],np.random.randint(100, size=1)[0])

    
    
    def __init_centers(self):
        self.centers = np.zeros((self.number_of_clusters, self.dimentions))
        
        for i in range(self.number_of_clusters):
            self.centers[i] = (np.random.randint(100, size=1)[0],np.random.randint(100, size=1)[0])

    
    
    def __init_assign_labels(self):
        for p in range(self.number_of_points):
            closest_distance = np.sqrt(np.sum((self.points[p]-self.centers[0])**2))
            for c in range(self.number_of_clusters):
                if closest_distance > np.sqrt(np.sum((self.points[p]-self.centers[c])**2)): # could also be done using np.linalg.norm
                    self.labels[p] = c
                    closest_distance = np.sqrt(np.sum((self.points[p]-self.centers[c])**2))
        
        return
    
    
    def __assign_labels(self, old_labels):
        
        self.labels_prev = np.copy(self.labels)
        
        for p in range(self.number_of_points):
            closest_distance = np.sqrt(np.sum((self.points[p]-self.labels[p])**2))
            for c in range(self.number_of_clusters):
                if closest_distance > np.sqrt(np.sum((self.points[p]-self.centers[c])**2)):
                    self.labels[p] = c
                    closest_distance = np.sqrt(np.sum((self.points[p]-self.centers[c])**2))

        compare = self.labels_prev == self.labels
        is_no_change = True
        
        for c in compare:
            if c == False:
                is_no_change = False
                break
                
        return is_no_change

        
    def __recenter(self):
        
        new_centers = np.zeros(self.centers.shape)
        cluster_points_count = np.zeros(self.number_of_clusters)

        for c in range(self.number_of_points):
            new_centers[int(self.labels[c])] = new_centers[int(self.labels[c])] + self.points[c]
            cluster_points_count[int(self.labels[c])]+=1
            
        for c in range (self.number_of_clusters):
            self.centers[c] = new_centers[c] / cluster_points_count[c]
        
        return
    
    
    
    def cluster(self, number_of_clusters=2):
        self.number_of_clusters = number_of_clusters
        self.__init_centers()
        self.labels = np.zeros(self.number_of_points)
        self.labels_prev = np.zeros(self.number_of_points)
        self.__init_assign_labels()
        
        end = False
        
        while end == False:
            self.__recenter()
            end = self.__assign_labels(self.labels)

        return self.labels, self.centers

In [74]:
clusters = Kmeans(800)
labels, centers = clusters.cluster(7)

plt.figure(0).set_size_inches(8,8)
plt.scatter(clusters.points[:,0], clusters.points[:,1], c=labels, alpha=0.6, s=40, cmap='Set2')
plt.scatter(centers[:,0], centers[:,1], c='red', s=120, marker='+')
plt.show()



In [ ]: