In [13]:
import os
import struct
import numpy as np

In [14]:
def read(dataset="training", path="."):
    """Load an MNIST-format split from idx files under `path`.

    Parameters
    ----------
    dataset : 'training' or 'testing' — selects the idx file pair to read.
    path : directory containing the idx files.

    Returns
    -------
    (lbl, img) : lbl is a 1-D int8 array of labels; img is a
    (len(lbl), rows, cols) uint8 array with one image per label.

    Raises
    ------
    ValueError if dataset is neither 'training' nor 'testing'.
    """
    # Fix: the original compared strings with `is`, which relies on CPython
    # string interning and emits SyntaxWarning on modern Python; `==`
    # compares values.
    if dataset == "training":
        fname_img = os.path.join(path, 'train-images.idx3-ubyte')
        fname_lbl = os.path.join(path, 'train-labels.idx1-ubyte')
    elif dataset == "testing":
        fname_img = os.path.join(path, 't10k-images.idx3-ubyte')
        fname_lbl = os.path.join(path, 't10k-labels.idx1-ubyte')
    else:
        raise ValueError("dataset must be 'testing' or 'training'")

    with open(fname_lbl, 'rb') as flbl:
        # idx1 header: magic number + item count, big-endian uint32 each.
        magic, num = struct.unpack(">II", flbl.read(8))
        lbl = np.fromfile(flbl, dtype=np.int8)

    with open(fname_img, 'rb') as fimg:
        # idx3 header: magic, image count, rows, cols.
        magic, num, rows, cols = struct.unpack(">IIII", fimg.read(16))
        img = np.fromfile(fimg, dtype=np.uint8).reshape(len(lbl), rows, cols)

    return lbl, img

In [15]:
# Load the MNIST training and test splits (labels + raw 28x28 images).
trainLbl, trainData = read(dataset="training")
testLbl, testData = read(dataset="testing")

In [16]:
# Flatten each training image into one row vector (60000 x 784).
# Fixes: the original's first line flattened trainData[0] and discarded the
# result (dead code, removed), and rebuilt the matrix one row at a time; a
# single reshape does the same thing vectorized.  (reshape returns a view
# of trainData; no visible cell mutates `data` in place — confirm if new
# in-place edits are added later.)
data = trainData.reshape(trainLbl.size, -1)
data.shape


Out[16]:
(60000, 784)

In [17]:
def getRandomPoints(data, n):
    """Draw n rows of `data` uniformly at random, with replacement.

    One np.random.randint call is made per draw, so the global RNG is
    consumed exactly as in a per-sample loop.  Returns an (n, d) array.
    """
    picks = [data[np.random.randint(0, data.shape[0])] for _ in range(n)]
    return np.array(picks)

In [18]:
def createClusters(means, data):
    """Partition `data` points by nearest mean (Euclidean distance).

    Parameters
    ----------
    means : (k, d) array of current cluster centers.
    data  : iterable of (d,) points to assign.

    Returns
    -------
    list of k arrays; entry i stacks every point whose nearest center is
    means[i] (an empty array when no point maps to that center).  Ties go
    to the lowest index, as np.argmin does.
    """
    k = means.shape[0]
    buckets = [[] for _ in range(k)]
    for point in data:
        # Distances to all centers in one vectorized call — numerically
        # identical to the original per-center norm loop, without the
        # inner Python loop.
        distances = np.linalg.norm(point - means, axis=1)
        buckets[int(np.argmin(distances))].append(point)
    return [np.array(bucket) for bucket in buckets]

In [19]:
def getMeans(V):
    """Return the centroid (element-wise mean) of each cluster in V.

    V is a sequence of 2-D arrays; the result stacks one mean vector per
    cluster.  NOTE(review): an empty cluster yields a NaN mean and a numpy
    warning — confirm upstream guarantees non-empty clusters.
    """
    return np.array([np.mean(cluster, axis=0) for cluster in V])

In [20]:
def kMeans(lbl, data, I = 1000, k = 10):
    """Run Lloyd's k-means on `data` with k randomly sampled initial centers.

    Iterates at most I times, stopping early once the centers stop moving,
    then assigns a digit label to each final center via labelMeans.
    Returns (means, lblMeans).
    """
    means = getRandomPoints(data, k)
    for iteration in range(I):
        previous = means
        means = getMeans(createClusters(means, data))
        print(iteration)  # progress indicator (kept from the original)
        if np.array_equal(means, previous):
            break
    lblMeans = labelMeans(lbl, data, means)
    return means, lblMeans

In [21]:
def getClusterID(point, means):
    """Return the index of the cluster center nearest to a single point.

    Bug fix: the original passed the 1-D point to createClusters, whose
    `for d in data` then iterated over the point's scalar COMPONENTS
    rather than treating it as one point, producing a wrong cluster id.
    Computing the nearest mean directly is correct and cheaper.

    Returns -1 when `means` is empty (mirrors the original's fallback).
    """
    if means.shape[0] == 0:
        return -1
    distances = np.linalg.norm(point - means, axis=1)
    return int(np.argmin(distances))

In [22]:
def labelMeans(lbl, data, means):
    """Assign a digit label to each cluster center.

    For every training point, find its nearest center (via getCluster) and
    accumulate the point's true label; a center's label is the rounded
    average of the labels assigned to it.

    Fixes
    -----
    * accumulate with Python ints (`int(lbl[k])`): the original added
      np.int8 scalars, which can silently overflow past 127 when summing
      thousands of labels;
    * guard empty clusters: the original raised ZeroDivisionError when no
      point mapped to a center; such centers are now labeled -1.
    """
    k = means.shape[0]
    res = [0] * k
    count = [0] * k

    for i in range(lbl.size):
        c = getCluster(means, data[i])
        count[c] += 1
        res[c] += int(lbl[i])  # promote to Python int to avoid int8 overflow

    for i in range(k):
        if count[i] == 0:
            res[i] = -1.0  # no points assigned -> no meaningful label
        else:
            res[i] = round(res[i] / count[i], 0)

    return np.array(res)

In [23]:
def getCluster(means, data):
    """Return the index of the mean nearest to `data` (Euclidean distance).

    Parameters
    ----------
    means : (k, d) array of cluster centers.
    data  : single flattened point of shape (d,).

    Ties resolve to the lowest index (np.argmin).  Fixes: removed the
    original's dead code (a never-used list of empty buckets) and replaced
    the per-center Python loop with one numerically identical vectorized
    norm call.
    """
    distances = np.linalg.norm(data - means, axis=1)
    return np.argmin(distances)

In [26]:
# Cluster the 60000 training vectors into 100 clusters, up to 40 iterations.
means, lblMeans = kMeans(trainLbl, data, 40, 100)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39

In [28]:
# Count test images whose cluster's label disagrees with the true label.
count = 0
for i in range(testLbl.size):
    nearest = getCluster(means, np.ndarray.flatten(testData[i]))
    if lblMeans[nearest] != testLbl[i]:
        count += 1

In [27]:
# Inspect the digit label assigned to each of the 100 cluster centers.
lblMeans


Out[27]:
array([ 7.,  5.,  0.,  0.,  8.,  6.,  6.,  2.,  9.,  2.,  2.,  4.,  5.,
        6.,  2.,  6.,  5.,  3.,  4.,  3.,  7.,  5.,  6.,  7.,  0.,  3.,
        4.,  7.,  8.,  4.,  3.,  6.,  8.,  3.,  0.,  2.,  3.,  5.,  4.,
        3.,  0.,  5.,  2.,  2.,  6.,  7.,  8.,  3.,  6.,  3.,  7.,  3.,
        5.,  6.,  0.,  0.,  4.,  1.,  6.,  7.,  2.,  1.,  4.,  5.,  0.,
        7.,  8.,  5.,  8.,  2.,  5.,  6.,  9.,  7.,  6.,  6.,  7.,  0.,
        4.,  0.,  1.,  7.,  8.,  7.,  2.,  5.,  1.,  6.,  2.,  7.,  1.,
        6.,  8.,  6.,  6.,  5.,  8.,  5.,  1.,  0.])

In [29]:
# Number of misclassified test images.
count


Out[29]:
2364

In [34]:
# Misclassification rate in percent.  Use the actual test-set size instead
# of the magic number 10000 so the cell generalizes to other splits.
error_rate = count / testLbl.size * 100

In [35]:
# Test error rate (percent).
error_rate


Out[35]:
23.65

In [39]:
def kMeans2(lbl, pre_means, data, I = 1000):
    """Continue Lloyd's algorithm from caller-supplied initial centers.

    Identical to kMeans except the initial means come from `pre_means`
    instead of random sampling.  Returns (means, lblMeans).
    """
    means = pre_means
    for iteration in range(I):
        previous = means
        means = getMeans(createClusters(means, data))
        print(iteration)  # progress indicator (kept from the original)
        if np.array_equal(means, previous):
            break
    lblMeans = labelMeans(lbl, data, means)
    return means, lblMeans

In [40]:
# Refine the existing centers with up to 40 additional Lloyd iterations.
means, lblMeans = kMeans2(trainLbl, means, data, 40)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39

In [41]:
# Cluster labels after the additional refinement iterations.
lblMeans


Out[41]:
array([ 7.,  5.,  0.,  0.,  8.,  6.,  6.,  2.,  9.,  2.,  2.,  4.,  5.,
        6.,  2.,  6.,  5.,  3.,  4.,  3.,  7.,  5.,  6.,  7.,  0.,  3.,
        4.,  8.,  7.,  4.,  3.,  7.,  8.,  3.,  0.,  2.,  3.,  5.,  4.,
        3.,  0.,  5.,  2.,  2.,  6.,  7.,  8.,  3.,  6.,  3.,  6.,  3.,
        5.,  6.,  0.,  0.,  4.,  1.,  6.,  7.,  2.,  1.,  4.,  5.,  0.,
        7.,  8.,  5.,  8.,  2.,  5.,  6.,  9.,  7.,  6.,  5.,  7.,  0.,
        4.,  0.,  1.,  7.,  8.,  7.,  2.,  5.,  1.,  6.,  2.,  7.,  2.,
        6.,  8.,  6.,  6.,  5.,  8.,  5.,  1.,  0.])

In [42]:
# Re-count misclassified test images against the refined centers.
count = 0
for i in range(testLbl.size):
    nearest = getCluster(means, np.ndarray.flatten(testData[i]))
    if lblMeans[nearest] != testLbl[i]:
        count += 1

In [43]:
# Misclassification rate in percent (actual test-set size, not magic 10000).
error_rate = count / testLbl.size * 100
error_rate


Out[43]:
24.16

In [ ]: