In [13]:
import os
import struct
import numpy as np
In [14]:
def read(dataset="training", path="."):
    """Load one split of the MNIST dataset from IDX files.

    Parameters
    ----------
    dataset : str
        Either "training" or "testing"; selects which IDX file pair to read.
    path : str
        Directory containing the four MNIST IDX files.

    Returns
    -------
    (lbl, img) : tuple of np.ndarray
        lbl has shape (N,) and dtype int8; img has shape (N, rows, cols)
        and dtype uint8.

    Raises
    ------
    ValueError
        If dataset is neither "training" nor "testing".
    """
    # BUG FIX: the original compared with `dataset is "training"`, which
    # tests object identity, not string equality -- it only works by
    # accident of CPython string interning.  Use == for strings.
    if dataset == "training":
        fname_img = os.path.join(path, 'train-images.idx3-ubyte')
        fname_lbl = os.path.join(path, 'train-labels.idx1-ubyte')
    elif dataset == "testing":
        fname_img = os.path.join(path, 't10k-images.idx3-ubyte')
        fname_lbl = os.path.join(path, 't10k-labels.idx1-ubyte')
    else:
        raise ValueError("dataset must be 'testing' or 'training'")
    with open(fname_lbl, 'rb') as flbl:
        # IDX label header: magic number and item count, big-endian uint32.
        magic, num = struct.unpack(">II", flbl.read(8))
        lbl = np.fromfile(flbl, dtype=np.int8)
    with open(fname_img, 'rb') as fimg:
        # IDX image header: magic, count, rows, cols, big-endian uint32.
        magic, num, rows, cols = struct.unpack(">IIII", fimg.read(16))
        img = np.fromfile(fimg, dtype=np.uint8).reshape(len(lbl), rows, cols)
    return lbl, img
In [15]:
# Load the MNIST training and test splits from the IDX files in the cwd.
trainLbl, trainData = read()
testLbl, testData = read('testing')
In [16]:
# Flatten each 28x28 training image into a feature vector of length 784.
# Cleanup: the original had a dead statement (`np.ndarray.flatten(trainData[0])`
# whose result was discarded) and built the matrix with a Python-level loop;
# one vectorized reshape yields the same (N, rows*cols) uint8 matrix.
data = trainData.reshape(trainLbl.size, -1)
data.shape
Out[16]:
In [17]:
def getRandomPoints(data, n):
    """Draw n rows of `data` uniformly at random, with replacement.

    Used to pick the initial cluster means for k-means.

    Parameters
    ----------
    data : np.ndarray of shape (N, d).
    n : int, number of points to draw.

    Returns
    -------
    np.ndarray of shape (n, d) containing the sampled rows.
    """
    # Vectorized fancy indexing replaces the original append loop: one
    # randint call draws all n indices at once.  Also returns a correctly
    # shaped empty (0, d) array for n == 0.
    idx = np.random.randint(0, data.shape[0], size=n)
    return data[idx]
In [18]:
def createClusters(means, data):
    """Partition `data` into k clusters by nearest mean (Euclidean norm).

    Parameters
    ----------
    means : np.ndarray of shape (k, d), one centroid per row.
    data : iterable of d-vectors (e.g. an (N, d) array).

    Returns
    -------
    list of k np.ndarray
        Entry i holds the points whose closest centroid (ties broken
        toward the lowest index) is means[i]; empty clusters yield an
        empty array.
    """
    k = means.shape[0]
    buckets = [[] for _ in range(k)]
    for point in data:
        # Distance from this point to every centroid.
        dists = np.array([np.linalg.norm(point - m) for m in means])
        buckets[np.argmin(dists)].append(point)
    return [np.array(bucket) for bucket in buckets]
In [19]:
def getMeans(V):
    """Compute the centroid of every cluster.

    Parameters
    ----------
    V : sequence of 2-D arrays, one per cluster.

    Returns
    -------
    np.ndarray whose i-th row is the column-wise mean of V[i].
    """
    return np.array([np.mean(cluster, axis=0) for cluster in V])
In [20]:
def kMeans(lbl, data, I=1000, k=10):
    """Run Lloyd's k-means on `data`, then label each centroid by the
    rounded average true label of its cluster.

    Parameters
    ----------
    lbl : np.ndarray of true labels, shape (N,).
    data : np.ndarray of flattened samples, shape (N, d).
    I : int, maximum number of iterations.
    k : int, number of clusters.

    Returns
    -------
    (means, lblMeans)
        means : np.ndarray of centroids, shape (k, d).
        lblMeans : np.ndarray of per-centroid labels, shape (k,).
    """
    means = getRandomPoints(data, k)
    for i in range(I):
        prev_means = means
        # Reassign every point to its nearest centroid, then recompute
        # the centroids.  (Cleanup: removed the leftover debug print(i)
        # that flooded the notebook output on every iteration.)
        V = createClusters(means, data)
        means = getMeans(V)
        # getMeans returns a fresh array, so comparing with the previous
        # iterate detects an exact fixed point (convergence).
        # NOTE(review): an empty cluster makes np.mean produce a NaN
        # centroid -- behavior unchanged from the original; confirm
        # whether re-seeding empty clusters is wanted.
        if np.array_equal(means, prev_means):
            break
    lblMeans = labelMeans(lbl, data, means)
    return means, lblMeans
In [21]:
def getClusterID(point, means):
    """Return the index of the centroid nearest to a single `point`,
    or -1 when `means` is empty.

    BUG FIX: the original fed the single point to createClusters, whose
    `for d in data` loop then iterated over the point's individual scalar
    components instead of treating it as one sample, producing a
    meaningless assignment.  Compute the nearest centroid directly.
    """
    if means.shape[0] == 0:
        return -1
    dists = [np.linalg.norm(point - m) for m in means]
    return int(np.argmin(dists))
In [22]:
def labelMeans(lbl, data, means):
    """Assign a digit label to each centroid: the rounded mean of the
    true labels of every sample that falls into that centroid's cluster.

    Parameters
    ----------
    lbl : np.ndarray of int labels, shape (N,).
    data : np.ndarray of samples, shape (N, d).
    means : np.ndarray of centroids, shape (k, d).

    Returns
    -------
    np.ndarray of k rounded average labels (floats); 0.0 for a cluster
    that received no samples.
    """
    k = means.shape[0]
    sums = [0] * k
    counts = [0] * k
    for i in range(lbl.size):
        c = getCluster(means, data[i])
        counts[c] += 1
        # BUG FIX: accumulate as a Python int.  The original did
        # `res[c] += lbl[k]` with labels loaded as np.int8, so the
        # running sum stayed int8 and silently wrapped once it passed
        # 127 -- corrupting every cluster label on real MNIST sizes.
        sums[c] += int(lbl[i])
    res = []
    for s, n in zip(sums, counts):
        # Guard against empty clusters (original divided by zero).
        res.append(round(s / n, 0) if n > 0 else 0.0)
    return np.array(res)
In [23]:
def getCluster(means, data):
    """Return the index of the centroid in `means` nearest (Euclidean)
    to the single sample `data`; ties go to the lowest index.

    Cleanup: removed the dead `result` list-of-lists, copy-pasted from
    createClusters and never used.
    """
    dists = np.array([np.linalg.norm(data - m) for m in means])
    return np.argmin(dists)
In [26]:
means, lblMeans = kMeans(trainLbl, data, 40, 100)
In [28]:
# Count how many test digits the labelled centroids misclassify:
# each test image is flattened, matched to its nearest centroid, and
# compared against that centroid's assigned label.
count = 0
for i in range(testLbl.size):
    num = getCluster(means, testData[i].flatten())
    count += int(lblMeans[num] != testLbl[i])
In [27]:
# Display the digit label assigned to each centroid.
lblMeans
Out[27]:
In [29]:
# Number of misclassified test samples.
count
Out[29]:
In [34]:
# Misclassification rate on the test split, as a percentage.
# Generalized: use the actual test-set size instead of the magic
# number 10000 (the two coincide for standard MNIST).
error_rate = count / testLbl.size * 100
In [35]:
# Show the test error rate (percent).
error_rate
Out[35]:
In [39]:
def kMeans2(lbl, pre_means, data, I=1000):
    """Continue Lloyd's k-means from an existing set of centroids.

    Identical to kMeans except that the initial centroids are supplied
    by the caller (`pre_means`) instead of sampled at random, so a
    previous run can be resumed for more iterations.

    NOTE(review): this duplicates kMeans almost line for line; the two
    could be merged by giving kMeans an optional initial-means argument.

    Returns (means, lblMeans), as kMeans does.
    """
    means = pre_means
    for i in range(I):
        prev_means = means
        V = createClusters(means, data)
        means = getMeans(V)
        # Stop once the centroids reach an exact fixed point.
        # (Cleanup: removed the leftover per-iteration debug print(i).)
        if np.array_equal(means, prev_means):
            break
    lblMeans = labelMeans(lbl, data, means)
    return means, lblMeans
In [40]:
# Resume k-means from the previous centroids for up to 40 more iterations.
means, lblMeans = kMeans2(trainLbl, means, data, 40)
In [41]:
# Centroid labels after the additional iterations.
lblMeans
Out[41]:
In [42]:
# Re-evaluate the refined centroids on the test split: count the test
# digits whose nearest centroid carries the wrong label.
count = 0
for i in range(testLbl.size):
    num = getCluster(means, testData[i].flatten())
    count += int(lblMeans[num] != testLbl[i])
In [43]:
# Test error rate (percent) after the extra iterations; use the real
# test-set size instead of the hard-coded 10000.
error_rate = count / testLbl.size * 100
error_rate
Out[43]:
In [ ]: