In [357]:
import numpy as np
from matplotlib import pylab as plt
import operator
import pandas as pd
%matplotlib inline
In [572]:
# Load the dating data set: tab-separated, no header row.
dating_df = pd.read_csv("data/datingTestSet.txt", delimiter="\t", header=None)
# Columns 0-2 are the numeric features; column 3 is the class label.
X = np.array(dating_df.iloc[:, :3])
Y = np.array(dating_df.iloc[:, 3:])
In [573]:
class KNN(object):
    """Minimal k-nearest-neighbours classifier (stateless; classmethods only)."""

    def __init__(self):
        # Bug fix: was `super(KNN, self).__init()` (missing trailing
        # underscores), which raised AttributeError on instantiation.
        super(KNN, self).__init__()

    @classmethod
    def normalize(cls, features):
        """Min-max scale each column of `features` into [0, 1].

        Returns (scaled_features, column_mins, column_maxs) so callers can
        apply the identical scaling to new query samples.
        """
        fmax = features.max(axis=0)
        fmin = features.min(axis=0)
        features = (features - fmin) / (fmax - fmin)
        return features, fmin, fmax

    @classmethod
    def classify(cls, features, labels, x, k=5):
        """Predict the label of sample `x` by majority vote of its k nearest rows.

        features -- (n, d) array of training samples
        labels   -- (n, 1) array of training labels
        x        -- (d,) query sample
        k        -- number of neighbours that vote

        Returns the winning label (a Python scalar).
        """
        n_samples = features.shape[0]
        # Squared Euclidean distance; skipping the sqrt preserves the ordering.
        diffs = np.tile(x, (n_samples, 1)) - features
        sq_dist = np.power(diffs, 2).sum(axis=1)
        nearest = sq_dist.argsort()[:k]
        votes = {}
        for i in nearest:
            # .item() replaces np.asscalar, deprecated and removed in NumPy 1.23.
            label = labels[i].item()
            votes[label] = votes.get(label, 0) + 1
        # .items() replaces Python-2-only .iteritems(); works on both versions.
        ranked = sorted(votes.items(), key=operator.itemgetter(1), reverse=True)
        return ranked[0][0]
In [564]:
# One-off sanity check: classify a single raw sample.
# NOTE(review): X is un-normalized here although KNN.normalize exists, so the
# large-scale first column dominates the distance — confirm this is intended.
q = np.array([7.00960000e+04,1.09659260e+01, 1.21232800e+00])
KNN.classify(X, Y, q, k=6)
Out[564]:
In [620]:
def test(features, labels, testNumber=200, k=3):
    """Estimate the kNN error rate on a hold-out split.

    The last `testNumber` samples are held out for testing; the classifier is
    trained on all preceding samples. Features are min-max normalized first so
    no single column dominates the distance.

    features   -- (n, d) array of samples
    labels     -- (n, 1) array of labels
    testNumber -- size of the held-out tail (default matches original: 200)
    k          -- neighbours per vote (default matches original: 3)

    Returns the fraction of held-out samples that were misclassified.
    """
    # Bug fix: the original referenced undefined globals `training_x` /
    # `training_y` (NameError at runtime) and drew test samples from indices
    # overlapping the training slice. Unused `tp2` / `datasize` removed.
    errors = 0
    features, fmin, fmax = KNN.normalize(features)
    for i in range(testNumber):
        idx = -testNumber + i  # index into the held-out tail
        predicted = KNN.classify(
            features[:-testNumber],
            labels[:-testNumber],
            features[idx],
            k=k)
        if predicted != labels[idx].item():
            errors += 1
    return float(errors) / testNumber
# Python-2 print; reports the hold-out error rate on the dating data.
print "error rate: ", test(X, Y)
In [623]:
def readDigits(path, sampleSize=10):
    """Load digit bitmap files into a feature matrix.

    Each file "<path><digit>_<sample>.txt" holds a 32x32 character bitmap,
    one row per line. Rows of X are the flattened 1024-character bitmaps;
    Y holds the digit label (0-9) for each row.

    path       -- directory prefix; must end with a path separator
    sampleSize -- number of sample files per digit

    Returns (X, Y) with shapes (10*sampleSize, 1024) and (10*sampleSize, 1).
    """
    # np.zeros instead of bare np.ndarray: the original left X uninitialized,
    # so rows skipped by the indexing bug below contained garbage memory.
    X = np.zeros((10 * sampleSize, 32 * 32))
    Y = np.zeros((10 * sampleSize, 1))
    for i in range(10):
        # Hoisted out of the inner loop (was redundantly repeated per sample).
        Y[i * sampleSize:(i + 1) * sampleSize] = i
        for s in range(sampleSize):
            # Bug fix: row index was `i + s`, which made distinct
            # (digit, sample) pairs overwrite the same row and left most
            # rows of X unwritten.
            row = i * sampleSize + s
            with open("{}{}_{}.txt".format(path, i, s)) as f:
                for c, line in enumerate(f):
                    colPos = c * 32
                    X[row, colPos:colPos + 32] = list(line.strip())
    return X, Y
# Smoke-test load: 2 samples per digit.
# NOTE: rebinds X and Y, shadowing the dating-set arrays loaded earlier.
X, Y = readDigits("data/trainingDigits/", 2)
In [675]:
def testDigits():
    """Classify held-out digit bitmaps with kNN and return the error rate.

    Trains on 150 samples per digit, tests on 3 per digit, and returns the
    fraction of test bitmaps that were misclassified.
    """
    X, Y = readDigits("data/trainingDigits/", 150)
    test_X, test_Y = readDigits("data/testDigits/", 3)
    # Renamed from the misleading `tp`: this counts misclassifications.
    # Unused `datasize` local removed.
    errors = 0
    for test_x, test_y in zip(test_X, test_Y):
        predicted = KNN.classify(X, Y, test_x, k=5)
        if predicted != test_y.item():
            errors += 1
    return float(errors) / len(test_X)
# Python-2 print; reports the digit-classification error rate.
print "error rate: ", testDigits()
In [686]:
# NOTE(review): this redefines testDigits(), silently shadowing the kNN
# version defined above — the earlier implementation is no longer callable.
def testDigits():
    X, Y = readDigits("data/trainingDigits/", 40)
    test_X, test_Y = readDigits("data/testDigits/", 2)
    # Import buried inside the function; sklearn is only used in this cell.
    from sklearn.neighbors import NearestNeighbors
    # NOTE(review): NearestNeighbors is unsupervised; the Y passed to fit()
    # is presumably ignored — verify against the sklearn API.
    nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(X, Y)
    print test_Y
    # Prints (distances, indices) of the 2 nearest training rows per test row.
    print nbrs.kneighbors(test_X)
# This function returns no value, so `None` is printed as the "error rate".
print "error rate: ", testDigits()
In [704]:
# SVD demo on a tall 4x2 matrix.
a = np.array([
    [53, 1],
    [2, 17],
    [4, 12],
    [2, 9]
])
# full_matrices=True: U is 4x4, S holds the 2 singular values, VT is 2x2.
U, S, VT = np.linalg.svd(a,full_matrices=True)
print U
print S
print VT
In [733]:
# SVD demo on a square 4x4 matrix (rebinds a, U, S, VT from the cell above).
a = np.array([
    [12, 12, 2, 1],
    [17, 17, 4, 9],
    [4, 12, 1, 5],
    [2, 9, 1, 1]
])
# For a square input, U, diag(S), and VT are all 4x4.
U, S, VT = np.linalg.svd(a,full_matrices=True)
print U
print S
print VT
In [737]:
# Rank-k reconstruction: U_k * diag(S)_k * VT_k. With k = 4 (full rank for
# the 4x4 `a` above) this reproduces `a` up to floating-point error.
# NOTE(review): U[:, :k] is an ndarray while np.mat(...) is np.matrix, so `*`
# becomes matrix multiplication only via the matrix operand — confirm this
# mixing is intended rather than np.dot throughout.
k = 4
U[:,:k] * np.mat(np.diag(S)[:k, :k]) * VT[:k,:]
Out[737]:
In [738]:
# First k left singular vectors (columns of U).
U[:,:k]
Out[738]:
In [739]:
# First k right singular vectors (rows of V^T).
VT[:k,:]
Out[739]:
In [729]:
# Singular values arranged as a k x k diagonal matrix.
np.diag(S)[:k, :k]
Out[729]:
In [746]:
# Elementwise product of two (4, 1) ndarrays — both operands are plain
# ndarrays here, so this is NOT a matrix product.
# NOTE(review): exploratory cell; purpose unclear.
U[:,:1] * a[:, :1]
Out[746]:
In [ ]: