In [357]:
import numpy as np
from matplotlib import pylab as plt
import operator
import pandas as pd
%matplotlib inline

In [572]:
df = pd.read_csv("data/datingTestSet.txt", delimiter="\t", header=None)
X = np.array(df.iloc[:, :3])
Y = np.array(df.iloc[:, 3:])

In [573]:
class KNN(object):
    def __init__(self):
        super(KNN, self).__init()

    @classmethod
    def normalize(cls, features):
        fmax = features.max(axis=0)
        fmin = features.min(axis=0)
        features = (features - fmin) / (fmax - fmin)
        return features, fmin, fmax
    
    @classmethod
    def classify(cls, features, labels, x, k=5):
        datesize = features.shape
        distances = np.tile(x, (datesize[0], 1)) - features
        distances = np.power(distances, 2)

        minDistanceIDs = distances.sum(axis=1).argsort()[:k]
        neighbors = {}

        for i in minDistanceIDs:
            label = np.asscalar(labels[i])
            neighbors[label] = neighbors.get(label, 0) + 1
        #print neighbors
        sortedClassCount = sorted(
            neighbors.iteritems(), key=operator.itemgetter(1), reverse=True)
        return sortedClassCount[0][0]

In [564]:
q = np.array([7.00960000e+04,1.09659260e+01, 1.21232800e+00])
KNN.classify(X, Y, q, k=6)


Out[564]:
'didntLike'

In [620]:
def test(features, labels):
    tp = 0
    tp2 = 0
    datasize = features.shape
    features, fmin, fmax = KNN.normalize(features)
    testNumber = 200
    
    for i in range(testNumber):
        predicted = KNN.classify(
            training_x[:-testNumber], 
            training_y[:-testNumber], 
            training_x[testNumber+i], 
            k=3)
        
        y = training_y[testNumber+i]
        
        if predicted != y:
            tp += 1        

    return float(tp) / testNumber

print "error rate: ", test(X, Y)


error rate:  0.115

In [623]:
def readDigits(path, sampleSize=10):
    # (number of digit * sample size, feature size=width*height)
    X = np.ndarray((10 * sampleSize, 32 * 32)) 
    Y = np.ndarray((10 * sampleSize, 1))
    for i in range(10):
        for s in range(sampleSize):
            with open("{}{}_{}.txt".format(path, i, s)) as f:
                for c, line in enumerate(f):
                    colPos = c * 32
                    X[i+s, colPos:colPos + 32] = list(line.strip())
        Y[i*sampleSize: (i+1) * sampleSize] = i

    return X, Y

X, Y = readDigits("data/trainingDigits/", 2)

In [675]:
def testDigits():
    X, Y = readDigits("data/trainingDigits/", 150)
    test_X, test_Y = readDigits("data/testDigits/", 3)
    tp = 0
    datasize = X.shape
    
    for test_x, test_y in zip(test_X, test_Y):
        predicted = KNN.classify(X, Y, test_x, k=5)
        #print test_y[0], predicted
        if predicted != test_y:
            tp += 1

    return float(tp) / len(test_X)

print "error rate: ", testDigits()


error rate:  0.8

In [686]:
def testDigits():
    X, Y = readDigits("data/trainingDigits/", 40)
    test_X, test_Y = readDigits("data/testDigits/", 2)
    from sklearn.neighbors import NearestNeighbors
    nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(X, Y)
    print test_Y
    print nbrs.kneighbors(test_X)
print "error rate: ", testDigits()


error rate:  [[ 0.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 2.]
 [ 2.]
 [ 3.]
 [ 3.]
 [ 4.]
 [ 4.]
 [ 5.]
 [ 5.]
 [ 6.]
 [ 6.]
 [ 7.]
 [ 7.]
 [ 8.]
 [ 8.]
 [ 9.]
 [ 9.]]
(array([[ 12.40967365,  12.52996409],
       [ 14.10673598,  14.28285686],
       [ 14.10673598,  14.56021978],
       [ 12.08304597,  12.16552506],
       [ 10.77032961,  10.90871211],
       [ 11.83215957,  12.68857754],
       [ 13.49073756,  14.86606875],
       [ 13.03840481,  13.15294644],
       [ 11.61895004,  12.64911064],
       [  9.64365076,  10.04987562],
       [ 11.04536102,  11.18033989],
       [  9.89949494,  11.18033989],
       [  0.        ,   0.        ],
       [  0.        ,   0.        ],
       [  0.        ,   0.        ],
       [  0.        ,   0.        ],
       [  0.        ,   0.        ],
       [  0.        ,   0.        ],
       [  0.        ,   0.        ],
       [  0.        ,   0.        ]]), array([[  0,   6],
       [  1,  45],
       [  2,  43],
       [ 27,   3],
       [ 31,  11],
       [  5,  28],
       [  6,   8],
       [ 22,  21],
       [  8,   6],
       [ 13,  15],
       [ 16,  11],
       [ 16,  34],
       [198,  97],
       [198,  97],
       [198,  97],
       [198,  97],
       [198,  97],
       [198,  97],
       [198,  97],
       [198,  97]]))
None

In [704]:
a = np.array([
        [53, 1],
        [2, 17],
        [4, 12],
        [2, 9]
    ])

U, S, VT = np.linalg.svd(a,full_matrices=True)
print U
print S
print VT


[[-0.99308301  0.11026755 -0.03899004 -0.01034343]
 [-0.05833671 -0.74906615 -0.52732983 -0.39676186]
 [-0.08961563 -0.52119915  0.84024944 -0.11958829]
 [-0.04849752 -0.39381773 -0.1199329   0.91003939]]
[ 53.32030733  22.47097741]
[[-0.99784741 -0.06557862]
 [ 0.06557862 -0.99784741]]

In [733]:
a = np.array([
        [12, 12, 2, 1],
        [17, 17, 4, 9],
        [4, 12, 1, 5],
        [2, 9, 1, 1]
    ])

U, S, VT = np.linalg.svd(a,full_matrices=True)
print U
print S
print VT


[[-0.48552086  0.30425039 -0.7530737  -0.32339017]
 [-0.75301257  0.32710408  0.49684413  0.28128438]
 [-0.37295365 -0.68251946  0.21788421 -0.58957547]
 [-0.2411372  -0.57844775 -0.37222626  0.68461571]]
[ 34.2353521    6.84438172   4.66426921   0.58283658]
[[-0.60176255 -0.73821769 -0.13428175 -0.27365103]
 [ 0.77798126 -0.61137578  0.09583771 -0.10853543]
 [-0.09936173 -0.28428032  0.07008324  0.95099963]
 [-0.15084505 -0.02095235  0.9838048  -0.09452452]]

In [737]:
k = 4
U[:,:k] * np.mat(np.diag(S)[:k, :k]) * VT[:k,:]


Out[737]:
matrix([[ 12.,  12.,   2.,   1.],
        [ 17.,  17.,   4.,   9.],
        [  4.,  12.,   1.,   5.],
        [  2.,   9.,   1.,   1.]])

In [738]:
U[:,:k]


Out[738]:
array([[-0.48552086,  0.30425039, -0.7530737 , -0.32339017],
       [-0.75301257,  0.32710408,  0.49684413,  0.28128438],
       [-0.37295365, -0.68251946,  0.21788421, -0.58957547],
       [-0.2411372 , -0.57844775, -0.37222626,  0.68461571]])

In [739]:
VT[:k,:]


Out[739]:
array([[-0.60176255, -0.73821769, -0.13428175, -0.27365103],
       [ 0.77798126, -0.61137578,  0.09583771, -0.10853543],
       [-0.09936173, -0.28428032,  0.07008324,  0.95099963],
       [-0.15084505, -0.02095235,  0.9838048 , -0.09452452]])

In [729]:
np.diag(S)[:k, :k]


Out[729]:
array([[ 32.63308966,   0.        ],
       [  0.        ,   6.78833256]])

In [746]:
U[:,:1] * a[:, :1]


Out[746]:
array([[ -5.82625026],
       [-12.80121364],
       [ -1.49181458],
       [ -0.48227441]])

In [ ]: