In [23]:
# -*- coding: utf-8 -*-
import operator
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [16]:
# Toy training set: four 2-D points, two per class.
df = pd.DataFrame(
    {
        'x1': [1.0, 1.0, 0, 0],
        'x2': [1.1, 1.0, 0, 0.1],
        'Labels': ['A', 'A', 'B', 'B'],
    }
)
In [17]:
# Display the toy DataFrame built above as the cell output.
df
Out[17]:
In [18]:
# Feature matrix: the x1 and x2 columns as a 2-D array, one row per sample.
group = df[['x1', 'x2']].values
In [21]:
# Class labels as an array, in the same row order as the feature matrix.
labels = df['Labels'].values
In [39]:
def classify_kNN(inX, dataSet, labels, k):
    '''
    Classify one sample by majority vote among its k nearest neighbours.

    Parameters:
        inX: the sample to classify (array-like, broadcastable against the
            rows of dataSet, e.g. [0, 0] for two features)
        dataSet: training samples, one per row (array-like, 2-D)
        labels: the labels corresponding to the rows of dataSet
        k: number of nearest neighbours that vote (int)
    Return:
        sortedClassCount[0][0]: the label with the highest vote count
        among the k rows of dataSet closest to inX
    '''
    # Coerce both operands so plain Python lists are accepted as well as arrays.
    diffMat = np.asarray(inX) - np.asarray(dataSet)
    # Euclidean distance from inX to every training row.
    sqDiffMat = diffMat ** 2
    sqDistance = sqDiffMat.sum(axis=1)
    distances = sqDistance ** 0.5
    # Row indices ordered from nearest to farthest.
    sortedDistIndices = distances.argsort()
    # Tally the labels of the k nearest rows.
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndices[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # dict.items() works on Python 2 and 3; iteritems() exists only on Python 2.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
In [40]:
# Classify the point (0, 0) by majority vote among its 3 nearest neighbours.
classify_kNN([0, 0], group, labels, 3)
Out[40]:
In [41]:
# Load the tab-separated dating data; column names are supplied explicitly.
# Note: this rebinds `df`, replacing the toy frame used earlier.
df = pd.read_csv(
    'datingTestSet.txt',
    sep='\t',
    names=['x1', 'x2', 'x3', 'Labels'],
)
In [43]:
# Preview the first rows of the loaded frame.
df.head()
Out[43]:
In [44]:
# Feature matrix: the x1, x2 and x3 columns as a 2-D array, one row per sample.
datingDataMat = df[['x1', 'x2', 'x3']].values
In [47]:
# Class labels as an array (rebinds `labels`, replacing the toy-example labels).
labels = df['Labels'].values
In [61]:
# Scatter the second feature column (x2) against the third (x3).
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2])
# Label the axes with the column names so the figure can be read on its own.
ax.set_xlabel('x2')
ax.set_ylabel('x3')
# Trailing semicolon keeps the notebook from echoing the Text repr.
ax.set_title('datingTestSet: x3 vs. x2');
Out[61]:
In [80]:
def autoNorm(dataSet):
    '''
    Min-max scale every column of dataSet.

    Parameters:
        dataSet: samples in rows, features in columns (array-like, 2-D)
    Return:
        normDataSet: (dataSet - minVals) / ranges, computed column-wise;
            a constant column (range 0) maps to all zeros instead of NaN
        ranges: per-column max minus min (returned as computed, may contain zeros)
        minVals: per-column minimum
    '''
    dataSet = np.asarray(dataSet)
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # A constant column has range 0; divide by 1 there so the result is 0, not NaN.
    safeRanges = np.where(ranges == 0, 1, ranges)
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
In [81]:
# Scale the feature matrix; the per-column ranges and minimums are kept alongside it.
normMat, ranges, minVals = autoNorm(datingDataMat)
In [ ]: