# In [173]:  (IPython notebook cell marker from the original export)
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 11 20:24:22 2017
@author: zhonghao
"""
import numpy as np
import operator
def createDataSet():
    """Return a tiny toy data set: four 2-D points with labels A/A/B/B.

    Returns
    -------
    group : ndarray, shape (4, 2) -- feature vectors.
    labels : list of str -- class label of each row of ``group``.
    """
    # BUG FIX: bare array() was undefined -- this file imports numpy as np,
    # not "from numpy import *", so the qualified name is required.
    group = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by a k-nearest-neighbour majority vote.

    Parameters
    ----------
    inX : array-like, shape (n_features,) -- the point to classify.
    dataSet : ndarray, shape (n_samples, n_features) -- training points.
    labels : sequence, len n_samples -- label of each training row.
    k : int -- number of nearest neighbours that vote.

    Returns the label occurring most often among the k nearest rows
    (Euclidean distance).
    """
    dataSetSize = dataSet.shape[0]  # number of training rows
    # Euclidean distance from inX to every training row:
    # tile() repeats inX dataSetSize times so the subtraction is element-wise.
    diffMat = np.tile(inX, (dataSetSize, 1)) - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)  # axis=1: sum across features
    distances = sqDistances ** 0.5
    sortedDistIndicies = distances.argsort()  # indices, nearest first
    # Count how often each label appears among the k nearest neighbours.
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # BUG FIX: dict.iteritems() no longer exists in Python 3 -- use items().
    # Sort (label, count) pairs by count, most frequent first.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]  # label of the winning class
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and labels.

    Each line must hold three numeric feature columns followed by an
    integer class label in the last column.

    Parameters
    ----------
    filename : str -- path to the tab-separated file.

    Returns
    -------
    returnMat : ndarray, shape (n_lines, 3) -- the three feature columns.
    classLabelVector : list of int -- the label from the end of each line.
    """
    with open(filename, 'r') as fr:
        # BUG FIX: the original called fr.readlines() twice; the first call
        # exhausted the file, so the parsing loop below never ran and the
        # function returned all-zero features and an empty label list.
        lines = fr.readlines()
    returnMat = np.zeros((len(lines), 3))
    classLabelVector = []
    for index, line in enumerate(lines):
        listFromLine = line.strip().split('\t')
        returnMat[index, :] = listFromLine[0:3]  # first three fields -> features
        classLabelVector.append(int(listFromLine[-1]))  # last field -> label
    return returnMat, classLabelVector
def autoNorm(dataSet):
    """Scale every column of ``dataSet`` linearly into [0, 1].

    Parameters
    ----------
    dataSet : ndarray, shape (m, n) -- raw feature matrix.

    Returns
    -------
    normDataSet : ndarray, shape (m, n) -- (x - min) / (max - min) per column.
    ranges : ndarray, shape (n,) -- per-column max - min (needed to
        normalise new samples the same way).
    minVals : ndarray, shape (n,) -- per-column minima.
    """
    minVals = dataSet.min(0)  # axis 0 -> column-wise minima
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    m = dataSet.shape[0]
    # BUG FIX: zeros()/tile() were bare names -- qualify with np.
    # (The original also pre-allocated normDataSet and immediately
    # overwrote it; that dead line is dropped.)
    normDataSet = dataSet - np.tile(minVals, (m, 1))  # remove column bias
    normDataSet = normDataSet / np.tile(ranges, (m, 1))  # scale to [0, 1]
    return normDataSet, ranges, minVals
## Below Un-learnt
def datingClassTest():
    """Hold-out evaluation of classify0 on the dating data set.

    Reads 'datingTestSet2.txt' from the working directory, normalises it,
    uses the first ``hoRatio`` fraction of rows as test queries and the
    remainder as training data, and prints each prediction plus the final
    error rate.  Returns nothing.
    """
    hoRatio = 0.50  # fraction of rows held out for testing
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        # Train on the tail rows, query with row i from the held-out head.
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], 3)
        # BUG FIX: Python 2 print statements are syntax errors in Python 3.
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
    print(errorCount)
def classifyPerson():
    """Interactively classify one person from three typed-in features.

    Prompts on stdin for three numeric features, trains on
    'datingTestSet2.txt', and prints the predicted category name.
    Labels in the file are 1-based, hence the ``- 1`` index shift.
    """
    resultList = ['no!!', 'little', 'rave']
    # BUG FIX: raw_input() was renamed input() in Python 3.
    percentTats = float(input('time of Games=?'))
    ffMile = float(input('freq Flye Miles / yeear=?'))
    iceCream = float(input('ice Cream consumed / year=?'))
    datingDatMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDatMat)
    # BUG FIX: bare array() -> np.array(); note the feature order must
    # match the file's columns (miles, game time, ice cream).
    inArr = np.array([ffMile, percentTats, iceCream])
    classifierResult = classify0((inArr - minVals) / ranges, normMat,
                                 datingLabels, 3)
    print(resultList[classifierResult - 1])
# --- Notebook demo cells (In [180]-[185]) converted to a runnable guard ---
# The exported In[]/Out[] markers were not valid Python; the statements they
# wrapped are preserved here, printed explicitly, and kept out of import-time
# execution via the __main__ guard.
if __name__ == '__main__':
    dataSet = np.array([[1.0, 10, 100], [1.0, 12, 110],
                        [0, 9, 120], [0, 15, 140]])
    print(dataSet.min(1))     # row-wise minima (was In [180])
    print(autoNorm(dataSet))  # normalised data, ranges, mins (was In [185])
    print(np.shape(dataSet))  # bare shape() -> np.shape (was In [181])
    print(dataSet.shape)      # equivalent attribute form (was In [183])