# In [173]:  (IPython notebook cell marker from the original export)
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 11 20:24:22 2017
@author: zhonghao
"""
import numpy as np
import operator
def createDataSet():
    """Return a tiny toy data set: four 2-D points with labels A/A/B/B.

    Returns
    -------
    group : ndarray, shape (4, 2) -- feature vectors.
    labels : list of str -- class label of each row of ``group``.
    """
    # BUG FIX: bare array() was undefined -- this file imports numpy as np,
    # not "from numpy import *", so the qualified name is required.
    group = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by a k-nearest-neighbour majority vote.

    Parameters
    ----------
    inX : array-like, shape (n_features,) -- the point to classify.
    dataSet : ndarray, shape (n_samples, n_features) -- training points.
    labels : sequence, len n_samples -- label of each training row.
    k : int -- number of nearest neighbours that vote.

    Returns the label occurring most often among the k nearest rows
    (Euclidean distance).
    """
    dataSetSize = dataSet.shape[0]  # number of training rows
    # Euclidean distance from inX to every training row:
    # tile() repeats inX dataSetSize times so the subtraction is element-wise.
    diffMat = np.tile(inX, (dataSetSize, 1)) - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)  # axis=1: sum across features
    distances = sqDistances ** 0.5
    sortedDistIndicies = distances.argsort()  # indices, nearest first
    # Count how often each label appears among the k nearest neighbours.
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # BUG FIX: dict.iteritems() no longer exists in Python 3 -- use items().
    # Sort (label, count) pairs by count, most frequent first.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]  # label of the winning class
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and labels.

    Each line must hold three numeric feature columns followed by an
    integer class label in the last column.

    Parameters
    ----------
    filename : str -- path to the tab-separated file.

    Returns
    -------
    returnMat : ndarray, shape (n_lines, 3) -- the three feature columns.
    classLabelVector : list of int -- the label from the end of each line.
    """
    with open(filename, 'r') as fr:
        # BUG FIX: the original called fr.readlines() twice; the first call
        # exhausted the file, so the parsing loop below never ran and the
        # function returned all-zero features and an empty label list.
        lines = fr.readlines()
    returnMat = np.zeros((len(lines), 3))
    classLabelVector = []
    for index, line in enumerate(lines):
        listFromLine = line.strip().split('\t')
        returnMat[index, :] = listFromLine[0:3]  # first three fields -> features
        classLabelVector.append(int(listFromLine[-1]))  # last field -> label
    return returnMat, classLabelVector
def autoNorm(dataSet):
    """Scale every column of ``dataSet`` linearly into [0, 1].

    Parameters
    ----------
    dataSet : ndarray, shape (m, n) -- raw feature matrix.

    Returns
    -------
    normDataSet : ndarray, shape (m, n) -- (x - min) / (max - min) per column.
    ranges : ndarray, shape (n,) -- per-column max - min (needed to
        normalise new samples the same way).
    minVals : ndarray, shape (n,) -- per-column minima.
    """
    minVals = dataSet.min(0)  # axis 0 -> column-wise minima
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    m = dataSet.shape[0]
    # BUG FIX: zeros()/tile() were bare names -- qualify with np.
    # (The original also pre-allocated normDataSet and immediately
    # overwrote it; that dead line is dropped.)
    normDataSet = dataSet - np.tile(minVals, (m, 1))  # remove column bias
    normDataSet = normDataSet / np.tile(ranges, (m, 1))  # scale to [0, 1]
    return normDataSet, ranges, minVals
## Below Un-learnt
def datingClassTest():
    """Hold-out evaluation of classify0 on the dating data set.

    Reads 'datingTestSet2.txt' from the working directory, normalises it,
    uses the first ``hoRatio`` fraction of rows as test queries and the
    remainder as training data, and prints each prediction plus the final
    error rate.  Returns nothing.
    """
    hoRatio = 0.50  # fraction of rows held out for testing
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        # Train on the tail rows, query with row i from the held-out head.
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], 3)
        # BUG FIX: Python 2 print statements are syntax errors in Python 3.
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
    print(errorCount)
def classifyPerson():
    """Interactively classify one person from three typed-in features.

    Prompts on stdin for three numeric features, trains on
    'datingTestSet2.txt', and prints the predicted category name.
    Labels in the file are 1-based, hence the ``- 1`` index shift.
    """
    resultList = ['no!!', 'little', 'rave']
    # BUG FIX: raw_input() was renamed input() in Python 3.
    percentTats = float(input('time of Games=?'))
    ffMile = float(input('freq Flye Miles / yeear=?'))
    iceCream = float(input('ice Cream consumed / year=?'))
    datingDatMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDatMat)
    # BUG FIX: bare array() -> np.array(); note the feature order must
    # match the file's columns (miles, game time, ice cream).
    inArr = np.array([ffMile, percentTats, iceCream])
    classifierResult = classify0((inArr - minVals) / ranges, normMat,
                                 datingLabels, 3)
    print(resultList[classifierResult - 1])
# --- Notebook demo cells (In [180]-[185]) converted to a runnable guard ---
# The exported In[]/Out[] markers were not valid Python; the statements they
# wrapped are preserved here, printed explicitly, and kept out of import-time
# execution via the __main__ guard.
if __name__ == '__main__':
    dataSet = np.array([[1.0, 10, 100], [1.0, 12, 110],
                        [0, 9, 120], [0, 15, 140]])
    print(dataSet.min(1))     # row-wise minima (was In [180])
    print(autoNorm(dataSet))  # normalised data, ranges, mins (was In [185])
    print(np.shape(dataSet))  # bare shape() -> np.shape (was In [181])
    print(dataSet.shape)      # equivalent attribute form (was In [183])