使用Kernel function 分类

处理线性不可分的数据
radial bias function
低维向高维的映射, 使得本来线性不可分的数据, 在高维空间变成线性可分
kernel trick or kernel substation
Gaussian kernel funcion
- $K(x,z) = exp(-||x-z||^2/2\sigma^2) $



In [127]:

    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline



In [166]:

    
def kernelTrans(X, A, kTup):
    '''
    Parameters: 
        X: np.array
        A: a row of np.array
        kTup: a tuple like ('rbf', 0.4)
    return:
        K: np.array(a column)
    '''
    m, n = X.shape
    K = np.zeros((m, ))
    if kTup[0] == 'lin':
        K = X*A.T
    elif kTup[0] == 'rbf':
        for j in range(m):
            deltaRow = X[j] - A
            K[j] = np.dot(deltaRow, deltaRow.T)
        K = np.exp(K/(-1*kTup[1]**2))
    else:
        raise NameError('Houston We hace a problem -- That kernel is not recognized')
    return K



In [129]:

    
def clipAlpha(alpha, High, Low):
    if alpha > High:
        alpha = High
    elif Low > alpha:
        alpha = Low
    return alpha



In [130]:

    
class optStruct:
    def __init__(self, data, labels, C, toler, kTup):
        self.X = data
        self.labels = labels
        self.C = C
        self.tol = toler
        self.m = data.shape[0]
        self.alphas = np.zeros((self.m, ))
        self.b = 0
        self.errCache = np.zeros((self.m, 2)) # store error cache
        self.K = np.zeros((self.m, self.m)) # store the kernel function's values
        for i in range(self.m):
            self.K[:, i] = kernelTrans(self.X, self.X[i], kTup)



In [131]:

    
def calculateEk(oS, k):
    fXk = np.dot(oS.alphas*oS.labels, oS.K[:, k])+ oS.b
    Ek = fXk - float(oS.labels[k])
    return Ek



In [132]:

    
def select_random(index, m):
    j = index
    while j == index:
        j = int(np.random.uniform(0, m))
    return j



In [133]:

    
def updateEk(oS, k):
    Ek = calculateEk(oS, k)
    oS.errCache[k] = [1, Ek]



In [134]:

    
def selectJ(i, oS, Ei):
    maxK = -1
    maxDeltaE = 0
    Ej = 0
    oS.errCache[i] = [1, Ei]
    validEcacheList = np.nonzero(oS.errCache[:, 0])[0]
    if (len(validEcacheList)) > 1:
        for k in validEcacheList:
            if k == i:
                continue
            Ek = calculateEk(oS, k)
            deltaE = abs(Ei - Ek)
            # pick up the max one
            if (deltaE > maxDeltaE):
                maxK = k
                maxDeltaE = deltaE
                Ej = Ek
        return maxK, Ej
    else:
        # This is for the first time to choose an alpha
        j = select_random(i, oS.m)
        Ej = calculateEk(oS, j)
    return j, Ej



In [135]:

    
def innerLoop(i, oS):
    '''
    return: 
        0: no alpha Pairs are Changed
        1: Changed
    '''
    Ei = calculateEk(oS, i)
    if ((oS.labels[i]*Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or\
       ((oS.labels[i]*Ei > -oS.tol) and (oS.alphas[i] > 0)):
        j, Ej = selectJ(i, oS, Ei) # different from simpleSMO
        alphaIold = oS.alphas[i].copy()
        alphaJold = oS.alphas[j].copy()
        if (oS.labels[i] != oS.labels[j]):
            Low = max(0, oS.alphas[j] - oS.alphas[i])
            High = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i])
        else:
            Low = max(0, oS.alphas[j] + oS.alphas[i] - oS.C)
            High = min(oS.C, oS.alphas[j] + oS.alphas[i])
        if Low == High:
            print "L==H"
            return 0
        eta = 2.0 * oS.K[i, j] - oS.K[i, i] - oS.K[j, j]
        if eta >= 0:
            print"eta>=0"
            return 0
        oS.alphas[j] -= oS.labels[j]*(Ei - Ej)/eta
        oS.alphas[j] = clipAlpha(oS.alphas[j], High, Low)
        updateEk(oS, j)
        if (abs(oS.alphas[j] - alphaJold) < 0.00001):
            print "j not moving enough"
            return 0
        oS.alphas[i] += oS.labels[j]*oS.labels[i]*(alphaJold - oS.alphas[j])
        updateEk(oS, i)
        b1 = oS.b - Ei - oS.labels[i]*(oS.alphas[i] - alphaIold)*oS.K[i, i] -\
             oS.labels[j]*(oS.alphas[j] - alphaJold)*oS.K[i, j]
        b2 = oS.b - Ej - oS.labels[i]*(oS.alphas[i] - alphaIold)*oS.K[i, j] -\
             oS.labels[j]*(oS.alphas[j] - alphaJold)*oS.K[j, j]
        if (0 < oS.alphas[i]) and (oS.C > oS.alphas[j]):
            oS.b = b1
        elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]):
            oS.b = b2
        else:
            oS.b = (b1 + b2)/2.0
        return 1
    else:
        return 0



In [167]:

    
def SMO(data, labels, C, toler, maxIter, kTup):
    '''
    returns:
        b, alphas
    '''
    oS = optStruct(data, labels, C, toler, kTup)  # kernel matrix calculated here
    iteration = 0
    entireSet = True
    alphaPairsChanged = 0
    while (iteration < maxIter) and ((alphaPairsChanged > 0) or (entireSet)):
        alphaPairsChanged = 0
        if entireSet:
            for i in range(oS.m):
                alphaPairsChanged += innerLoop(i, oS)
            print "fullset, iter: %d, i: %d, pairs changed: %d" %(iteration, i , alphaPairsChanged)
            iteration += 1
        else:
            nonBoundIs = np.nonzero((oS.alphas > 0) * (oS.alphas < C))[0]
            for i in nonBoundIs:
                alphaPairsChanged += innerLoop(i ,oS)
                print "non-bound, iter: %d, i: %d, pairs changed: %d" %(iteration, i , alphaPairsChanged)
            iteration += 1
        if entireSet:
            entireSet = False
        elif (alphaPairsChanged == 0):
            entireSet = True
        print "Iteraion number: %d" %iteration
    return oS.b, oS.alphas



In [157]:

    
def testRBF(k1=1.3):
    # read data using pandas
    df = pd.read_csv('testSetRBF.txt', delimiter='\t', names=['x1', 'x2', 'y'])
    data = df.loc[:, ['x1', 'x2']].values
    labels = df.loc[:, 'y'].values
    
    # using SMO with racial bias function to obtain b and alphas
    b, alphas = SMO(data, labels, 200, 0.0001, 10000, ('rbf', k1))
    
    # find support vectors
    svInd = np.nonzero(alphas > 0)[0]
    sVs = data[svInd]
    labelSV = labels[svInd]
    print "there are %d Support Vectors" % sVs.shape[0]
    
    # calculate the precision of training set
    m, n = data.shape
    errorCount = 0
    for i in range(m):
        kernelEval = kernelTrans(sVs, data[i], ('rbf', k1))
        predict = np.dot(kernelEval.T, labelSV*alphas[svInd]) + b
        if np.sign(predict) != np.sign(labels[i]):
            errorCount += 1
    print "the training error rate is: %f" %(float(errorCount)/m)
    
    # read the test data and calculate the precision of test data
    df = pd.read_csv('testSetRBF2.txt', delimiter='\t', names=['x1', 'x2', 'y'])
    data = df.loc[:, ['x1', 'x2']].values
    labels = df.loc[:, 'y'].values
    errorCount = 0
    for i in range(m):
        kernelEval = kernelTrans(sVs, data[i], ('rbf', k1))
        predict = np.dot(kernelEval, labelSV*alphas[svInd]) + b
        if np.sign(predict) != np.sign(labels[i]):
            errorCount += 1
    print "the test error rate is: %f" % (float(errorCount)/m)



In [158]:

    
K = np.zeros((100,100))



In [159]:

    
df = pd.read_csv('testSetRBF.txt', delimiter='\t', names=['x1', 'x2', 'y'])
data = df.loc[:, ['x1', 'x2']].values
labels = df.loc[:, 'y'].values



In [165]:

    
testRBF(k1=0.4)









    



L==H
L==H
L==H
L==H
L==H
L==H
L==H
L==H
L==H
L==H
L==H
L==H
j not moving enough
j not moving enough
L==H
j not moving enough
j not moving enough
j not moving enough
j not moving enough
L==H
j not moving enough
j not moving enough
j not moving enough
j not moving enough
j not moving enough
fullset, iter: 0, i: 99, pairs changed: 21
Iteraion number: 1
j not moving enough
non-bound, iter: 1, i: 0, pairs changed: 0
j not moving enough
non-bound, iter: 1, i: 2, pairs changed: 0
j not moving enough
non-bound, iter: 1, i: 3, pairs changed: 0
j not moving enough
non-bound, iter: 1, i: 4, pairs changed: 0
j not moving enough
non-bound, iter: 1, i: 6, pairs changed: 0
j not moving enough
non-bound, iter: 1, i: 9, pairs changed: 0
j not moving enough
non-bound, iter: 1, i: 10, pairs changed: 0
j not moving enough
non-bound, iter: 1, i: 11, pairs changed: 0
j not moving enough
non-bound, iter: 1, i: 14, pairs changed: 0
j not moving enough
non-bound, iter: 1, i: 15, pairs changed: 0
j not moving enough
non-bound, iter: 1, i: 17, pairs changed: 0
j not moving enough
non-bound, iter: 1, i: 18, pairs changed: 0
j not moving enough
non-bound, iter: 1, i: 21, pairs changed: 0
j not moving enough
non-bound, iter: 1, i: 23, pairs changed: 0
j not moving enough
non-bound, iter: 1, i: 24, pairs changed: 0
j not moving enough
non-bound, iter: 1, i: 27, pairs changed: 0
j not moving enough
non-bound, iter: 1, i: 34, pairs changed: 0
j not moving enough
non-bound, iter: 1, i: 45, pairs changed: 0
j not moving enough
non-bound, iter: 1, i: 48, pairs changed: 0
j not moving enough
non-bound, iter: 1, i: 53, pairs changed: 0
j not moving enough
non-bound, iter: 1, i: 56, pairs changed: 0
j not moving enough
non-bound, iter: 1, i: 58, pairs changed: 0
j not moving enough
non-bound, iter: 1, i: 74, pairs changed: 0
j not moving enough
non-bound, iter: 1, i: 76, pairs changed: 0
j not moving enough
non-bound, iter: 1, i: 85, pairs changed: 0
j not moving enough
non-bound, iter: 1, i: 87, pairs changed: 0
Iteraion number: 2
j not moving enough
j not moving enough
j not moving enough
j not moving enough
j not moving enough
j not moving enough
j not moving enough
j not moving enough
j not moving enough
j not moving enough
j not moving enough
j not moving enough
j not moving enough
j not moving enough
j not moving enough
j not moving enough
j not moving enough
j not moving enough
j not moving enough
j not moving enough
L==H
L==H
j not moving enough
j not moving enough
j not moving enough
j not moving enough
j not moving enough
j not moving enough
L==H
j not moving enough
j not moving enough
j not moving enough
L==H
j not moving enough
j not moving enough
j not moving enough
j not moving enough
j not moving enough
j not moving enough
j not moving enough
fullset, iter: 2, i: 99, pairs changed: 0
Iteraion number: 3
there are 26 Support Vectors
the training error rate is: 0.010000
the test error rate is: 0.050000

Summary

debug花了很长时间, 最后才找到是计算的时候符号打错了...忧伤....
Kernel method与线性方法相比主要就是在SMO中, 要在内环调用kernel function的值替换原始数据值.
- 这个值在使用数据生成class时已经计算好.



In [ ]: