In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.optimize
%matplotlib inline

In [2]:
# Make an artificial dataset: four Gaussian clusters along the diagonal
# with alternating labels, so the classes are not linearly separable.
Npos = Nneg = 50
N = 2 * (Npos + Nneg)  # Total number of samples (two clusters per class).

X = np.vstack([
    np.random.randn(Npos, 2) - 3.0,
    np.random.randn(Nneg, 2) - 1.0,
    np.random.randn(Npos, 2) + 1.0,
    np.random.randn(Nneg, 2) + 3.0,
])
y = np.hstack([+np.ones(Npos), -np.ones(Nneg), +np.ones(Npos), -np.ones(Nneg)])

In [3]:
# Visualize
fig, ax = plt.subplots()
ax.scatter(X[y==+1, 0], X[y==+1, 1], color='red', marker='.')
ax.scatter(X[y==-1, 0], X[y==-1, 1], color='blue', marker='.')
ax.axis('equal')
ax.grid()

(Output: scatter plot of the four Gaussian clusters, red for y = +1 and blue for y = -1.)
In [4]:
def SolveKernelSVM(K, y, C):
    '''Solve the soft-margin kernel SVM dual problem and return the multipliers alpha.'''
    y1 = y.reshape(-1, 1)
    Y = np.dot(y1, y1.T)
    G = K * Y  # G[i, j] = y_i * y_j * K(x_i, x_j)
    alp0 = np.zeros(y.size)  # Initial guess.

    # Objective function (the negated dual, so that we minimize).
    func = lambda alpha: 0.5 * alpha.dot(G).dot(alpha) - alpha.sum()
    func_deriv = lambda alpha: alpha.dot(G) - np.ones_like(alpha)
    # Constraints: sum_i alpha_i y_i = 0 and 0 <= alpha_i <= C.
    cons = [
        {'type': 'eq',   'fun': lambda alpha: alpha.dot(y), 'jac': lambda alpha: y},
        {'type': 'ineq', 'fun': lambda alpha: alpha,        'jac': lambda alpha: np.eye(alpha.size)},
        {'type': 'ineq', 'fun': lambda alpha: C - alpha,    'jac': lambda alpha: -np.eye(alpha.size)},
    ]
    # Solve using the SLSQP optimization algorithm.
    res = scipy.optimize.minimize(func, alp0, method='SLSQP', constraints=cons,
                                  jac=func_deriv, options={'disp': True, 'maxiter': int(1e4)})
    alp = res.x
    return alp
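
For reference, this cell solves the standard soft-margin SVM dual (stated here for clarity; the code above is a direct transcription):

$$ \min_{\alpha} \; \frac{1}{2} \sum_{i,j} \alpha_i \alpha_j y_i y_j K(x_i, x_j) - \sum_i \alpha_i \quad \text{s.t.} \quad \sum_i \alpha_i y_i = 0, \quad 0 \le \alpha_i \le C. $$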

In [5]:
def DistanceMatrix(X, Y):
    '''Compute the squared Euclidean distance matrix between data matrices X and Y,
    where each row vector represents a sample, via ||x - y||^2 = ||x||^2 + ||y||^2 - 2 x.y.'''
    assert X.shape[1] == Y.shape[1]  # The datasets must have the same dimension.
    nx, ny = [(M**2).sum(axis=1).reshape(-1, 1) for M in [X, Y]]
    In, Im = [np.ones(M.shape[0]).reshape(-1, 1) for M in [X, Y]]
    D = np.dot(nx, Im.T) + np.dot(In, ny.T) - 2 * np.dot(X, Y.T)
    return D
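
A quick sanity check, as an optional sketch not in the original notebook: the result should match SciPy's cdist with the 'sqeuclidean' metric.

from scipy.spatial.distance import cdist
A, B = np.random.randn(5, 2), np.random.randn(3, 2)
assert np.allclose(DistanceMatrix(A, B), cdist(A, B, 'sqeuclidean'))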

In [6]:
def KernelMatrixLinear(X, Y):
    '''Linear kernel: K(x, y) = x . y.'''
    K = np.dot(X, Y.T)
    return K

In [7]:
def KernelMatrixPolynomial(X, Y, p):
    '''Polynomial kernel: K(x, y) = (x . y + 1)^p.'''
    K = np.power(np.dot(X, Y.T) + 1, p)
    return K

In [8]:
def KernelMatrixRBF(X, Y, gamma):
    '''RBF (Gaussian) kernel: K(x, y) = exp(-gamma * ||x - y||^2).'''
    D = DistanceMatrix(X, Y)
    K = np.exp(-gamma * D)
    return K

In [9]:
def KernelMatrix(X, Y):
    '''Default kernel (RBF); shadowed by a per-gamma lambda in the experiment below.'''
    gamma = 0.1
    return KernelMatrixRBF(X, Y, gamma)
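
A brief sanity check on the RBF kernel, as a sketch one can run at this point: the diagonal of K(X, X) should be exp(0) = 1 and the matrix should be symmetric.

Ktest = KernelMatrixRBF(X, X, 0.1)
assert np.allclose(np.diag(Ktest), 1.0)  # K(x, x) = exp(0) = 1
assert np.allclose(Ktest, Ktest.T)       # Symmetry for identical inputs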

In [10]:
def PlotHeatMap(ax, KernelMatrix, alp, y, b):
    '''Plot the decision function as a heat map with contours at -1, 0, +1.
    Note: reads the training data X from the enclosing scope.'''
    smin, smax = -5, 5  # Range of the plotted region.
    nMesh = 101  # Number of grid points per axis.
    Xm, Ym = np.meshgrid(*[np.linspace(smin, smax, nMesh) for _ in range(2)])
    XX = np.zeros((2, nMesh, nMesh))
    XX[0], XX[1] = Xm, Ym
    X2 = XX.reshape((2, -1)).T  # Grid points as feature vectors.
    K2 = KernelMatrix(X, X2)  # Kernel matrix between training samples and grid points.
    Y2 = np.dot(alp * y, K2) + b  # Decision function values on the grid.
    Zm = Y2.reshape((nMesh, nMesh))  # Reshape into the mesh-grid layout.

    ax.pcolor(Xm, Ym, Zm, cmap='jet')
    cont = ax.contour(Xm, Ym, Zm, levels=[-1.0, 0.0, 1.0], colors='k')
    cont.clabel(fmt='%1.1f', fontsize=14, inline=True)

    ax.scatter(X[y==+1, 0], X[y==+1, 1], color='red', marker='.')
    ax.scatter(X[y==-1, 0], X[y==-1, 1], color='blue', marker='.')

    ax.grid()
    ax.set_xlim(smin, smax)
    ax.set_ylim(smin, smax)
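
The quantity plotted is the kernel decision function (the standard form, stated here for clarity); its zero level set is the decision boundary, and the $\pm 1$ level sets mark the margins:

$$ f(x) = \sum_i \alpha_i y_i K(x_i, x) + b. $$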

In [11]:
%%time
Cs = [1e-1, 1e0, 1e1]
gammas = [1e-1, 1e0, 1e1]

nCols = len(Cs)
nRows = len(gammas)
figSize = 5


(fig, axs) = plt.subplots(nRows, nCols, figsize=(figSize*nCols, figSize*nRows) )
for i in range(nRows):
    for j in range(nCols):
        ax = axs[i, j]
        gamma = gammas[i]
        C = Cs[j]
        
        # Define the kernel function, binding the current gamma
        # (this shadows the default KernelMatrix defined above).
        KernelMatrix = lambda X, Y, gamma=gamma: KernelMatrixRBF(X, Y, gamma)

        # Train the SVM and compute its parameters.
        K = KernelMatrix(X, X)
        alp = SolveKernelSVM(K, y, C)
        # Estimate the bias b from the support vector whose alpha is closest to C/2
        # (so that 0 < alpha_k < C), using the KKT condition y_k * f(x_k) = 1.
        k = np.argmin(np.abs(C/2 - alp))
        b = y[k] - (y*alp).dot(K[k])

        # Plot
        ax.set_title(r"$\gamma$ = %.1f, C = %.1f" % (gamma, C))
        PlotHeatMap(ax, KernelMatrix, alp, y, b)


Optimization terminated successfully.    (Exit mode 0)
            Current function value: -13.5704411803
            Iterations: 12
            Function evaluations: 12
            Gradient evaluations: 12
Optimization terminated successfully.    (Exit mode 0)
            Current function value: -79.105852985
            Iterations: 33
            Function evaluations: 36
            Gradient evaluations: 33
Optimization terminated successfully.    (Exit mode 0)
            Current function value: -619.782644514
            Iterations: 78
            Function evaluations: 83
            Gradient evaluations: 78
Optimization terminated successfully.    (Exit mode 0)
            Current function value: -13.0920956571
            Iterations: 16
            Function evaluations: 17
            Gradient evaluations: 16
Optimization terminated successfully.    (Exit mode 0)
            Current function value: -64.04808282
            Iterations: 52
            Function evaluations: 58
            Gradient evaluations: 52
Optimization terminated successfully.    (Exit mode 0)
            Current function value: -450.919166313
            Iterations: 95
            Function evaluations: 106
            Gradient evaluations: 95
Optimization terminated successfully.    (Exit mode 0)
            Current function value: -18.0508117102
            Iterations: 2
            Function evaluations: 2
            Gradient evaluations: 2
Optimization terminated successfully.    (Exit mode 0)
            Current function value: -78.691268361
            Iterations: 38
            Function evaluations: 42
            Gradient evaluations: 38
Optimization terminated successfully.    (Exit mode 0)
            Current function value: -216.411883592
            Iterations: 42
            Function evaluations: 43
            Gradient evaluations: 42
CPU times: user 33.5 s, sys: 780 ms, total: 34.3 s
Wall time: 18.7 s

Memo

We can observe the following characteristics in the results above:

  • As C increases, the decision boundary becomes more complex, since misclassification is penalized more heavily and the margin is allowed to shrink.
  • Increasing $\gamma$ narrows the RBF kernel, so the boundary fits individual samples more tightly and generalization gets worse.
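
As an optional cross-check (a sketch, not part of the original notebook; assumes scikit-learn is installed), the same decision boundaries can be reproduced with sklearn.svm.SVC, which solves the same dual problem with a dedicated solver:

from sklearn.svm import SVC

clf = SVC(kernel='rbf', C=1.0, gamma=0.1)  # One (C, gamma) pair from the grid above.
clf.fit(X, y)
print(clf.score(X, y))  # Training accuracy.
# clf.decision_function(grid_points) corresponds to np.dot(alp*y, K2) + b above.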