Setting up the environment (do not use pylab)
$\newcommand{\dotprod}[2]{\langle #1 , #2 \rangle}$
In [ ]:
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import gzip
%matplotlib inline
In [ ]:
def normalise_01(K):
"""Normalise values of kernel matrix to have smallest value 0 and largest value 1."""
smallest = np.min(K)
largest = np.max(K)
return (K - smallest)/(largest-smallest)
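A quick usage example (a minimal sketch on an arbitrary random matrix, purely for illustration): after this rescaling the smallest entry is 0 and the largest is 1.
In [ ]:
# Minimal usage example on an arbitrary random matrix (illustration only):
# after rescaling, the entries span exactly [0, 1].
A_check = np.random.randn(4, 4)
print(np.min(normalise_01(A_check)), np.max(normalise_01(A_check)))  # expect 0.0 and 1.0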
In the following, we use the fact that kernels ($k(\cdot, \cdot)$) are inner products in a feature space with feature mapping $\phi(\cdot)$: $$k(x,y) = \dotprod{\phi(x)}{\phi(y)}$$
Centering makes the mean of the data set the zero vector in feature space. For more details, refer to Chapter 5 of the book *Kernel Methods for Pattern Analysis* by Shawe-Taylor and Cristianini.
Since $\phi(x)$ lives in a vector space, we can compute the mean vector as usual $$ \mu = \frac{1}{n}\sum_{i=1}^n \phi(x_i) $$ and subtract it off from each data point $$ \hat{\phi}(x) = \phi(x) - \mu. $$ Hence the centered kernel can be computed with a bit of algebra, by recalling the definition of $\mu$ and expanding the quadratic: \begin{align*} \hat{k}(x,y) &= \dotprod{\hat{\phi}(x)}{\hat{\phi}(y)}\\ &= \dotprod{\phi(x) - \mu}{\phi(y) - \mu}\\ &= k(x,y) - \frac{1}{n}\sum_{i=1}^n k(x,x_i) - \frac{1}{n}\sum_{i=1}^n k(x_i,y) + \frac{1}{n^2}\sum_{i=1}^n\sum_{j=1}^n k(x_i,x_j). \end{align*}
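Equivalently, in matrix notation, writing $\mathbf{1}$ for the $n \times n$ matrix of all ones, the centered kernel matrix is $$\hat{K} = K - \tfrac{1}{n}\mathbf{1}K - \tfrac{1}{n}K\mathbf{1} + \tfrac{1}{n^2}\mathbf{1}K\mathbf{1},$$ which (for a symmetric kernel matrix) corresponds to what the function below computes from the vector of column means.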
In [ ]:
def center(K):
"""Center the kernel matrix, such that the mean (in feature space) is zero."""
one_mat = np.matrix(np.ones(K.shape))
one_vec = np.matrix(np.ones((K.shape[0],1)))
row_sum = np.matrix(np.mean(K,axis=0)).T
R = K - row_sum * one_vec.T - one_vec * row_sum.T +\
np.mean(row_sum.A)*one_mat
return R
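As a quick sanity check (a minimal sketch on an arbitrary random linear kernel, not the data used later): after centering, every row and column mean of the kernel matrix is numerically zero, i.e. the mean in feature space is the zero vector.
In [ ]:
# Minimal check on an arbitrary random linear kernel (illustration only):
# centering should make all row and column means numerically zero.
X_check = np.random.randn(5, 8)                    # 8 points with 5 features
K_check = center(np.dot(X_check.T, X_check))
print(np.allclose(np.mean(K_check, axis=0), 0))    # column means ~ 0
print(np.allclose(np.mean(K_check, axis=1), 0))    # row means ~ 0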
It is often convenient to have all the examples represented by feature vectors of the same length. This implies that the diagonal of the kernel matrix (the squared length) is the same for all examples. We arbitrarily (without loss of generality) set this length to 1. \begin{align*} \hat{k}(x,y) &= \dotprod{\frac{\phi(x)}{\|\phi(x)\|}}{\frac{\phi(y)}{\|\phi(y)\|}}\\ &= \frac{1}{\|\phi(x)\|\|\phi(y)\|}\dotprod{\phi(x)}{\phi(y)}\\ &= \frac{1}{\|\phi(x)\|\|\phi(y)\|} k(x,y) \end{align*}
Normalising the kernel matrix such that it has ones along the diagonal is sometimes called trace normalisation or spherical normalisation.
In [ ]:
def normalise_unit_diag(K):
    """Normalise the kernel matrix, such that all diagonal entries are 1."""
    # Column vector of diagonal entries; np.asarray(...).ravel() copes with both
    # ndarray and np.matrix inputs (np.diag of a matrix returns a 1 x n matrix).
    Kii = np.matrix(np.asarray(np.diag(K)).ravel()).T
    return np.divide(K, np.sqrt(Kii * Kii.T))
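A quick check (a minimal sketch on an arbitrary random linear kernel): after this normalisation all diagonal entries are 1, and every entry is a cosine similarity in feature space, hence bounded by 1 in absolute value.
In [ ]:
# Minimal check on an arbitrary random linear kernel (illustration only):
# the normalised matrix has ones on its diagonal and entries in [-1, 1].
X_check = np.random.randn(4, 6)
K_check = normalise_unit_diag(np.dot(X_check.T, X_check))
print(np.allclose(np.diag(K_check), 1))
print(np.max(np.abs(K_check)) <= 1 + 1e-12)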
We would like to enforce that the variance of the vectors in feature space is 1: $$\frac{1}{n}\sum_{i=1}^n \|\phi(x_i) - \mu \|^2 = 1,$$ where $\mu$ is the mean as defined in the centering section. In terms of kernels, this is $$\frac{1}{n}\sum_{i=1}^n k(x_i,x_i) - \frac{1}{n^2}\sum_{i=1}^n\sum_{j=1}^n k(x_i,x_j) = 1,$$ so the final normalisation rule is $$ \hat{k}(x_i,x_j) = \frac{k(x_i,x_j)}{\frac{1}{n}\sum_{i=1}^n k(x_i,x_i) - \frac{1}{n^2}\sum_{i=1}^n\sum_{j=1}^n k(x_i,x_j)}. $$
Note that if the kernel is centered, the above rule simplifies to $$\hat{k}(x_i, x_j) = \frac{k(x_i,x_j)}{\frac{1}{n}\mathrm{tr}(K)},$$ where $\mathrm{tr}(K) = \sum_{i=1}^n k(x_i,x_i)$ is the trace of the kernel matrix $K$.
Reference: Kloft, Brefeld, Sonnenburg, Zien, "$\ell_p$-Norm Multiple Kernel Learning", Journal of Machine Learning Research 12 (2011) 953-997.
In [ ]:
def normalise_variance(K):
    """Normalise the kernel matrix, such that the variance (in feature space) is 1."""
    # Variance in feature space = mean squared norm minus squared norm of the mean,
    # i.e. mean(diag(K)) - mean(K); dividing the kernel by this scalar makes it 1.
    variance = np.mean(np.diag(K)) - np.mean(K)
    return K / variance
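A quick check (a minimal sketch on an arbitrary random linear kernel, not the notebook's data): after centering and variance normalisation the feature-space variance is 1, which for a centered kernel is simply $\frac{1}{n}\mathrm{tr}(K)$.
In [ ]:
# Minimal check on an arbitrary random linear kernel (illustration only):
# after centering and variance normalisation, the feature-space variance is 1.
X_check = np.random.randn(3, 7)
K_check = normalise_variance(center(np.dot(X_check.T, X_check)))
print(np.allclose(np.mean(np.diag(K_check)) - np.mean(K_check), 1.0))  # variance == 1
print(np.allclose(np.mean(np.diag(K_check)), 1.0))  # tr(K)/n == 1 since K is centered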
Since kernels are always dot products in some corresponding feature space, there is an associated distance. Recall that a kernel is given by $$k(x,y) = \dotprod{\phi(x)}{\phi(y)}.$$ Consider the squared distance in the feature space: \begin{align*} \| \phi(x) - \phi(y)\|^2 &= \dotprod{\phi(x)}{\phi(x)} + \dotprod{\phi(y)}{\phi(y)} - 2\dotprod{\phi(x)}{\phi(y)}\\ &= k(x,x) + k(y,y) - 2k(x,y), \end{align*} hence the corresponding distance is given by $$ d(x,y) = \sqrt{k(x,x) + k(y,y) - 2k(x,y)}. $$ Since human intuition seems to work better with distances (as compared to similarities), we also show the distance matrices.
In [ ]:
def kernel2distance(K):
    """Convert the kernel matrix into the corresponding matrix of pairwise distances in feature space."""
    D = np.zeros(K.shape)
    for ix in range(K.shape[0]):
        for iy in range(K.shape[1]):
            sqr_dist = K[ix,ix] + K[iy,iy] - 2*K[ix,iy]
            # Guard against small negative values due to numerical error, or to
            # normalisations (such as [0,1] scaling) that break positive semi-definiteness.
            if sqr_dist > 0.0:
                D[ix,iy] = np.sqrt(sqr_dist)
    return D
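For the linear kernel the feature map is the identity, so `kernel2distance` should reproduce the ordinary Euclidean distances between the points; the sketch below checks this on an arbitrary random example.
In [ ]:
# Minimal check on an arbitrary random example (illustration only): for the linear
# kernel k(x,y) = <x,y>, the kernel-induced distance is the Euclidean distance.
X_check = np.random.randn(3, 6)                          # 6 points (columns) with 3 features
D_kernel = kernel2distance(np.dot(X_check.T, X_check))
diffs = X_check[:, :, np.newaxis] - X_check[:, np.newaxis, :]
D_direct = np.sqrt(np.sum(diffs**2, axis=0))             # pairwise Euclidean distances
print(np.allclose(D_kernel, D_direct))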
In [ ]:
def cloud_gen(num_feat, num_points, centers, width):
"""Generate Gaussian point clouds"""
total_points = np.sum(num_points)
data = np.zeros((num_feat, total_points))
start_idx = 0
for ix, center in enumerate(centers):
C = np.array(center).copy()
C.shape = (len(center),1)
cur_data = C*np.ones((num_feat, num_points[ix])) + width*np.random.randn(num_feat, num_points[ix])
end_idx = start_idx + num_points[ix]
data[:,start_idx:end_idx] = cur_data.copy()
start_idx = end_idx
return data
centers = [[1,1,1],[0,0,np.sqrt(3)],[0,0,0]]
X = cloud_gen(3, [10,15,25], centers, 0.3)
print('Shape of data')
print(X.shape)
raw_kmat = np.dot(X.T,X)
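Since raw_kmat is a matrix of inner products, it should be symmetric and positive semi-definite; the short check below (a minimal sketch) verifies this numerically before we visualise the various normalisations.
In [ ]:
# Sanity check: a kernel (Gram) matrix is symmetric and positive semi-definite,
# so its smallest eigenvalue should be non-negative up to numerical error.
print(np.allclose(raw_kmat, raw_kmat.T))
print(np.min(np.linalg.eigvalsh(raw_kmat)) > -1e-8)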
For visualising kernel matrices it is important to choose good color maps.
The left column shows the kernel matrix, and the right column the corresponding distance matrix. From top to bottom, the rows show: the original kernel, the kernel normalised to [0,1], the centered kernel, the centered kernel normalised to have ones along the diagonal, and the centered kernel normalised to have unit variance in feature space.
In [ ]:
fig = plt.figure(figsize=(12,28))
ax = fig.add_subplot(521)
im = ax.matshow(raw_kmat, cmap=cm.winter)
ax.set_title('original kernel')
fig.colorbar(im)
ax = fig.add_subplot(522)
ax.set_title('original distance')
im = ax.matshow(kernel2distance(raw_kmat), cmap=cm.autumn)
fig.colorbar(im)
ax = fig.add_subplot(523)
im = ax.matshow(normalise_01(raw_kmat), cmap=cm.winter)
ax.set_title('normalise [0,1]')
fig.colorbar(im)
ax = fig.add_subplot(524)
im = ax.matshow(kernel2distance(normalise_01(raw_kmat)), cmap=cm.autumn)
fig.colorbar(im)
ax = fig.add_subplot(525)
im = ax.matshow(center(raw_kmat), cmap=cm.winter)
ax.set_title('zero mean in feature space')
fig.colorbar(im)
ax = fig.add_subplot(526)
im = ax.matshow(kernel2distance(center(raw_kmat)), cmap=cm.autumn)
fig.colorbar(im)
ax = fig.add_subplot(527)
im = ax.matshow(normalise_unit_diag(center(raw_kmat)), cmap=cm.winter)
ax.set_title('Ones along the diagonal')
fig.colorbar(im)
ax = fig.add_subplot(528)
im = ax.matshow(kernel2distance(normalise_unit_diag(center(raw_kmat))), cmap=cm.autumn)
fig.colorbar(im)
ax = fig.add_subplot(529)
im = ax.matshow(normalise_variance(center(raw_kmat)), cmap=cm.winter)
ax.set_title('Unit variance in feature space')
fig.colorbar(im)
ax = fig.add_subplot(5,2,10)
im = ax.matshow(kernel2distance(normalise_variance(center(raw_kmat))), cmap=cm.autumn)
fig.colorbar(im)
In [ ]: