Deep Hypersphere Learning


In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
g_cos = np.cos

def g_sigmoid(t, k=1.0):
    etk = np.exp( (t - np.pi / 2) / k )
    e0k = np.exp( - np.pi / 2 / k )
    y = ( (1 - etk) * (1 + e0k) ) / ( (1 + etk) * (1 - e0k) )
    return y

In [3]:
def g_sigmoid_grad(t, k=1.0):
    etk = np.exp((t - np.pi/2) / k)
    e0k = np.exp((0. - np.pi/2) / k)
    y = -2 * (1 + e0k) * etk / (1 - e0k) / (1 + etk) ** 2 / k
    return y

Chainer Implementation


In [4]:
import chainer
import chainer.functions as F
import chainer.links as L

In [5]:
class AngleFunction(chainer.Function):
    def forward(self, inputs):
        xp = chainer.cuda.get_array_module(*inputs)
        norm = xp.linalg.norm
        x, w = inputs
        
        nx = norm(x, axis=1)[:,None]
        nw = norm(w, axis=0)[None,:]
        y = xp.dot(x, w) / (nx * nw)
        return y,
    
    def backward(self, inputs, grad_outputs):
        xp = chainer.cuda.get_array_module(*inputs)
        norm = xp.linalg.norm
        x, w = inputs
        gy, = grad_outputs
        if gy is None:
            gy = 1.0
        nx = norm(x, axis=1)[:,None]
        nw = norm(w, axis=0)[None,:]
        xtw = xp.dot(x, w)
        
        xtx = nx ** 2
        wtw = nw ** 2
        
        inv = gy / xp.sqrt( xtx * wtw - xtw**2 )
        gx = inv.dot( (w - x.T.dot(xtw / xtx)).T )
        gw = (x.T - w.dot((xtw / wtw).T) ).dot(inv)
        return gx, gw

def angle_function(x, w):
    return AngleFunction()(x, w)

In [6]:
class AngleLink(chainer.Link):
    def __init__(self, dim, n_filter):
        super(AngleLink, self).__init__()
        with self.init_scope():
            W = np.random.random( (dim, n_filter) ).astype(np.float32)
            self.W = chainer.Parameter(W)
    
    def __call__(self, x):
        return angle_function(x, self.W)

In [13]:
def projection_onto_hypersphere(update_rule, w):
    xp = chainer.cuda.get_array_module(w)
    nw = xp.linalg.norm(w.data, axis=0)[None,:]
    w.data /= nw

In [19]:
N = 5
d = 1000
M = 4

norm = np.linalg.norm
random = np.random.random
x = random( (N, d) ).astype(np.float32)
x = chainer.Variable(x)

In [20]:
l1 = AngleLink(d, M)
optimizer = chainer.optimizers.SGD()
optimizer.setup(l1)

In [21]:
l1.W.update_rule.add_hook(projection_onto_hypersphere)

In [22]:
N = 10000
L = np.zeros(N)
for n in range(N):
    z = l1(x)
    l = F.sum(z**2)
    l.cleargrad()
    l.backward()
    optimizer.update()
    L[n] = l.data

In [23]:
plt.plot(L)
plt.grid()
plt.xlabel('Iter')
plt.ylabel('Loss')


Out[23]:
<matplotlib.text.Text at 0x7ff12f666d30>

In [50]:
np.linalg.norm(l1.W.grad, axis=0)


Out[50]:
array([ 537.37634277,  537.56158447,  538.81158447,  537.99725342], dtype=float32)