In [11]:
%matplotlib inline
import numpy.random as rng
import pylab as pl
import autograd.numpy as np
from autograd import grad
In [12]:
# Make a toy 2D dataset whose binary labels come from a ground-truth perceptron.
Npats, Nins = 60, 2
X = 1*rng.normal(1,1,size=(Npats,Nins))
w_truth = rng.normal(0,1,size=(Nins))
m_truth = rng.normal(1,1,size=(Nins))
phi = np.dot(X - m_truth, w_truth)
Targ = np.where(phi > 0.0, 1, 0)
In [13]:
def sigmoid(phi):
    return 1.0/(1.0 + np.exp(-phi))

def calc_prob_class1(params):
    # Sigmoid perceptron ('logistic regression'): probability of class 1.
    tildex = X - params['mean']
    W = params['wgts']
    phi = np.dot(tildex, W)
    return sigmoid(phi)

def calc_membership(params):
    # NB. this is just a helper function for classification_loss really.
    tildex = X - params['mean']
    W, r2, R2 = params['wgts'], params['r2'], params['R2']
    Dr2 = np.power(np.dot(tildex, W), 2.0)   # squared dot product with the weights
    L2X = (np.power(tildex, 2.0)).sum(1)     # squared norm of the shifted input
    DR2 = L2X - Dr2
    dist2 = (Dr2/r2) + (DR2/R2)  # rescaled 'distance' to the shifted 'origin'
    membership = np.exp(-0.5*dist2)
    #print(membership)
    return np.array(membership)

def classification_loss(params):
    # Membership-weighted log-likelihood of correct classification
    # (maximised by gradient ascent below).
    membership = calc_membership(params)
    Y = calc_prob_class1(params)
    return np.sum(membership*(Targ*np.log(Y) + (1-Targ)*np.log(1-Y)))
In [14]:
def MoG_loss(params):
    # Sum of the memberships under the current Gaussian.
    membership = calc_membership(params)
    return np.sum(membership)
In [15]:
classification_gradient = grad(classification_loss)
MoG_gradient = grad(MoG_loss)
Red and black are the target-0 and target-1 patterns respectively.
They get "filled in" once the perceptron classifies them correctly.
In [16]:
# Be able to show the current solution, against the data in 2D.
def show_result(params, X, Targ):
print("Parameters:")
for key in params.keys():
print(key,'\t', params[key])
print("Loss:", training_loss(params))
membership = calc_membership(params)
Y = calc_prob_class1(params)
pl.clf()
marksize = 8
cl ={0:'red', 1:'black'}
for i, x in enumerate(X):
pl.plot(x[0],x[1],'x',color=cl[int(Targ[i])],alpha=.4,markersize=marksize)
pl.plot(x[0],x[1],'o',color=cl[int(Targ[i])],alpha=1.-float(abs(Targ[i]-Y[i])),markersize=marksize)
pl.axis('equal')
s = X.ravel().max() - X.ravel().min()
m, w = params['mean'], params['wgts']
# Show the mean in blue
#pl.arrow(0, 0, m[0], m[1], head_width=0.25, head_length=0.5, fc='b', ec='b', linewidth=1, alpha=.95)
# Show the perceptron decision boundary, in green
pl.arrow(m[0]-w[0], m[1]-w[1], w[0], w[1], head_width=s, head_length=s/5, fc='g', ec='g', linewidth=3, alpha=.5)
pl.show()
In [17]:
def do_one_learning_step(params, X, Targ, rate):
    # One step of gradient ascent on the membership-weighted log-likelihood.
    grads = classification_gradient(params)
    params['wgts'] = params['wgts'] + rate * grads['wgts']
    params['mean'] = params['mean'] + rate * grads['mean']
    return params
In [18]:
init_w = rng.normal(0,1,size=(Nins))
init_m = 4.*rng.normal(0,1,size=(Nins))
rate = 0.5 / Npats
params = {'wgts':init_w, 'mean':init_m, 'r2':1000.0, 'R2':1000.0}
In [19]:
for t in range(250):
    params = do_one_learning_step(params, X, Targ, rate)
show_result(params, X, Targ)
Y = sigmoid(np.dot(X - params['mean'], params['wgts']))
print('vanilla loss: ', np.sum(Targ*np.log(Y) + (1-Targ)*np.log(1-Y)))
My model above used the loss function $$ \mathcal{L} = \sum_n \rho_{n} \;\, \log P(\mathbf{x}_n \; \text{classified correctly}) $$ where $\rho_n$ is the "membership". I set $\rho_n = \exp(-d^2_n/2)$ where $d_{n}$ is the Mahalanobis distance of the n-th input $\mathbf{x}_n$ under the current Gaussian.
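For reference, the fully general form of that membership weight, with an arbitrary full covariance, would look something like the sketch below. This is just for illustration: `mahalanobis_membership`, `mean` and `Sigma` are placeholder names of mine, and the code above uses a restricted two-scale version instead (roughly, one variance r2 along the weight direction and another R2 for everything else).
def mahalanobis_membership(x, mean, Sigma):
    # rho(x) = exp(-d^2/2), with d the Mahalanobis distance of x from `mean`
    # under a Gaussian with full covariance matrix `Sigma`.
    diff = x - mean
    d2 = np.dot(diff, np.linalg.solve(Sigma, diff))
    return np.exp(-0.5 * d2)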
Suppose this actually works and we have a bunch of such "boundary hunters".
They're each capable of independently doing something sensible, unlike vanilla perceptrons. But we do want them to interact just enough to stay out of each other's way. This makes me wonder whether using a MoG to initialise them might be a good idea, and THAT makes me wonder whether we can just interpolate smoothly between these two loss functions. i.e. is there a "knob" we can turn which, at one extreme, gives the MoG and, at the other, gives a set of class boundaries? It would be a cute thing to be able to do, basically.
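One obvious candidate for such a knob, using the two loss functions already defined above, is just a convex combination. A minimal sketch (the function name and the interpolation parameter `lam` are mine, purely for illustration):
def interpolated_loss(params, lam):
    # lam = 0.0 gives the membership-only (MoG-ish) objective;
    # lam = 1.0 gives the membership-weighted classification objective.
    return (1.0 - lam) * MoG_loss(params) + lam * classification_loss(params)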
The loss of a MoG model is a sum over training items, which are indexed by $n$.
For simplicity, let's suppose the $K$ mixture components all have the same prior $\pi_k = 1/K$ and the same "volume" (i.e. the same covariance determinant).
Then the probability density $\rho_{nk} = P(\mathbf{x}_n | k) \propto \exp(-d_{nk}^2 / 2)$.
We can denote a "responsibility" (posterior probability) of the k-th Gaussian for the n-th input pattern $$ r_{nk} = \frac{\rho_{nk}}{\displaystyle{ \sum_{k^\prime} \rho_{nk^\prime}}}$$.
Per item, the loss of a MoG model is then $$ \mathcal{L}_\text{MoG} = \sum_k r_{nk} \;\, \log P(\mathbf{x}_n \; \text{generated} | k) $$
I suspect a plausible loss function to use for the "horde of boundary hunters" could be (per item) as follows, which is strikingly similar: $$ \mathcal{L}_\text{HoBH} = \sum_k r_{nk} \;\, \log P(\mathbf{x}_n \; \text{classified correctly} | k) $$
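As a rough, untested sketch of what that could look like in code for $K$ hunters, vectorised over the whole training set (the function name, the array shapes, and the isotropic unit-variance memberships are all assumptions of mine, not code from the cells above):
def hobh_loss(means, wgts, X, Targ):
    # means, wgts: (K, Nins) arrays, one row per boundary hunter (assumed shapes).
    # Memberships rho_nk = exp(-d_nk^2 / 2), using an isotropic unit-variance
    # Gaussian per hunter to keep the sketch simple.
    diffs = X[:, None, :] - means[None, :, :]          # (N, K, Nins)
    d2 = np.sum(diffs**2, axis=2)                      # squared distances, (N, K)
    rho = np.exp(-0.5 * d2)
    resp = rho / np.sum(rho, axis=1, keepdims=True)    # responsibilities r_nk
    # Each hunter's own sigmoid perceptron gives its probability of class 1.
    phi = np.sum(diffs * wgts[None, :, :], axis=2)     # (N, K)
    Y = 1.0 / (1.0 + np.exp(-phi))
    loglik = Targ[:, None]*np.log(Y) + (1 - Targ[:, None])*np.log(1 - Y)
    return np.sum(resp * loglik)                       # L_HoBH, summed over items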
If that works, it's quite groovy as a generative model: the universe has to (a) place the point correctly and (b) give it the right class, in order to nail the whole problem. i.e. we could have a combined model which just uses the sum of the two losses, say, to model both the density and the class information.
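And a combined objective along those lines could be as simple as adding a MoG-style density term to the HoBH term, per the two formulas above. Another sketch, under the same assumptions as the previous one (it reuses the hypothetical `hobh_loss`, and drops the Gaussian normalising constant, which is fine under the equal-volume assumption):
def combined_loss(means, wgts, X, Targ):
    # Density term (sum_n sum_k r_nk log P(x_n | k)) plus classification term.
    diffs = X[:, None, :] - means[None, :, :]
    d2 = np.sum(diffs**2, axis=2)
    rho = np.exp(-0.5 * d2)                            # proportional to P(x_n | k)
    resp = rho / np.sum(rho, axis=1, keepdims=True)    # responsibilities r_nk
    mog_term = np.sum(resp * np.log(rho))
    return mog_term + hobh_loss(means, wgts, X, Targ)  # hobh_loss from the sketch above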
In [ ]: