In [8]:
import keras
import utils
reload(utils)
from utils import *
%pylab inline
# Get the input data (All the zeros and ones in the dataset)
(x, y), (x_, y_) = keras.datasets.mnist.load_data()
X = x[np.where(y<=1)]
Y = y[np.where(y<=1)]
Y = np.array(Y, dtype='int')
# Reshape the images to vectors
X = X.reshape(X.shape[0], -1)
X = X / 255. # Normalize inputs
# Vizualize the digits
pylab.rcParams['figure.figsize'] = (15, 4)
for i in range(12):
plt.subplot(2, 6, i+1)
plt.imshow(X[i].reshape([28, 28]))
plt.show()
We use logistic regression (you may want to read this: http://cs229.stanford.edu/notes/cs229-notes1.pdf):
The Cost is a function of the true output $Y$ and the prediction $p$, which itself is a function of a linear activation $s(x)$
To use gradient descent, we have to compute the gradient of the cost with respect to $W$:
$ \frac{dC}{dW} $
We take advantage of the chain rule:
$ \frac{dC}{dW} = \frac{dC}{dp} \cdot \frac{dp}{ds} \cdot \frac{ds}{dw} $
We derive each term: \begin{align} \frac{dC}{dp} &= - \frac{y}{p} - (-1) \cdot \frac{1-y}{1-p} \\ &= - \frac{y}{p} + \frac{1-y}{1-p} \\ &= \frac{-y + y \cdot p + p - y \cdot p}{p \cdot (1-p)} \\ &= \frac{-y+p}{p \cdot (1-p)} \end{align}
All together, we have : \begin{align} \frac{dC}{dW} &= \frac{dC}{dp} \cdot \frac{dp}{ds} \cdot \frac{ds}{dw} \\ &= \frac{-y+p}{p \cdot (1-p)} \cdot p \cdot (1-p) \cdot x \\ &= (-y+p) \cdot x \\ &= (p-y) \cdot x \end{align}
In [ ]:
# Set-up the weights
W = np.random.random((784,))-.5
# Train
for _ in range(2):
acc = []
losses = []
for x,y in zip(X, Y):
pred = linear(x, W)
pred = sigmoid(pred)
acc.append(round(pred)==y)
loss = nll(pred, y)
losses.append(loss)
update = (pred - y) * x
W = W - .02 * update
print sum(acc) / float(len(acc)), sum(losses)/len(losses)
In [ ]:
# Data generators: one sample per step for training,
# a held-out batch of 100 for validation.
gen = batch_generator(1)
valid_gen = batch_generator(100)
X_valid, Y_valid = valid_gen.next()

# Parameters of the linear model.
W = np.random.normal(size=IMG_SIZE * IMG_SIZE)
b = np.random.normal()

# Numerically-safe log: the epsilon keeps log(0) finite.
log = lambda x: np.log(x + 1e-8)
# Plain exponential.  The original added 1e-8 to the *input*, which
# shifts every value instead of protecting anything — exp is defined
# everywhere; the epsilon only belongs in log.
exp = lambda x: np.exp(x)

# SELU constants (Klambauer et al., 2017, "Self-Normalizing Neural Networks").
alph_ = 1.6732632423543772848170429916717
lambd_ = 1.0507009873554804934193349852946

linear = lambda x: np.dot(W.T, x) + b
sigm = lambda x: 1 / (1 + exp(-x))
# ELU: identity for x > 0, alpha * (exp(x) - 1) otherwise.
# np.maximum was wrong here: exp(x) - 1 >= x for every real x, so the
# original always returned the exponential branch.
elu = lambda x, alpha: np.where(x > 0, x, alpha * (exp(x) - 1))
selu = lambda x: lambd_ * elu(x, alph_)
nll = lambda p, y: - y * log(p) - (1 - y) * log(1 - p)
In [4]:
def prob(X):
    """Predicted probability that X belongs to class one: sigmoid of the linear score."""
    score = linear(X)
    return sigm(score)
def loss(X, y):
    """Negative log-likelihood of label y under the model's prediction.

    loss = -y * ln(sigm(W.T X + b)) - (1 - y) * ln(1 - sigm(W.T X + b))
    """
    return nll(prob(X), y)
def gradient_loss(X, y):
    """Gradient of the NLL with respect to W: d.loss/d.W = (p - y) * X."""
    residual = prob(X) - y
    return residual * X
def evaluate():
    """Return (number correct, mean NLL) on the held-out validation batch."""
    probs = np.array([prob(sample) for sample in X_valid])
    mean_loss = nll(probs, Y_valid).mean()
    predictions = [round(p) for p in probs]
    # List vs. ndarray comparison broadcasts elementwise.
    accuracy = sum(predictions == Y_valid)
    return accuracy, mean_loss
losses = []
alpha = 0.001
for epoch in range(60):
_loss = 0
alpha *= 0.95
for _ in range(2000):
X, Y = gen.next()
X, Y = X[0], Y[0]
_loss += loss(X, Y)
W = W - alpha * gradient_loss(X, Y)
losses.append(_loss / 2000)
print epoch, losses[-1], evaluate(), alpha
In [5]:
# Training curve for the plain logistic model; labels so the
# figure stands alone when the notebook is skimmed.
plt.plot(losses)
plt.xlabel('epoch')
plt.ylabel('mean NLL')
plt.title('Training loss (logistic regression)')
plt.show()
In [6]:
def prob(X):
    """Probability of class one: sigmoid applied to the SELU of the linear score."""
    score = selu(linear(X))
    return sigm(score)
def loss(X, y):
    """Negative log-likelihood of label y under the SELU model.

    loss = -y * ln(sigm(selu(W.T X + b))) - (1 - y) * ln(1 - sigm(selu(W.T X + b)))
    """
    return nll(prob(X), y)
def gradient_loss(X, y):
    """Gradient of the NLL with respect to W for the SELU model.

    With s = W.T X + b and p = sigm(selu(s)), the chain rule gives
        dC/dW = (p - y) * selu'(s) * X
    where selu'(s) = lambd_            for s > 0
          selu'(s) = lambd_ * alph_ * exp(s)
                   = selu(s) + lambd_ * alph_   for s <= 0.
    The original used (p + lambd_ * lambd_) in the s <= 0 branch,
    confusing the post-sigmoid probability p with selu(s) and
    lambd_ with alph_.
    """
    s = linear(X)
    p = prob(X)
    if s <= 0:
        dselu = selu(s) + lambd_ * alph_
    else:
        dselu = lambd_
    return X * (p - y) * dselu
def evaluate():
    """Evaluate on the validation batch; returns (correct count, mean NLL)."""
    probs = np.array([prob(v) for v in X_valid])
    mean_loss = nll(probs, Y_valid).mean()
    rounded = [round(p) for p in probs]
    # Elementwise comparison against the ndarray of labels.
    accuracy = sum(rounded == Y_valid)
    return accuracy, mean_loss
losses = []
alpha = 0.001
for epoch in range(30):
_loss = 0
alpha *= 0.95
for _ in range(2000):
X, Y = gen.next()
X, Y = X[0], Y[0]
_loss += loss(X, Y)
W = W - alpha * gradient_loss(X, Y)
losses.append(_loss / 2000)
print epoch, losses[-1], evaluate(), alpha
In [7]:
# Training curve for the SELU model; labels so the figure stands alone.
plt.plot(losses)
plt.xlabel('epoch')
plt.ylabel('mean NLL')
plt.title('Training loss (SELU model)')
plt.show()