In [ ]:
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import numpy as np
import theano
import theano.tensor as T

In [ ]:
def sgd(cost, params, learning_rate=0.05):
    # vanilla gradient descent: param <- param - learning_rate * grad
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for param, grad in zip(params, grads):
        updates.append((param, param - learning_rate * grad))
    return updates

def rmsprop(cost, params, learning_rate=1e-3, rho=0.9, epsilon=1e-6):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for param, grad in zip(params, grads):
        # running average of the squared gradient
        grad_sq_avg = theano.shared(param.get_value() * 0.)
        grad_sq_avg_new = rho * grad_sq_avg + (1 - rho) * grad**2

        # rescale the gradient by the root of that running average
        grad_scaled = grad / T.sqrt(grad_sq_avg_new + epsilon)

        updates.append((grad_sq_avg, grad_sq_avg_new))
        updates.append((param, param - learning_rate * grad_scaled))
    return updates

def adagrad(cost, params, learning_rate=1e-3, epsilon=1e-8):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for param, grad in zip(params, grads):
        # accumulated sum of all past squared gradients
        grad_square = theano.shared(param.get_value() * 0.)
        grad_square_new = grad_square + grad**2

        updates.append((grad_square, grad_square_new))
        updates.append((
                param,
                param - learning_rate * grad / T.sqrt(grad_square_new + epsilon)
        ))
    return updates

def adam2(cost, params, learning_rate=1e-3, beta1=0.9, beta2=0.999, epsilon=1e-8):
    # Adam variant with explicit bias-corrected moment estimates;
    # adam() below folds the same correction into the learning rate instead
    grads = T.grad(cost=cost, wrt=params)
    updates = []

    # shared step counter, incremented once per update
    t = theano.shared(0)
    t_new = t + 1
    updates.append((t, t_new))

    for param, grad in zip(params, grads):
        # running averages of the gradient and of the squared gradient
        first_moment = theano.shared(param.get_value() * 0.)
        first_moment_new = beta1 * first_moment + (1 - beta1) * grad

        second_moment = theano.shared(param.get_value() * 0.)
        second_moment_new = beta2 * second_moment + (1 - beta2) * grad**2

        # bias-corrected estimates are used for the step only;
        # the uncorrected running averages are what gets stored
        first_moment_hat = first_moment_new / (1 - beta1**t_new)
        second_moment_hat = second_moment_new / (1 - beta2**t_new)

        updates.append((first_moment, first_moment_new))
        updates.append((second_moment, second_moment_new))
        updates.append((
            param,
            param - learning_rate * first_moment_hat / T.sqrt(second_moment_hat + epsilon)
        ))
    return updates

def adam(cost, params, learning_rate=1e-3, beta1=0.9, beta2=0.999, epsilon=1e-8):
    grads = T.grad(cost=cost, wrt=params)
    updates = []

    # shared step counter, incremented once per update
    t = theano.shared(0)
    t_new = t + 1
    updates.append((t, t_new))

    for param, grad in zip(params, grads):
        # running averages of the gradient and of the squared gradient
        first_moment = theano.shared(param.get_value() * 0.)
        first_moment_new = beta1 * first_moment + (1 - beta1) * grad

        second_moment = theano.shared(param.get_value() * 0.)
        second_moment_new = beta2 * second_moment + (1 - beta2) * grad**2

        # same algorithm as adam2, with both bias corrections folded into the learning rate
        learning_rate_norm = learning_rate * T.sqrt(1 - beta2**t_new) / (1 - beta1**t_new)

        updates.append((first_moment, first_moment_new))
        updates.append((second_moment, second_moment_new))
        updates.append((
            param,
            param - learning_rate_norm * first_moment_new / T.sqrt(second_moment_new + epsilon)
        ))
    return updates
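
In [ ]:
# SANITY CHECK (added sketch, not part of the original flow)
# Run each update rule for a few hundred steps on a toy 1-D quadratic and
# print the final cost; with the small default learning rates this only
# confirms the updates move downhill, not that they converge quickly.
# The names w, check_cost and step are illustrative helpers.
for make_updates in (sgd, rmsprop, adagrad, adam):
    w = theano.shared(5.)
    check_cost = w**2
    step = theano.function([], check_cost, updates=make_updates(check_cost, [w]))
    for _ in range(200):
        step()
    print("{0:>8s}: final cost = {1:.6f}".format(make_updates.__name__, float(w.get_value()) ** 2))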

In [ ]:
# INITIALIZATION
# a naive initial guess of the minimum's location
x = theano.shared(0.5, name='x')
y = theano.shared(-3., name='y')

In [ ]:
# COST FUNCTION
# pick the quadratic for a simple example/debugging,
# the Rosenbrock function for a more interesting convergence analysis

# global minimum = (0, 0)
quad = x**2 + y**2

# global minimum = (1, 1)
rosen = (1 - x)**2 + 100. * (y - x**2)**2

# pick the cost function.
cost = rosen
f_cost = theano.function([], cost)
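
In [ ]:
# QUICK CHECK (added sketch): evaluate the cost and gradient at the stated
# global minimum to confirm it is a stationary point. f_grad, x0 and y0 are
# helper names introduced only here; the target (1, 1) assumes cost = rosen.
f_grad = theano.function([], T.grad(cost, [x, y]))
x0, y0 = x.get_value(), y.get_value()   # remember the starting guess
x.set_value(1.)
y.set_value(1.)
print("cost = {0:.4f}, grad = {1}".format(float(f_cost()), [float(g) for g in f_grad()]))
x.set_value(x0)                          # restore the starting guess
y.set_value(y0)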

In [ ]:
# OPTIMIZER
# pick the optimization method and its hyperparameters

eta = 0.05
#updates = sgd(cost, [x, y], learning_rate=eta)
#updates = rmsprop(cost, [x, y], learning_rate=eta)
#updates = adagrad(cost, [x, y], learning_rate=eta)
updates = adam(cost, [x, y], learning_rate=eta)

train = theano.function(
    inputs=[],
    outputs=cost,
    updates=updates 
)

In [ ]:
# RUN OPTIMIZATION

STR_XY = "iter: {0:2d}, cost = {1:.2f}, (x, y) = ({2:5.2f}, {3:5.2f})"
costs = [float(f_cost())]
xs = [float(x.get_value())]
ys = [float(y.get_value())]

num_epochs = 5000
for epoch in range(num_epochs):
    train()
    costs.append(float(f_cost()))
    xs.append(float(x.get_value()))
    ys.append(float(y.get_value()))
    if epoch % (num_epochs // 10) == 0:
        print(STR_XY.format(epoch, costs[-1], xs[-1], ys[-1]))
print(STR_XY.format(epoch, costs[-1], xs[-1], ys[-1]))
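
In [ ]:
# SUMMARY (added sketch): distance of the final iterate from the known
# minimum of the selected cost; the target (1, 1) assumes cost = rosen.
x_min, y_min = 1., 1.
print("final distance to the minimum: {0:.6f}".format(np.hypot(xs[-1] - x_min, ys[-1] - y_min)))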

In [ ]:
# PREPARING THE COST FUNCTION'S LEVEL CURVES
# (keep Z consistent with the cost selected above)
x_grid = np.arange(-5.0, 5.0, 0.05)
y_grid = np.arange(-5.0, 5.0, 0.05)
X, Y = np.meshgrid(x_grid, y_grid)
#Z = X**2 + Y**2
Z = (1 - X)**2 + 100. * (Y - X**2)**2

In [ ]:
# PLOT
# plot the evolution of the minimum estimate
# together with the function's level curves

fig = plt.figure()
plt.axes(xlim=(-1.5, 1.5), ylim=(-4, 4))

plt.contour(X, Y, Z,
            colors='lightgray', levels=np.logspace(-1, 4, 8))
plt.plot(xs, ys, '-')

plt.show()
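
In [ ]:
# COST CURVE (added sketch): the recorded cost per iteration on a log scale,
# reusing the costs list collected during the run above.
fig = plt.figure()
plt.semilogy(costs)
plt.xlabel('iteration')
plt.ylabel('cost')
plt.show()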

In [ ]:
# ANIMATE
# show a looping pop-up animation of the selected algorithm's trajectory

fig = plt.figure()
ax = plt.axes(xlim=(-1.5, 1.5), ylim=(-4, 4))
line, = ax.plot([], [], '-')

def init():
    line.set_data([], [])
    return line,

# animation function, called once per frame with the frame index i
def animate(i):
    # show the first i+1 points so the final frame draws the full path
    line.set_data(xs[:i + 1], ys[:i + 1])
    return line,

# draw the static level curves once as a background
ax.contour(X, Y, Z,
           colors='lightgray', levels=np.logspace(-1, 4, 8))

# frames is the number of steps + 1 (initial position)
anim = animation.FuncAnimation(fig, animate, init_func=init,
                               frames=len(xs), interval=50, blit=True)

plt.show()
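
In [ ]:
# SAVE (optional, added sketch): write the animation to a file instead of
# only showing it. This assumes an ffmpeg binary is available on the PATH;
# the output filename is illustrative.
anim.save('optimizer_path.mp4', writer='ffmpeg', fps=20)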

In [ ]: