In [ ]:
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import numpy as np
import theano
import theano.tensor as T
In [ ]:
def sgd(cost, params, learning_rate=0.05):
    # vanilla gradient descent: p <- p - eta * grad
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for param, grad in zip(params, grads):
        updates.append((param, param - learning_rate * grad))
    return updates

def rmsprop(cost, params, learning_rate=1e-3, rho=0.9, epsilon=1e-6):
    # scale each gradient by a running RMS of its recent magnitudes
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for param, grad in zip(params, grads):
        grad_avg = theano.shared(param.get_value() * 0.)
        grad_avg_new = rho * grad_avg + (1 - rho) * grad**2
        grad_scaling = T.sqrt(grad_avg_new + epsilon)
        grad = grad / grad_scaling
        updates.append((grad_avg, grad_avg_new))
        updates.append((param, param - learning_rate * grad))
    return updates

def adagrad(cost, params, learning_rate=1e-3, epsilon=1e-8):
    # scale each gradient by the square root of the sum of all its
    # past squared values
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for param, grad in zip(params, grads):
        grad_square = theano.shared(param.get_value() * 0.)
        grad_square_new = grad_square + grad**2
        updates.append((grad_square, grad_square_new))
        updates.append((
            param,
            param - learning_rate * grad / T.sqrt(grad_square_new + epsilon)
        ))
    return updates
def adam2(cost, params, learning_rate=1e-3, beta1=0.9, beta2=0.999, epsilon=1e-8):
    # textbook Adam: keep raw moment estimates and bias-correct them
    # explicitly before the parameter step
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    t = theano.shared(0)
    t_new = t + 1
    updates.append((t, t_new))
    for param, grad in zip(params, grads):
        first_moment = theano.shared(param.get_value() * 0.)
        first_moment_new = beta1 * first_moment + (1 - beta1) * grad
        second_moment = theano.shared(param.get_value() * 0.)
        second_moment_new = beta2 * second_moment + (1 - beta2) * grad**2
        # the shared accumulators store the *uncorrected* moments;
        # only the parameter step uses the bias-corrected estimates
        first_moment_hat = first_moment_new / (1 - beta1**t_new)
        second_moment_hat = second_moment_new / (1 - beta2**t_new)
        updates.append((first_moment, first_moment_new))
        updates.append((second_moment, second_moment_new))
        updates.append((
            param,
            param - learning_rate * first_moment_hat / T.sqrt(second_moment_hat + epsilon)
        ))
    return updates
def adam(cost, params, learning_rate=1e-3, beta1=0.9, beta2=0.999, epsilon=1e-8):
    # Adam as in Kingma & Ba: both bias corrections are folded into a
    # per-step learning rate instead of rescaling the moments
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    t = theano.shared(0)
    t_new = t + 1
    updates.append((t, t_new))
    for param, grad in zip(params, grads):
        first_moment = theano.shared(param.get_value() * 0.)
        first_moment_new = beta1 * first_moment + (1 - beta1) * grad
        second_moment = theano.shared(param.get_value() * 0.)
        second_moment_new = beta2 * second_moment + (1 - beta2) * grad**2
        learning_rate_norm = learning_rate * T.sqrt(1 - beta2**t_new) / (1 - beta1**t_new)
        updates.append((first_moment, first_moment_new))
        updates.append((second_moment, second_moment_new))
        updates.append((
            param,
            param - learning_rate_norm * first_moment_new / T.sqrt(second_moment_new + epsilon)
        ))
    return updates
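In [ ]:
# SANITY CHECK (a minimal sketch, not part of the original flow): run
# each optimizer on a 1-D quadratic bowl and confirm every update rule
# lowers the cost; convergence speed varies a lot with these default
# step sizes. The helper name check_optimizer is ours, and it only
# assumes the four functions defined above.
def check_optimizer(optimizer, num_steps=500, **kwargs):
    p = theano.shared(3.0, name='p')
    c = p**2
    step = theano.function([], c, updates=optimizer(c, [p], **kwargs))
    for _ in range(num_steps):
        last_cost = step()
    return float(last_cost)

for optimizer in (sgd, rmsprop, adagrad, adam):
    print("{0:>8s}: final cost = {1:.6f}".format(
        optimizer.__name__, check_optimizer(optimizer)))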
In [ ]:
# INITIALIZATION
# an arbitrary initial guess for the minimum's location
x = theano.shared(0.5, name='x')
y = theano.shared(-3., name='y')
In [ ]:
# COST FUNCTION
# pick the quadratic for a simple example/debugging,
# the Rosenbrock function for a more interesting convergence analysis
# global minimum = (0, 0)
quad = x**2 + y**2
# global minimum = (1, 1)
rosen = (1 - x)**2 + 100. * (y - x**2)**2
# pick the cost function.
cost = rosen
f_cost = theano.function([], cost)
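In [ ]:
# Quick check (a sketch): evaluate the cost at the known minimum by
# temporarily moving the shared variables there, then restore them.
x0, y0 = x.get_value(), y.get_value()
x.set_value(1.)
y.set_value(1.)
print(f_cost())  # expect 0.0 at the Rosenbrock minimum (1, 1)
x.set_value(x0)
y.set_value(y0)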
In [ ]:
# OPTIMIZER
# pick the optimization method and its hyperparameters
eta = 0.05
#updates = sgd(cost, [x, y], learning_rate=eta)
#updates = rmsprop(cost, [x, y], learning_rate=eta)
#updates = adagrad(cost, [x, y], learning_rate=eta)
updates = adam(cost, [x, y], learning_rate=eta)
# each call to train() evaluates the cost, then applies every update pair
train = theano.function(
    inputs=[],
    outputs=cost,
    updates=updates
)
In [ ]:
# RUN OPTIMIZATION
STR_XY = "iter: {0:4d}, cost = {1:.2f}, (x, y) = ({2:5.2f}, {3:5.2f})"
costs = [float(f_cost())]
xs = [float(x.get_value())]
ys = [float(y.get_value())]
num_epochs = 5000
for epoch in range(num_epochs):
    train()
    costs.append(float(f_cost()))
    xs.append(float(x.get_value()))
    ys.append(float(y.get_value()))
    if epoch % (num_epochs // 10) == 0:
        print(STR_XY.format(epoch, costs[-1], xs[-1], ys[-1]))
print(STR_XY.format(epoch, costs[-1], xs[-1], ys[-1]))
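In [ ]:
# Optional diagnostic (a sketch): plot the cost per iteration on a log
# scale; handy for comparing how fast the chosen optimizer converges.
plt.figure()
plt.semilogy(costs)
plt.xlabel('iteration')
plt.ylabel('cost')
plt.show()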
In [ ]:
# PREPARING COST FUNCTION'S LEVEL CURVES
x_grid = np.arange(-5.0, 5.0, 0.05)
y_grid = np.arange(-5.0, 5.0, 0.05)
X, Y = np.meshgrid(x_grid, y_grid)
#Z = X**2 + Y**2
# Z must match the cost function chosen above
Z = (1 - X)**2 + 100. * (Y - X**2)**2
In [ ]:
# PLOT
# plot the evolution of the minimum estimate
# together with the functions level curves
fig = plt.figure()
plt.axes(xlim=(-1.5, 1.5), ylim=(-4, 4))
plt.contour(X, Y, Z,
            colors='lightgray', levels=np.logspace(-1, 4, 8))
plt.plot(xs, ys, '-')
plt.show()
In [ ]:
# ANIMATE
# do a looping pop-up animation of the selected algorithm
fig = plt.figure()
ax = plt.axes(xlim=(-1.5, 1.5), ylim=(-4, 4))
line, = ax.plot([], [], '-')
def init():
    line.set_data([], [])
    return line,

# animation function, called sequentially for each frame
def animate(i):
    line.set_data(xs[:i], ys[:i])
    return line,

ax.contour(X, Y, Z,
           colors='lightgray', levels=np.logspace(-1, 4, 8))
# frames = number of steps + 1 (the initial position)
anim = animation.FuncAnimation(fig, animate, init_func=init,
                               frames=len(xs), interval=50, blit=True)
plt.show()
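In [ ]:
# SAVE THE ANIMATION (optional sketch): matplotlib delegates export to
# an external writer; this assumes ffmpeg is installed on the system.
#anim.save('descent.mp4', writer='ffmpeg', fps=30)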