In [1]:
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Date: 06/08/2020
This is a self-contained notebook that reproduces the toy example in Fig. 1 of the Guided ES paper.
In [2]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
print(f'TensorFlow version: {tf.__version__}')
In [3]:
# Construct a random least-squares problem: minimize (0.5/m) * ||Ax - b||^2.
rng = np.random.RandomState(seed=0)
m = 2000  # Number of data points.
n = 1000  # Number of variables.
A = rng.randn(m, n)
b = rng.randn(m, 1)

# Solve the problem in closed form so we can plot the optimality gap later.
xstar = np.linalg.lstsq(A, b, rcond=None)[0]
f_star = (0.5 / float(m)) * np.linalg.norm(A @ xstar - b) ** 2

# Move the problem data onto TensorFlow tensors for the optimizers below.
A = tf.convert_to_tensor(A, dtype=tf.float32)
b = tf.convert_to_tensor(b, dtype=tf.float32)

# Fixed unit-norm direction used to bias every observed gradient.
grad_bias = 1.0 * tf.nn.l2_normalize(tf.convert_to_tensor(rng.randn(n, 1), dtype=tf.float32))
@tf.function
def loss_and_grad_fun(x):
    """Return the least-squares loss at `x` together with a corrupted gradient.

    The returned "gradient" is the true gradient plus a fixed bias direction
    (`grad_bias`) and fresh isotropic noise, both scaled by the true gradient's
    norm. This mimics settings where only a biased, noisy surrogate gradient
    is available.
    """
    res = tf.matmul(A, x) - b
    loss = 0.5 * tf.norm(res) ** 2 / float(m)
    # True gradient of the least-squares objective.
    true_grad = tf.matmul(tf.transpose(A), res) / float(m)
    # Fresh unit-norm noise direction, scaled up by 1.5.
    noise_dir = 1.5 * tf.nn.l2_normalize(tf.random.normal(shape=(n, 1)))
    corrupted_grad = true_grad + (grad_bias + noise_dir) * tf.norm(true_grad)
    return loss, corrupted_grad
In [4]:
# Baseline: plain SGD on the (biased, noisy) surrogate gradient.
opt = tf.keras.optimizers.SGD(5e-3)

@tf.function
def step_fun(x):
    """Apply one SGD step to `x` in place and return the pre-update loss."""
    loss, grad = loss_and_grad_fun(x)
    opt.apply_gradients([(grad, x)])
    return loss
In [5]:
%%time
x = tf.Variable(tf.zeros((n, 1)), dtype=tf.float32)
fobj = []
for _ in range(10000):
fobj.append(step_fun(x))
# Store training curve for plotting later.
f_gd = tf.stack(fobj).numpy().copy()
In [6]:
# Hyperparameters for Vanilla ES.
sigma = 0.1          # Overall scale of the parameter perturbations.
beta = 1.0           # Scale applied to the finite-difference update.
learning_rate = 0.2

# Per-coordinate scale so the full perturbation has total scale sigma.
scale = sigma / np.sqrt(n)

def sample():
    """Draw one isotropic Gaussian perturbation of shape (n, 1)."""
    return scale * tf.random.normal(shape=(n, 1), dtype=tf.float32)

opt = tf.keras.optimizers.SGD(learning_rate)

@tf.function
def step_fun(x):
    """Apply one Vanilla ES step to `x` and return the post-update loss."""
    epsilon = sample()
    # Antithetic sampling: evaluate at both +epsilon and -epsilon.
    f_pos, _ = loss_and_grad_fun(x + epsilon)
    f_neg, _ = loss_and_grad_fun(x - epsilon)
    # Stochastic finite-difference estimate of the true gradient.
    coeff = beta / (2 * sigma ** 2)
    update = coeff * (f_pos - f_neg) * epsilon
    opt.apply_gradients([(update, x)])
    return loss_and_grad_fun(x)[0]
In [7]:
%%time
x = tf.Variable(tf.zeros((n, 1)), dtype=tf.float32)
# Run the optmizer.
fobj = []
for _ in range(10000):
fobj.append(step_fun(x))
# Store training curve for plotting later.
f_ves = tf.stack(fobj).numpy().copy()
Guided ES is our proposed method. It uses a diagonal plus low-rank covariance matrix for drawing perturbations, where the low-rank subspace is spanned by the available gradient information.
This allows it to incorporate the biased gradient information, while still minimizing the true loss function.
In [8]:
# Hyperparameters for Guided ES.
sigma = 0.1
alpha = 0.5  # Trade-off between the full space and the guiding subspace.
beta = 1.0
k = 1        # Dimensionality of the low-rank guiding subspace.

# Scales for the full-space and subspace components of each perturbation.
a = sigma * np.sqrt(alpha / float(n))
c = sigma * np.sqrt((1. - alpha) / float(k))

def sample(gradient_subspace):
    """Draw a perturbation from N(0, a^2*I + c^2*Q*Q^T), where the columns of
    Q orthonormally span `gradient_subspace`."""
    epsilon_full = tf.random.normal(shape=(n, 1), dtype=tf.float32)
    epsilon_subspace = tf.random.normal(shape=(k, 1), dtype=tf.float32)
    # Orthonormalize the guiding directions before projecting into them.
    Q, _ = tf.linalg.qr(gradient_subspace)
    return a * epsilon_full + c * tf.matmul(Q, epsilon_subspace)

opt = tf.keras.optimizers.SGD(0.2)

@tf.function
def step_fun(x):
    """Apply one Guided ES step to `x` and return the post-update loss."""
    # The (biased) surrogate gradient guides the sampling distribution.
    loss, gradient = loss_and_grad_fun(x)
    epsilon = sample(gradient)
    # Antithetic sampling: evaluate at both +epsilon and -epsilon.
    f_pos, _ = loss_and_grad_fun(x + epsilon)
    f_neg, _ = loss_and_grad_fun(x - epsilon)
    # Stochastic finite-difference estimate of the true gradient.
    coeff = beta / (2 * sigma ** 2)
    update = coeff * (f_pos - f_neg) * epsilon
    opt.apply_gradients([(update, x)])
    return loss_and_grad_fun(x)[0]
In [9]:
%%time
x = tf.Variable(tf.zeros((n, 1)), dtype=tf.float32)
# Run the optmizer.
fobj = []
for _ in range(10000):
fobj.append(step_fun(x))
# Store training curve for plotting later.
f_ges = tf.stack(fobj).numpy().copy()
In [10]:
# Compare the three methods by their optimality gap f(x) - f(x*).
COLORS = {'ges': '#7570b3', 'ves': '#1b9e77', 'sgdm': '#d95f02'}
plt.figure(figsize=(8, 6))
plt.plot(f_ves - f_star, color=COLORS['ves'], label='Vanilla ES')
plt.plot(f_gd - f_star, color=COLORS['sgdm'], label='Grad. Descent')
plt.plot(f_ges - f_star, color=COLORS['ges'], label='Guided ES')
plt.legend(fontsize=16, loc=0)
plt.xlabel('Iteration', fontsize=16)
# The curves are shifted by f_star, so label the axis as the optimality gap
# rather than the raw loss (the previous 'Loss' label was misleading).
plt.ylabel(r'Loss optimality gap $f - f^*$', fontsize=16)
plt.title('Demo of Guided Evolutionary Strategies', fontsize=16);
In [ ]: