Logarithmic Parameters

This notebook explores Bayesian optimisation of a function whose parameter is best thought of logarithmically (the order of magnitude matters more than the value itself).

To accommodate this, the surrogate model is trained on the exponents of the values rather than on the values themselves.

Note: for this particular function, $\nu=2.5$ works better for the Matérn kernel than $\nu=1.5$.
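
The mapping itself is easy to sketch with plain numpy (to_latent / from_latent below are illustrative names, not the turbo API): the surrogate is shown log10(x - zero_point), and the value is exponentiated again before the objective is evaluated.

In [ ]:
import numpy as np

def to_latent(x, zero_point=0.0):
    # the surrogate sees the order of magnitude of the (offset) parameter
    return np.log10(x - zero_point)

def from_latent(z, zero_point=0.0):
    # inverse mapping, applied before the objective is evaluated
    return 10.0 ** z + zero_point

x = 5e-3
z = to_latent(x)
assert np.isclose(from_latent(z), x)
print(z)  # about -2.3: the exponent rather than the raw value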


In [ ]:
%load_ext autoreload
%autoreload 2

In [ ]:
from IPython.core.debugger import set_trace # debugging (replaces the deprecated Tracer)
from IPython.display import clear_output, display
import time

%matplotlib inline
#%config InlineBackend.figure_format = 'svg'
import matplotlib.pyplot as plt
import seaborn as sns; sns.set() # prettify matplotlib

import copy
import numpy as np
import sklearn.gaussian_process as sk_gp

In [ ]:
# local modules
import turbo as tb
import turbo.modules as tm
import turbo.gui.jupyter as tg
import turbo.plotting as tp

In [ ]:
# make deterministic
np.random.seed(100)

Function to optimize:


In [ ]:
buffer = 5e-3 # function not defined at exactly 0
shift = -2
def f(x):
    x = x - shift
    return np.cos(2*(20-x)**2)/x - 2*np.log(x)
def logspace(from_, to, num_per_mag=1):
    '''
    num_per_mag: number of samples per order of magnitude
    '''
    from_exp = np.log10(from_)
    to_exp = np.log10(to)
    num = int(round(abs(to_exp-from_exp)*num_per_mag)) + 1 # np.logspace needs an integer sample count
    return np.logspace(from_exp, to_exp, num=num, base=10)

x_min = buffer
x_max = 5
xs = logspace(x_min, x_max, num_per_mag=200)
x_min += shift
x_max += shift
xs += shift


#xs = np.linspace(x_min, x_max, num=601)
print(len(xs))
ys = f(xs)
best_y = np.max(ys)

In [ ]:
plt.figure(figsize=(16,4))
plt.plot(xs, ys, 'g-')
plt.margins(0.01, 0.1)
plt.title('Linear Scale')
plt.xlabel('x')
plt.ylabel('cost')
plt.show()

In [ ]:
plt.figure(figsize=(16,4))
plt.plot(xs - shift, ys, 'g-') # have to revert the shift to plot with the log scale
plt.margins(0.1, 0.1)
plt.title('Logarithmic Scale')
plt.xlabel('x')
plt.xscale('log')
plt.ylabel('cost')
plt.show()

In [ ]:
bounds = [('x', x_min, x_max)]
op = tb.Optimiser(f, 'max', bounds, pre_phase_trials=2, settings_preset='default')
'''
op.latent_space = tm.NoLatentSpace()
# this function is very difficult to fit effectively; the only way I found to make the GP behave is
# to use the domain knowledge that the length_scale can't be anywhere near the default maximum of 100,000
op.surrogate_factory = tm.SciKitGPSurrogate.Factory(gp_params=dict(
    alpha = 1e-10, # larger => more noise. Default = 1e-10
    kernel = 1.0 * sk_gp.kernels.Matern(nu=2.5, length_scale_bounds=(1e-5, 10)) + sk_gp.kernels.WhiteKernel(),
), variable_iterations=lambda trial_num: 4 if (trial_num-2) % 3 == 0 else 1)
'''
op.surrogate = tm.GPySurrogate()
op.acquisition = tm.UCB(beta=2)

op_log = copy.deepcopy(op)

rec = tb.Recorder(op)
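
For reference, the upper confidence bound acquisition used above (assuming the usual definition for maximisation) scores a candidate by its predicted mean plus beta times its predicted standard deviation; the sketch below is the generic formula, not turbo's implementation.

In [ ]:
def ucb(mean, std, beta=2.0):
    # generic UCB for maximisation: trade off exploitation (mean) against exploration (std)
    return mean + beta * std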

To illustrate the problem, first run the optimiser without a logarithmic mapping:


In [ ]:
tg.OptimiserProgressBar(op)
op.run(max_trials=30)

In [ ]:
tp.plot_error(rec, true_best=best_y);

In [ ]:
tp.plot_timings(rec);

In [ ]:
tp.interactive_plot_trial_1D(rec, param='x', true_objective=f)

Now with a logarithmic latent space mapping:


In [ ]:
zero_point = x_min - buffer # the function is not defined for any x <= zero point
op_log.latent_space = tm.ConstantLatentSpace(mappings={'x' : tm.LogMap(zero_point=zero_point)})
rec_log = tb.Recorder(op_log)
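
As a quick sanity check (assuming the mapping is essentially a log10(x - zero_point) transform), the offset search range x - zero_point runs from buffer up to 5, i.e. roughly three orders of magnitude:

In [ ]:
# orders of magnitude spanned by the offset search range under a log10(x - zero_point) mapping
span = np.log10(x_max - zero_point) - np.log10(x_min - zero_point)
print(span)  # approximately 3.0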

In [ ]:
tg.OptimiserProgressBar(op_log)
op_log.run(max_trials=15)

In [ ]:
tp.plot_error(rec_log, true_best=best_y);

In [ ]:
tp.plot_timings(rec_log);

In [ ]:
for l in [False, True]:
    tp.plot_trial_1D(rec_log, param='x', trial_num=-1, true_objective=f, plot_in_latent_space=l)

In [ ]:
tp.interactive_plot_trial_1D(rec_log, true_objective=f)
