Copyright 2019 Google LLC
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
https://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
This is an implementation of the Progressive Dynamic Hurdles (PDH) algorithm that was presented in the Evolved Transformer paper. A toy problem is used to compare the algorithm to plain evolution.
This is a toy problem meant to simulate the architecture search problem presented in the paper. It is adapted from the toy problem presented by Real et al. (2018), with code based off of their implementation.
A key difference here is that there is not a single function used to compute all fitnesses. Instead there are two fitness functions, depending on how many steps a Model is allowed to "train" for:
1) get_final_accuracy(): simulates the fitness computation of a model that has been trained for a hypothetical maximum number of train steps.
2) get_early_accuracy(): simulates the fitness computation of a model that has been trained for 1/100th the hypothetical maximum number of train steps.
Having two different fitnesses for two different numbers of train steps is necessary to apply the PDH algorithm. In line with this, each Model also has two accuracies: an observed_accuracy, which is what we observe when the model is evaluated during the search, and a true_accuracy, which is the accuracy the model achieves when it is trained for the maximum number of train steps.
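Concretely, here is a sketch of the formulas the next cell implements, where clip truncates to the range [0, 1] and Normal(0, σ) denotes Gaussian noise:

final_accuracy = clip(popcount(arch) / DIM + Normal(0, NOISE_STDEV))
early_accuracy = clip(final_accuracy / REDUCTION_FACTOR + Normal(0, EARLY_SIGNAL_NOISE))

Here popcount(arch) is the number of 1s in the architecture's bit-string, and DIM, NOISE_STDEV, REDUCTION_FACTOR, and EARLY_SIGNAL_NOISE are the constants defined below.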
In [0]:
import random

DIM = 100 # Number of bits in the bit strings (i.e. the "models").
NOISE_STDEV = 0.01 # Standard deviation of the simulated training noise.
EARLY_SIGNAL_NOISE = 0.005 # Standard deviation of the noise added to early
# observations.
REDUCTION_FACTOR = 100.0 # The factor by which the number of train steps is
# reduced for early observations.
class Model(object):
"""A class representing a model.
Attributes:
arch: the architecture as an int representing a bit-string of length `DIM`.
As a result, the integers are required to be less than `2**DIM`.
observed_accuracy: the simulated validation accuracy observed for the model
during the search. This may be either the accuracy after training for
the maximum number of steps or the accuracy after training for 1/100 the
maximum number of steps.
true_accuracy: the simulated validation accuracy after the maximum train
steps.
"""
def __init__(self):
self.arch = None
self.observed_accuracy = None
self.true_accuracy = None
def get_final_accuracy(arch):
"""Simulates training for the maximum number of steps and then evaluating.
Args:
arch: the architecture as an int representing a bit-string.
"""
accuracy = float(_sum_bits(arch)) / float(DIM)
accuracy += random.gauss(mu=0.0, sigma=NOISE_STDEV)
accuracy = 0.0 if accuracy < 0.0 else accuracy
accuracy = 1.0 if accuracy > 1.0 else accuracy
return accuracy
def get_early_accuracy(final_accuracy):
"""Simulates training for 1/100 the maximum steps and then evaluating.
Args:
final_accuracy: the accuracy of the model if trained for the maximum number
of steps.
"""
observed_accuracy = final_accuracy/REDUCTION_FACTOR + random.gauss(mu=0,
sigma=EARLY_SIGNAL_NOISE)
observed_accuracy = 0.0 if observed_accuracy < 0.0 else observed_accuracy
observed_accuracy = 1.0 if observed_accuracy > 1.0 else observed_accuracy
return observed_accuracy
def _sum_bits(arch):
"""Returns the number of 1s in the bit string.
Args:
arch: an int representing the bit string.
"""
total = 0
for _ in range(DIM):
total += arch & 1
arch = (arch >> 1)
return total
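As a quick sanity check (an optional addition, not part of the original experiments), we can evaluate both fitness functions on an architecture with exactly half of its bits set: the final accuracy should land near 0.5 and the early accuracy near 0.5 / REDUCTION_FACTOR = 0.005.
In [0]:
# Optional sanity check: an architecture with DIM / 2 bits set should have a
# final accuracy close to 0.5 and an early accuracy close to 0.005.
example_arch = (1 << (DIM // 2)) - 1  # Bit-string with exactly DIM / 2 ones.
example_final = get_final_accuracy(example_arch)
example_early = get_early_accuracy(example_final)
print("final accuracy: %.4f" % example_final)
print("early accuracy: %.4f" % example_early)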
In [0]:
import random
def random_architecture():
"""Returns a random architecture (bit-string) represented as an int."""
return random.randint(0, 2**DIM - 1)
def mutate_arch(parent_arch):
"""Computes the architecture for a child of the given parent architecture.
Args:
parent_arch: an int representing the architecture (bit-string) of the
parent.
Returns:
An int representing the architecture (bit-string) of the child.
"""
position = random.randint(0, DIM - 1) # Index of the bit to flip.
# Flip the bit at position `position` in `child_arch`.
child_arch = parent_arch ^ (1 << position)
return child_arch
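As another small, optional check, a child produced by mutate_arch should differ from its parent in exactly one bit, which we can verify by counting the bits set in the XOR of the two bit-strings.
In [0]:
# Optional check: a mutated child differs from its parent in exactly one bit.
parent_arch = random_architecture()
child_arch = mutate_arch(parent_arch)
differing_bits = _sum_bits(parent_arch ^ child_arch)
print("bits changed by mutation: %d" % differing_bits)  # Expected: 1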
Here we implement both plain evolution and evolution + Progressive Dynamic Hurdles.
In [0]:
import collections
import random
import copy
def plain_evolution(cycles, population_size, sample_size, early_observation):
"""Plain evolution.
Args:
cycles: the number of cycles the search is run for.
population_size: the size of the population.
sample_size: the size of the sample for both parent selection and killing.
early_observation: boolean. Whether or not we observe the models early by
training them for only 1/100th the maximum number of train steps before
evaluating.
"""
population = collections.deque()
history = [] # Not used by the algorithm, only used to report results.
# Initialize the population with random models.
while len(population) < population_size:
model = Model()
model.arch = random_architecture()
model.true_accuracy = get_final_accuracy(model.arch)
# If we are observing early, get the early accuracy that corresponds to the
# true_accuracy. Else, we are training each model for the maximum number of
# steps and so the observed_accuracy is the true_accuracy.
if early_observation:
model.observed_accuracy = get_early_accuracy(model.true_accuracy)
else:
model.observed_accuracy = model.true_accuracy
population.append(model)
history.append(model)
# Carry out evolution in cycles. Each cycle produces a model and removes
# another.
while len(history) < cycles:
# Sample randomly chosen models from the current population.
sample = random.sample(population, sample_size)
# The parent is the best model in the samples, according to their observed
# accuracy.
parent = max(sample, key=lambda i: i.observed_accuracy)
# Create the child model and store it.
child = Model()
child.arch = mutate_arch(parent.arch)
child.true_accuracy = get_final_accuracy(child.arch)
# If we are observing early, get the early accuracy that corresponds to the
# true_accuracy. Else, we are training each model for the maximum number of
# steps and so the observed_accuracy is the true_accuracy.
if early_observation:
child.observed_accuracy = get_early_accuracy(child.true_accuracy)
else:
child.observed_accuracy = child.true_accuracy
# Choose model to kill.
sample_indexes = random.sample(range(len(population)), sample_size)
min_fitness = float("inf")
kill_index = population_size
for sample_index in sample_indexes:
if population[sample_index].observed_accuracy < min_fitness:
min_fitness = population[sample_index].observed_accuracy
kill_index = sample_index
# Replace victim with child.
population[kill_index] = child
history.append(child)
return history, population
def pdh_evolution(train_resources, population_size, sample_size):
"""Evolution with PDH.
Args:
train_resources: the resources allotted for training. An early observation
costs 1, while a maximum train step observation costs 100.
population_size: the size of the population.
sample_size: the size of the sample for both parent selection and killing.
"""
population = collections.deque()
history = [] # Not used by the algorithm, only used to report results.
resources_used = 0 # The number of resource units used.
# Initialize the population with random models.
while len(population) < population_size:
model = Model()
model.arch = random_architecture()
model.true_accuracy = get_final_accuracy(model.arch)
# Always initialize with the early observation, since no hurdle has been
# established.
model.observed_accuracy = get_early_accuracy(model.true_accuracy)
population.append(model)
history.append(model)
# Since we are only performing an early observation, we are only consuming
# 1 resource unit.
resources_used += 1
# Carry out evolution in cycles. Each cycle produces a model and removes
# another.
hurdle = None
while resources_used < train_resources:
# Sample randomly chosen models from the current population.
sample = random.sample(population, sample_size)
# The parent is the best model in the sample, according to the observed
# accuracy.
parent = max(sample, key=lambda i: i.observed_accuracy)
# Create the child model and store it.
child = Model()
child.arch = mutate_arch(parent.arch)
child.true_accuracy = get_final_accuracy(child.arch)
# Once the hurdle has been established, a model is trained for the maximum
# amount of train steps if it overcomes the hurdle value. Otherwise, it
# only trains for the lesser amount of train steps.
if hurdle is not None:
child.observed_accuracy = get_early_accuracy(child.true_accuracy)
# Performing the early observation costs 1 resource unit.
resources_used += 1
if child.observed_accuracy > hurdle:
child.observed_accuracy = child.true_accuracy
# Now that the model has trained longer, we consume additional
# resource units.
resources_used += REDUCTION_FACTOR - 1
else:
child.observed_accuracy = get_early_accuracy(child.true_accuracy)
# Since we are only performing an early observation, we are only consuming
# 1 resource unit.
resources_used += 1
# Choose model to kill.
sample_indexes = random.sample(range(len(population)), sample_size)
min_fitness = float("inf")
kill_index = population_size
for sample_index in sample_indexes:
if population[sample_index].observed_accuracy < min_fitness:
min_fitness = population[sample_index].observed_accuracy
kill_index = sample_index
# Replace victim with child.
population[kill_index] = child
history.append(child)
# Create a hurdle, splitting resources such that the number of models
# trained before and after the hurdle are approximately even. Here, our
# approximation is assuming that every model after the hurdle trains for the
# maximum number of steps.
if hurdle is None and resources_used >= int(train_resources/REDUCTION_FACTOR):
hurdle = 0
for model in population:
hurdle += model.observed_accuracy
hurdle /= len(population)
return history, population
In [0]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
TOTAL_RESOURCES = 10000 # Total number of resource units.
POPULATION_SIZE = 100 # The size of the population.
SAMPLE_SIZE = 10 # The size of the subpopulation used for selecting parents
# and kill targets.
def graph_values(values, title, xlim, ylim):
plt.figure()
sns.set_style('white')
xvalues = range(len(values))
yvalues = values
ax = plt.gca()
dot_size = int(TOTAL_RESOURCES / xlim)
ax.scatter(
xvalues, yvalues, marker='.', facecolor=(0.0, 0.0, 0.0),
edgecolor=(0.0, 0.0, 0.0), linewidth=1, s=dot_size)
ax.xaxis.set_major_locator(ticker.LinearLocator(numticks=2))
ax.xaxis.set_major_formatter(ticker.ScalarFormatter())
ax.yaxis.set_major_locator(ticker.LinearLocator(numticks=2))
ax.yaxis.set_major_formatter(ticker.ScalarFormatter())
ax.set_title(title, fontsize=20)
fig = plt.gcf()
fig.set_size_inches(8, 6)
fig.tight_layout()
ax.tick_params(
axis='x', which='both', bottom=True, top=False, labelbottom=True,
labeltop=False, labelsize=14, pad=10)
ax.tick_params(
axis='y', which='both', left=True, right=False, labelleft=True,
labelright=False, labelsize=14, pad=5)
plt.xlabel('Number of Models Evaluated', labelpad=-16, fontsize=16)
plt.ylabel('Accuracy', labelpad=-30, fontsize=16)
plt.xlim(0, xlim + .05)
plt.ylim(0, ylim + .05)
sns.despine()
def graph_history(history):
observed_accuracies = [i.observed_accuracy for i in history]
true_accuracies = [i.true_accuracy for i in history]
graph_values(observed_accuracies, "Observed Accuracy",
xlim=len(history), ylim=max(observed_accuracies))
graph_values(true_accuracies, "True Accuracy",
xlim=len(history), ylim=max(true_accuracies))
Plain Evolution: Early Observation
First, we run plain evolution with early observations. We get to observe many models (10K), but the accuracy signal, while still useful, is noisy, which hurts the performance of the search.
In [9]:
history, _ = plain_evolution(
cycles=TOTAL_RESOURCES, population_size=POPULATION_SIZE,
sample_size=SAMPLE_SIZE,
early_observation=True)
graph_history(history)
Plain Evolution: Maximum Step Observation
Next, we run plain evolution with observations taken after each model has been trained for the maximum number of steps. The signal here is perfect, with the observed accuracy matching the true accuracy, but we see very few models (only TOTAL_RESOURCES / REDUCTION_FACTOR = 100) since we are controlling for the number of resources.
In [13]:
history, _ = plain_evolution(
cycles=TOTAL_RESOURCES/REDUCTION_FACTOR, population_size=POPULATION_SIZE,
sample_size=SAMPLE_SIZE, early_observation=False)
graph_history(history)
Progressive Dynamic Hurdles
Finally, we run Progressive Dynamic Hurdles (PDH). By establishing a hurdle, PDH is able to utilize early observations to filter out flagrantly bad models, training only the most promising models for the high-cost maximum number of train steps. This perfect signal clarifies which of these promising models are truly the best and improves the search by generating candidates from the best parents.
In [15]:
history, _ = pdh_evolution(train_resources=TOTAL_RESOURCES,
population_size=POPULATION_SIZE,
sample_size=SAMPLE_SIZE)
graph_history(history)
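As a rough additional check (not part of the original notebook), we can count how many of the models evaluated above cleared the hurdle and were therefore trained for the maximum number of steps. A model was fully trained exactly when its observed accuracy equals its true accuracy, so this split shows how PDH spends most of its expensive training on a small set of promising candidates.
In [0]:
# Optional check: models that cleared the hurdle have observed_accuracy equal
# to true_accuracy; all other models only received the cheap early observation.
fully_trained = sum(
    1 for model in history if model.observed_accuracy == model.true_accuracy)
print("fully trained models: %d" % fully_trained)
print("early-observation-only models: %d" % (len(history) - fully_trained))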
Mean Fitness Comparison
To demonstrate the effectiveness of Progressive Dynamic Hurdles, we compare the mean top fitness of the three configurations, with each algorithm run 500 times.
In [16]:
import numpy as np
num_trials = 500
print("===========================")
print("Mean Top Fitness Comparison")
print("===========================")
max_fitnesses = []
for _ in range(num_trials):
_, population = plain_evolution(
cycles=TOTAL_RESOURCES, population_size=POPULATION_SIZE,
sample_size=SAMPLE_SIZE,
early_observation=True)
# Assume all models in the final population are fully evaluated
max_fitness = max([indiv.true_accuracy for indiv in population])
max_fitnesses.append(max_fitness)
max_fitnesses = np.array(max_fitnesses)
print("Early Observation Plain Evolution: %.4s ± %.4s" %
(np.mean(max_fitnesses), np.std(max_fitnesses)))
max_fitnesses = []
for _ in range(num_trials):
_, population = plain_evolution(
cycles=TOTAL_RESOURCES/REDUCTION_FACTOR, population_size=POPULATION_SIZE,
sample_size=SAMPLE_SIZE,
early_observation=False)
# Assume all models in the final population are fully evaluated
max_fitness = max([indiv.true_accuracy for indiv in population])
max_fitnesses.append(max_fitness)
max_fitnesses = np.array(max_fitnesses)
print("Max Step Observation Plain Evolution: %.4s ± %.4s" %
(np.mean(max_fitnesses), np.std(max_fitnesses)))
max_fitnesses = []
for _ in range(num_trials):
_, population = pdh_evolution(train_resources=TOTAL_RESOURCES,
population_size=POPULATION_SIZE,
sample_size=SAMPLE_SIZE)
# Assume all models in the final population are fully evaluated
max_fitness = max([indiv.true_accuracy for indiv in population])
max_fitnesses.append(max_fitness)
max_fitnesses = np.array(max_fitnesses)
print("Progressive Dynamic Hurdles: %.4s ± %.4s" %
(np.mean(max_fitnesses), np.std(max_fitnesses)))