In [0]:
#@title Imports
! pip install --quiet git+https://github.com/deepmind/bsuite
import warnings
from bsuite.experiments import summary_analysis
from bsuite.logging import csv_load
from bsuite.logging import sqlite_load
import numpy as np
import pandas as pd
import plotnine as gg
pd.options.mode.chained_assignment = None
gg.theme_set(gg.theme_bw(base_size=16, base_family='serif'))
gg.theme_update(figure_size=(12, 8), panel_spacing_x=0.5, panel_spacing_y=0.5)
warnings.filterwarnings('ignore')
Load your experiments below. We recommend a maximum of 5 result sets, for clarity of analysis.
The input to the load_bsuite function is a dict that maps from an experiment name of your choosing to the result path.
For an experiment that used CSV logging, this would map to the directory containing the results. For SQLite logging, this would map to the database file for that experiment.
In [0]:
#@title loading results from local data:
experiments = {} # Add results here
DF, SWEEP_VARS = sqlite_load.load_bsuite(experiments)
# Or
# DF, SWEEP_VARS = csv_load.load_bsuite(experiments)
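For illustration, here is a minimal sketch of what that dict might look like. The agent names and paths below are hypothetical placeholders, not real results; substitute your own runs and pick the loader matching your logging backend.
In [0]:
#@title example: filling in the experiments dict (hypothetical paths)
# Placeholder names and paths only -- replace with your own runs.
# With SQLite logging, each value is that experiment's database file:
sqlite_experiments = {
    'dqn': '/path/to/dqn/bsuite.db',
    'random': '/path/to/random/bsuite.db',
}
# With CSV logging, each value is the directory containing that experiment's CSVs:
csv_experiments = {
    'dqn': '/path/to/dqn_results/',
    'random': '/path/to/random_results/',
}
# e.g. DF, SWEEP_VARS = sqlite_load.load_bsuite(sqlite_experiments)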
In [0]:
#@title overall score as radar plot (double-click to show/hide code)
BSUITE_SCORE = summary_analysis.bsuite_score(DF, SWEEP_VARS)
BSUITE_SUMMARY = summary_analysis.ave_score_by_tag(BSUITE_SCORE, SWEEP_VARS)
__radar_fig__ = summary_analysis.bsuite_radar_plot(BSUITE_SUMMARY, SWEEP_VARS)
Parsing the plot above:
In [0]:
#@title plotting overall score as bar (double-click to show/hide code)
summary_analysis.bsuite_bar_plot(BSUITE_SCORE, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title compare agent performance on each challenge (double-click to show/hide code)
summary_analysis.bsuite_bar_plot_compare(BSUITE_SCORE, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title Import experiment-specific analysis
from bsuite.experiments.bandit import analysis as bandit_analysis
from bsuite.experiments.bandit_noise import analysis as bandit_noise_analysis
from bsuite.experiments.bandit_scale import analysis as bandit_scale_analysis
from bsuite.experiments.cartpole import analysis as cartpole_analysis
from bsuite.experiments.cartpole_noise import analysis as cartpole_noise_analysis
from bsuite.experiments.cartpole_scale import analysis as cartpole_scale_analysis
from bsuite.experiments.cartpole_swingup import analysis as cartpole_swingup_analysis
from bsuite.experiments.catch import analysis as catch_analysis
from bsuite.experiments.catch_noise import analysis as catch_noise_analysis
from bsuite.experiments.catch_scale import analysis as catch_scale_analysis
from bsuite.experiments.deep_sea import analysis as deep_sea_analysis
from bsuite.experiments.deep_sea_stochastic import analysis as deep_sea_stochastic_analysis
from bsuite.experiments.discounting_chain import analysis as discounting_chain_analysis
from bsuite.experiments.memory_len import analysis as memory_len_analysis
from bsuite.experiments.memory_size import analysis as memory_size_analysis
from bsuite.experiments.mnist import analysis as mnist_analysis
from bsuite.experiments.mnist_noise import analysis as mnist_noise_analysis
from bsuite.experiments.mnist_scale import analysis as mnist_scale_analysis
from bsuite.experiments.mountain_car import analysis as mountain_car_analysis
from bsuite.experiments.mountain_car_noise import analysis as mountain_car_noise_analysis
from bsuite.experiments.mountain_car_scale import analysis as mountain_car_scale_analysis
from bsuite.experiments.umbrella_distract import analysis as umbrella_distract_analysis
from bsuite.experiments.umbrella_length import analysis as umbrella_length_analysis
A simple independent-armed bandit problem.
Standard analysis uses the logged episode and total_regret.
In [0]:
#@title parsing data
bandit_df = DF[DF.bsuite_env == 'bandit'].copy()
summary_analysis.plot_single_experiment(BSUITE_SCORE, 'bandit', SWEEP_VARS).draw();
In [0]:
#@title plot average regret through learning (lower is better)
bandit_analysis.plot_learning(bandit_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title plot performance by seed (higher is better)
bandit_analysis.plot_seeds(bandit_df, SWEEP_VARS).draw();
The "hello world" of deep learning, now as a contextual bandit.
Standard analysis uses the logged episode and total_regret.
In [0]:
#@title parsing data
mnist_df = DF[DF.bsuite_env == 'mnist'].copy()
summary_analysis.plot_single_experiment(BSUITE_SCORE, 'mnist', SWEEP_VARS).draw();
In [0]:
#@title plot average regret through learning (lower is better)
mnist_analysis.plot_learning(mnist_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title plot performance by seed (higher is better)
mnist_analysis.plot_seeds(mnist_df, SWEEP_VARS).draw();
Parsing the plot above:
DeepMind's internal "hello world" for RL agents.
Standard analysis uses the logged episode and total_regret.
In [0]:
#@title parsing data
catch_df = DF[DF.bsuite_env == 'catch'].copy()
summary_analysis.plot_single_experiment(BSUITE_SCORE, 'catch', SWEEP_VARS).draw();
In [0]:
#@title plot average regret through learning (lower is better)
catch_analysis.plot_learning(catch_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title plot performance by seed (higher is better)
catch_analysis.plot_seeds(catch_df, SWEEP_VARS).draw();
Parsing the plot above:
A classic benchmark problem in RL. The agent controls an underpowered car and must drive it out of a valley.
Standard analysis uses the logged episode and total_regret.
In [0]:
#@title parsing data
mountain_car_df = DF[DF.bsuite_env == 'mountain_car'].copy()
summary_analysis.plot_single_experiment(BSUITE_SCORE, 'mountain_car', SWEEP_VARS).draw();
In [0]:
#@title plot average regret through learning (lower is better)
mountain_car_analysis.plot_learning(mountain_car_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title plot performance by seed (higher is better)
mountain_car_analysis.plot_seeds(mountain_car_df, SWEEP_VARS).draw();
Parsing the plot above:
A classic benchmark problem in RL. The agent controls a cart on a frictionless plane.
Standard analysis uses the logged episode and total_regret.
In [0]:
#@title parsing data
cartpole_df = DF[DF.bsuite_env == 'cartpole'].copy()
summary_analysis.plot_single_experiment(BSUITE_SCORE, 'cartpole', SWEEP_VARS).draw();
In [0]:
#@title plot average regret through learning (lower is better)
cartpole_analysis.plot_learning(cartpole_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title plot performance by seed (higher is better)
cartpole_analysis.plot_seeds(cartpole_df, SWEEP_VARS).draw();
Parsing the plot above:
To investigate the robustness of RL agents to noisy rewards, we repeat the "basic" experiments under differing levels of Gaussian noise.
This time we allocate the 20 different seeds across 5 levels of Gaussian noise $N(0, \sigma^2)$ for $\sigma$ = noise_scale = $[0.1, 0.3, 1, 3, 10]$ with 4 seeds each.
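As a quick sanity check on your own results, the sketch below counts the runs at each noise level. It assumes the loaded dataframe carries the sweep parameter as a noise_scale column (as in the bsuite *_noise experiments); if your sweep names it differently, adjust accordingly.
In [0]:
#@title optional: check the noise_scale allocation in your results
# Assumes DF has 'noise_scale' and 'bsuite_id' columns for the *_noise runs.
noise_check_df = DF[DF.bsuite_env == 'bandit_noise']
# Expect 4 distinct bsuite_ids (seeds) per noise level.
print(noise_check_df.groupby('noise_scale').bsuite_id.nunique())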
A simple independent-armed bandit problem.
Standard analysis uses the logged episode and total_regret.
In [0]:
#@title parsing data
bandit_noise_df = DF[DF.bsuite_env == 'bandit_noise'].copy()
summary_analysis.plot_single_experiment(BSUITE_SCORE, 'bandit_noise', SWEEP_VARS).draw();
In [0]:
#@title average regret over learning (lower is better)
bandit_noise_analysis.plot_average(bandit_noise_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title average regret through learning (lower is better)
bandit_noise_analysis.plot_learning(bandit_noise_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title plot performance by seed (higher is better)
bandit_noise_analysis.plot_seeds(bandit_noise_df, SWEEP_VARS).draw();
Parsing the plot above:
The "hello world" of deep learning, now as a contextual bandit.
Standard analysis uses the logged episode and total_regret.
In [0]:
#@title parsing data
mnist_noise_df = DF[DF.bsuite_env == 'mnist_noise'].copy()
summary_analysis.plot_single_experiment(BSUITE_SCORE, 'mnist_noise', SWEEP_VARS).draw();
In [0]:
#@title average regret over learning (lower is better)
mnist_noise_analysis.plot_average(mnist_noise_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title average regret through learning (lower is better)
mnist_noise_analysis.plot_learning(mnist_noise_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title plot performance by seed (higher is better)
mnist_noise_analysis.plot_seeds(mnist_noise_df, SWEEP_VARS).draw();
Parsing the plot above:
DeepMind's internal "hello world" for RL agents.
Standard analysis uses the logged episode and total_regret.
In [0]:
#@title parsing data
catch_noise_df = DF[DF.bsuite_env == 'catch_noise'].copy()
summary_analysis.plot_single_experiment(BSUITE_SCORE, 'catch_noise', SWEEP_VARS).draw();
In [0]:
#@title average regret over learning (lower is better)
catch_noise_analysis.plot_average(catch_noise_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title average regret through learning (lower is better)
catch_noise_analysis.plot_learning(catch_noise_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title plot performance by seed (higher is better)
catch_noise_analysis.plot_seeds(catch_noise_df, SWEEP_VARS).draw();
Parsing the plot above:
A classic benchmark problem in RL. The agent controls an underpowered car and must drive it out of a valley.
Standard analysis uses the logged episode and total_regret.
In [0]:
#@title parsing data
mountain_car_noise_df = DF[DF.bsuite_env == 'mountain_car_noise'].copy()
summary_analysis.plot_single_experiment(BSUITE_SCORE, 'mountain_car_noise', SWEEP_VARS).draw();
In [0]:
#@title average regret over learning (lower is better)
mountain_car_noise_analysis.plot_average(mountain_car_noise_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title average regret through learning (lower is better)
mountain_car_noise_analysis.plot_learning(mountain_car_noise_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title plot performance by seed (higher is better)
mountain_car_noise_analysis.plot_seeds(mountain_car_noise_df, SWEEP_VARS).draw();
Parsing the plot above:
A classic benchmark problem in RL. The agent controls a cart on a frictionless plane.
Standard analysis uses the logged episode and total_regret.
In [0]:
#@title parsing data
cartpole_noise_df = DF[DF.bsuite_env == 'cartpole_noise'].copy()
summary_analysis.plot_single_experiment(BSUITE_SCORE, 'cartpole_noise', SWEEP_VARS).draw();
In [0]:
#@title average regret over learning (lower is better)
cartpole_noise_analysis.plot_average(cartpole_noise_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title average regret through learning (lower is better)
cartpole_noise_analysis.plot_learning(cartpole_noise_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title plot performance by seed (higher is better)
cartpole_noise_analysis.plot_seeds(cartpole_noise_df, SWEEP_VARS).draw();
Parsing the plot above:
To investigate the robustness of RL agents to the scale of rewards, we repeat the "basic" experiments under differing levels of reward rescaling.
This time we allocate the 20 different seeds across 5 levels of reward_scale = $[0.1, 0.3, 1, 3, 10]$ with 4 seeds each.
To keep the statistics comparable across scales, we report the rescaled regret, i.e. regret divided by reward_scale.
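For concreteness, here is a minimal sketch of that rescaling done by hand. It assumes the loaded dataframe exposes total_regret and the reward_scale sweep column for the *_scale experiments; the bsuite analysis functions below already apply this correction for you.
In [0]:
#@title optional: rescaled regret by hand (illustrative sketch)
# Assumes 'total_regret' and 'reward_scale' columns are present for *_scale runs.
scale_check_df = DF[DF.bsuite_env == 'bandit_scale'].copy()
scale_check_df['rescaled_regret'] = (
    scale_check_df.total_regret / scale_check_df.reward_scale)
# Average rescaled regret at each reward scale.
print(scale_check_df.groupby('reward_scale').rescaled_regret.mean())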
A simple independent-armed bandit problem.
Standard analysis uses the logged episode and total_regret.
In [0]:
#@title parsing data
bandit_scale_df = DF[DF.bsuite_env == 'bandit_scale'].copy()
summary_analysis.plot_single_experiment(BSUITE_SCORE, 'bandit_scale', SWEEP_VARS).draw();
In [0]:
#@title average regret over learning (lower is better)
bandit_scale_analysis.plot_average(bandit_scale_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title average regret through learning (lower is better)
bandit_scale_analysis.plot_learning(bandit_scale_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title plot performance by seed (higher is better)
bandit_scale_analysis.plot_seeds(bandit_scale_df, SWEEP_VARS).draw();
Parsing the plot above:
The "hello world" of deep learning, now as a contextual bandit.
Standard analysis uses the logged episode and total_regret.
In [0]:
#@title parsing data
mnist_scale_df = DF[DF.bsuite_env == 'mnist_scale'].copy()
summary_analysis.plot_single_experiment(BSUITE_SCORE, 'mnist_scale', SWEEP_VARS).draw();
In [0]:
#@title average regret over learning (lower is better)
mnist_scale_analysis.plot_average(mnist_scale_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title average regret through learning (lower is better)
mnist_scale_analysis.plot_learning(mnist_scale_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title plot performance by seed (higher is better)
mnist_scale_analysis.plot_seeds(mnist_scale_df, SWEEP_VARS).draw();
Parsing the plot above:
DeepMind's internal "hello world" for RL agents.
Standard analysis uses the logged episode and total_regret.
In [0]:
#@title parsing data
catch_scale_df = DF[DF.bsuite_env == 'catch_scale'].copy()
summary_analysis.plot_single_experiment(BSUITE_SCORE, 'catch_scale', SWEEP_VARS).draw();
In [0]:
#@title average regret over learning (lower is better)
catch_scale_analysis.plot_average(catch_scale_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title average regret through learning (lower is better)
catch_scale_analysis.plot_learning(catch_scale_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title plot performance by seed (higher is better)
catch_scale_analysis.plot_seeds(catch_scale_df, SWEEP_VARS).draw();
Parsing the plot above:
A classic benchmark problem in RL. The agent controls an underpowered car and must drive it out of a valley.
Standard analysis uses the logged episode and total_regret.
In [0]:
#@title parsing data
mountain_car_scale_df = DF[DF.bsuite_env == 'mountain_car_scale'].copy()
summary_analysis.plot_single_experiment(BSUITE_SCORE, 'mountain_car_scale', SWEEP_VARS).draw();
In [0]:
#@title average regret over learning (lower is better)
mountain_car_scale_analysis.plot_average(mountain_car_scale_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title average regret through learning (lower is better)
mountain_car_scale_analysis.plot_learning(mountain_car_scale_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title plot performance by seed (higher is better)
mountain_car_scale_analysis.plot_seeds(mountain_car_scale_df, SWEEP_VARS).draw();
Parsing the plot above:
A classic benchmark problem in RL. The agent controls a cart on a frictionless plane.
Standard analysis uses the logged episode and total_regret.
In [0]:
#@title parsing data
cartpole_scale_df = DF[DF.bsuite_env == 'cartpole_scale'].copy()
summary_analysis.plot_single_experiment(BSUITE_SCORE, 'cartpole_scale', SWEEP_VARS).draw();
In [0]:
#@title average regret over learning (lower is better)
cartpole_scale_analysis.plot_average(cartpole_scale_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title average regret through learning (lower is better)
cartpole_scale_analysis.plot_learning(cartpole_scale_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title plot performance by seed (higher is better)
cartpole_scale_analysis.plot_seeds(cartpole_scale_df, SWEEP_VARS).draw();
Parsing the plot above:
Scalable chain domains that test for deep exploration.
The environment is an N x N grid with falling blocks, similar to catch, except that the block always starts in the top-left corner. At each timestep the agent can move the block "left" or "right"; moving "right" incurs a small cost, while moving "left" is free. However, the agent receives a large reward only by choosing "right" N times in a row and reaching the bottom-right corner. This is the single rewarding policy; all other policies receive zero or negative return, making this a very difficult exploration problem.
Standard analysis uses the logged episode and total_return.
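To see the exploration difficulty concretely: a uniformly random policy picks "right" at every one of the N steps with probability $2^{-N}$, so the expected number of episodes before a dithering agent first sees the reward grows exponentially with N. A small illustrative sketch:
In [0]:
#@title illustration: chance a random policy finds the reward in deep_sea
import numpy as np
sizes = np.array([10, 20, 30, 40, 50])
p_reach = 0.5 ** sizes  # must pick 'right' at every one of the N steps
for n, p in zip(sizes, p_reach):
  print(f'N = {n}: P(reach reward) = {p:.2e}, ~{1 / p:.2e} episodes to see it once')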
In [0]:
#@title parsing data
deep_sea_df = DF[DF.bsuite_env == 'deep_sea'].copy()
deep_sea_plt = deep_sea_analysis.find_solution(deep_sea_df, SWEEP_VARS)
summary_analysis.plot_single_experiment(BSUITE_SCORE, 'deep_sea', SWEEP_VARS).draw();
In [0]:
#@title average regret by size through learning (lower is better)
deep_sea_analysis.plot_regret(deep_sea_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title scaling of learning time with deep_sea size (lower + more blue is better)
deep_sea_analysis.plot_scaling(deep_sea_plt, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title scaling of learning time with deep_sea size on log scale (lower + more blue is better)
deep_sea_analysis.plot_scaling_log(deep_sea_plt, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title plot performance by seed (higher is better)
deep_sea_analysis.plot_seeds(deep_sea_df, SWEEP_VARS).draw();
Parsing the plot above:
Scalable chain domains that test for deep exploration.
The environment is an N x N grid with falling blocks, similar to catch, except that the block always starts in the top-left corner. At each timestep the agent can move the block "left" or "right"; moving "right" incurs a small cost, while moving "left" is free. However, the agent receives a large reward only by choosing "right" N times in a row and reaching the bottom-right corner. This is the single rewarding policy; all other policies receive zero or negative return, making this a very difficult exploration problem.
The stochastic version of this domain transitions to the right only with probability (1 - 1/N) and adds N(0, 1) noise to the 'end' states of the chain.
Standard analysis uses the logged episode and total_return.
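Because each rightward transition only succeeds with probability (1 - 1/N), even the "always right" policy reaches the rewarding corner with probability $(1 - 1/N)^N$, which tends to $1/e \approx 0.37$ for large N. The sketch below simply evaluates this expression:
In [0]:
#@title illustration: success probability of the 'always right' policy
import numpy as np
for n in (10, 20, 30, 50):
  p_success = (1 - 1 / n) ** n  # all N rightward transitions must succeed
  print(f'N = {n}: P(reach rewarding corner) = {p_success:.3f}')
print(f'limit as N -> infinity: 1/e = {1 / np.e:.3f}')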
In [0]:
#@title parsing data
deep_sea_stochastic_df = DF[DF.bsuite_env == 'deep_sea_stochastic'].copy()
deep_sea_stochastic_plt = deep_sea_stochastic_analysis.find_solution(deep_sea_stochastic_df, SWEEP_VARS)
summary_analysis.plot_single_experiment(BSUITE_SCORE, 'deep_sea_stochastic', SWEEP_VARS).draw();
In [0]:
#@title average regret by size through learning (lower is better)
deep_sea_stochastic_analysis.plot_regret(deep_sea_stochastic_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title scaling of learning time with deep_sea_stochastic size (lower + more blue is better)
deep_sea_stochastic_analysis.plot_scaling(deep_sea_stochastic_plt, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title scaling of learning time with deep_sea_stochastic size on log scale (lower + more blue is better)
deep_sea_stochastic_analysis.plot_scaling_log(deep_sea_stochastic_plt, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title plot performance by seed (higher is better)
deep_sea_stochastic_analysis.plot_seeds(deep_sea_stochastic_df, SWEEP_VARS).draw();
Parsing the plot above:
A difficult cartpole swingup task with sparse rewards and a cost for moving. This domain is somewhat similar to "deep sea" but cannot be solved easily by tabular reinforcement learning algorithms.
The observation is [x, cos_theta, sin_theta, x_dot, theta_dot, x_central]. The parameter difficulty_scale acts as a scaling for the depth of exploration, similar to the "size" in deep sea.
To run this experiment:
Standard analysis uses the logged episode and total_return.
In [0]:
#@title parsing data
cartpole_swingup_df = DF[DF.bsuite_env == 'cartpole_swingup'].copy()
summary_analysis.plot_single_experiment(BSUITE_SCORE, 'cartpole_swingup', SWEEP_VARS).draw();
In [0]:
#@title scaling with difficulty scale (higher + more blue is better)
cartpole_swingup_analysis.plot_scale(cartpole_swingup_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title average regret through learning (lower is better)
cartpole_swingup_analysis.plot_learning(cartpole_swingup_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title plot performance by seed (higher is better)
cartpole_swingup_analysis.plot_seeds(cartpole_swingup_df, SWEEP_VARS).draw();
Parsing the plot above:
A stylized problem designed to highlight difficulties with temporal credit assignment and their scaling with the time horizon.
The experiment setup:
Standard analysis uses the logged episode, total_return, and total_regret.
In [0]:
#@title parsing data
umbrella_length_df = DF[DF.bsuite_env == 'umbrella_length'].copy()
summary_analysis.plot_single_experiment(BSUITE_SCORE, 'umbrella_length', SWEEP_VARS).draw();
In [0]:
#@title average regret after 10k episodes (lower is better)
umbrella_length_analysis.plot_scale(umbrella_length_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title average regret through learning (lower is better)
umbrella_length_analysis.plot_learning(umbrella_length_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title plot performance by seed (higher is better)
umbrella_length_analysis.plot_seeds(umbrella_length_df, SWEEP_VARS).draw();
Parsing the plot above:
A stylized problem designed to highlight difficulties with temporal credit assignment and their scaling with the time horizon.
The experiment setup:
Standard analysis uses the logged episode, total_return, and total_regret.
In [0]:
#@title parsing data
umbrella_distract_df = DF[DF.bsuite_env == 'umbrella_distract'].copy()
summary_analysis.plot_single_experiment(BSUITE_SCORE, 'umbrella_distract', SWEEP_VARS).draw();
In [0]:
#@title average regret after 10k episodes (lower is better)
umbrella_distract_analysis.plot_scale(umbrella_distract_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title average regret through learning (lower is better)
umbrella_distract_analysis.plot_learning(umbrella_distract_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title plot performance by seed (higher is better)
umbrella_distract_analysis.plot_seeds(umbrella_distract_df, SWEEP_VARS).draw();
Parsing the plot above:
A stylized problem designed to highlight an agent's ability to correctly maximize cumulative rewards without discounting bias.
The experiment setup:
Standard analysis uses the logged episode and total_return.
In [0]:
#@title parsing data
discounting_chain_df = DF[DF.bsuite_env == 'discounting_chain'].copy()
summary_analysis.plot_single_experiment(BSUITE_SCORE, 'discounting_chain', SWEEP_VARS).draw();
In [0]:
#@title average regret after 1k episodes (lower is better)
discounting_chain_analysis.plot_average(discounting_chain_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title average regret through learning (lower is better)
discounting_chain_analysis.plot_learning(discounting_chain_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title plot performance by seed (higher is better)
discounting_chain_analysis.plot_seeds(discounting_chain_df, SWEEP_VARS).draw();
Parsing the plot above:
A stylized T-maze problem designed to highlight an agent's ability to remember important information and use it to make good decisions.
The experiment setup:
Standard analysis uses the logged episode and total_return.
In [0]:
#@title parsing data
memory_len_df = DF[DF.bsuite_env == 'memory_len'].copy()
summary_analysis.plot_single_experiment(BSUITE_SCORE, 'memory_len', SWEEP_VARS).draw();
In [0]:
#@title memory scaling (lower + more blue is better)
memory_len_analysis.plot_scale(memory_len_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title average regret through learning (lower is better)
memory_len_analysis.plot_learning(memory_len_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title plot performance by seed (higher is better)
memory_len_analysis.plot_seeds(memory_len_df, SWEEP_VARS).draw();
Parsing the plot above:
A stylized T-maze problem designed to highlight an agent's ability to remember important information and use it to make good decisions.
The agent is given a query between 0 and num_bits-1 and must select the correct action corresponding to context[query].
The experiment setup:
Standard analysis uses the logged episode and total_return.
In [0]:
#@title parsing data
memory_size_df = DF[DF.bsuite_env == 'memory_size'].copy()
summary_analysis.plot_single_experiment(BSUITE_SCORE, 'memory_size', SWEEP_VARS).draw();
In [0]:
#@title memory scaling (lower + more blue is better)
memory_size_analysis.plot_scale(memory_size_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title average regret through learning (lower is better)
memory_size_analysis.plot_learning(memory_size_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
#@title plot performance by seed (higher is better)
memory_size_analysis.plot_seeds(memory_size_df, SWEEP_VARS).draw();
Parsing the plot above:
In [0]:
import os
from google.colab import files

# Save images required for the reports in an `images/` folder.
if not os.path.exists('images'):
  os.makedirs('images')
__radar_fig__.savefig('images/radar_plot.png', bbox_inches='tight')

# Compress the folder and download the archive.
!zip -r images.zip images > /dev/null
try:
  files.download('images.zip')
except Exception:  # Download only works when run interactively in Colab.
  pass