Smart random exploration

This experiment compares two setups: with and without smart start. Without smart start the agent starts every episode in the same state. With smart start the agent starts in the previously visited state that has the lowest kernel density estimate over all visited states.

The state with the lowest kernel density is selected according to $i^* = \underset{i \in \mathcal{D}}{\arg\min}\ \frac{1}{|\mathcal{D}|}\sum_{j \in \mathcal{D}} e^{-\sum_{d = 1}^{D}(i_d - j_d)^2 / C}$, where $\mathcal{D}$ is the set of visited states, $D$ is the state dimension and $C$ is a hyperparameter that controls the width of the kernel.
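
A minimal NumPy sketch of this selection rule is given below. The actual computation is done inside ReplayBufferTF.calc_trans_min_density, which may differ in detail; kde_min_state is a hypothetical helper, not part of drl.

import numpy as np

def kde_min_state(states, C):
    """Return the visited state with the lowest Gaussian kernel density.

    states: (N, D) array of visited states, C: kernel width hyperparameter.
    """
    # Pairwise squared distances between all visited states, shape (N, N)
    sq_dists = np.sum((states[:, None, :] - states[None, :, :]) ** 2, axis=-1)
    # Kernel density estimate of every state w.r.t. all visited states
    densities = np.mean(np.exp(-sq_dists / C), axis=1)
    return states[np.argmin(densities)]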


In [1]:
import numpy as np
import tensorflow as tf
from drl.replaybuffer import ReplayBufferTF
from drl.rrtexploration import Trajectory
from drl.exploration import OrnSteinUhlenbeckNoise
import time

In [7]:
import plotly
import plotly.offline as py
from plotly.graph_objs import *
from plotly import tools
plotly.offline.init_notebook_mode()

scene=Scene(
    xaxis=XAxis(
        gridcolor='rgb(255, 255, 255)',
        zerolinecolor='rgb(255, 255, 255)',
        showbackground=True,
        backgroundcolor='rgb(230, 230,230)'
    ),
    yaxis=YAxis(
        gridcolor='rgb(255, 255, 255)',
        zerolinecolor='rgb(255, 255, 255)',
        showbackground=True,
        backgroundcolor='rgb(230, 230,230)'
    ),
    zaxis=ZAxis(
        gridcolor='rgb(255, 255, 255)',
        zerolinecolor='rgb(255, 255, 255)',
        showbackground=True,
        backgroundcolor='rgb(230, 230,230)'
    )
)


Define environment

The environment is a square field in which a ball can execute continuous actions in the x and y directions.


In [8]:
class DoubleIntegrator(object):
    
    def __init__(self):
        self.viewer = None

        self.w, self.h = 50, 50
        self.plot_density_map = False

        self.plot_trajectory = False

        self.high_state = np.array([5., 5.])
        self.low_state = -self.high_state
        self.high_action = np.array([0.1, 0.1])
        self.low_action = -self.high_action

        self.action_dim = 2
        self.state_dim = 2
        
        self.state = None

    def reset(self, state=None):
        if state is None:
            self.state = np.array([0., 0.])
        else:
            self.state = state
        return self.get_ob()

    def step(self, a):
        s = self.state

        a = np.clip(np.array(a), self.low_action, self.high_action)
        ns = s + a
        self.state = np.clip(ns, self.low_state, self.high_state)

        terminal = False
        reward = 0.

        return self.get_ob(), reward, terminal, {}

    def get_ob(self):
        return self.state

    def render(self, mode='human', close=False):
        if close:
            if self.viewer is not None:
                self.viewer.close()
                self.viewer = None
            return

        if self.viewer is None:
            from gym.envs.classic_control import rendering
            self.viewer = rendering.Viewer(500, 500)
            self.viewer.set_bounds(-5.1, 5.1, -5.1, 5.1)

            if self.plot_density_map:
                self.init_plot_density_map()

            self.agent = rendering.make_circle(0.1)
            self.agent.set_color(.8, .3, .3)
            self.agent_transform = rendering.Transform()
            self.agent.add_attr(self.agent_transform)
            self.viewer.add_geom(self.agent)

        self.agent_transform.set_translation(self.state[0], self.state[1])

        return self.viewer.render(return_rgb_array=mode == 'rgb_array')

    def toggle_plot_density(self):
        self.plot_density_map = True

    def init_plot_density_map(self):
        from gym.envs.classic_control import rendering

        cw = 10.2 / self.w
        ch = 10.2 / self.h
        self.density_map = [[0 for x in range(self.w)] for y in range(self.h)]
        offset = 5.
        for x in range(self.w):
            for y in range(self.h):
                l, r, t, b = x * cw - offset, (x + 1) * cw - offset, (y + 1) * ch - offset, y * ch - offset
                v = [(l, b), (l, t), (r, t), (r, b)]
                polygon = rendering.FilledPolygon(v)
                polygon._color.vec4 = (0., 1., 0., 0.)
                self.density_map[x][y] = polygon
                self.viewer.add_geom(polygon)

    def update_density_map(self, values):
        if not self.plot_density_map:
            return False

        max_value = np.max(values)
        for x in range(self.w):
            for y in range(self.h):
                self.density_map[x][y]._color.vec4 = (0., 1., 0., values[x][y] / max_value)

    def toggle_plot_trajectories(self):
        self.plot_trajectory = True

    def add_trajectory(self, trajectory):
        from gym.envs.classic_control import rendering

        self.viewer.geoms.remove(self.agent)

        states = trajectory.get_states()
        for i in range(states.shape[0] - 1):
            line = rendering.make_polyline((states[i, :], states[i+1, :]))
            self.viewer.add_geom(line)

        self.viewer.add_geom(self.agent)

Initialization


In [9]:
C = 2.5 # Hyperparameter that controls the width of the kernel
buffer_size = 10000
batch_size = 64

num_episodes = 50
max_steps = 100
render_step = False
render_traj = True

# Ornstein Uhlenbeck noise parameters
mu =  0.
sigma = 0.15
theta = 0.2
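
The OrnSteinUhlenbeckNoise class from drl.exploration is used as the exploration policy. As a rough reference, below is a minimal sketch of a standard discrete-time Ornstein-Uhlenbeck process using the parameters above; the actual implementation in drl may differ (e.g. in its use of a time step), and SimpleOUNoise is a hypothetical name, not part of drl.

import numpy as np

class SimpleOUNoise(object):
    """Minimal Ornstein-Uhlenbeck process: mean-reverting Gaussian noise."""

    def __init__(self, action_dim, mu, sigma, theta):
        self.action_dim = action_dim
        self.mu, self.sigma, self.theta = mu, sigma, theta
        self.reset()

    def reset(self):
        # Start every episode from the mean
        self.x = np.ones(self.action_dim) * self.mu

    def sample(self):
        # Drift back towards mu plus Gaussian diffusion
        dx = self.theta * (self.mu - self.x) + self.sigma * np.random.randn(self.action_dim)
        self.x = self.x + dx
        return self.x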

With smart start


In [10]:
env = DoubleIntegrator()
sess = tf.Session()
buffer = ReplayBufferTF(sess, env.state_dim, env.high_state, C, buffer_size)
noise = OrnSteinUhlenbeckNoise(env.action_dim, mu, sigma, theta)
new_state = None
start = time.time()

if render_traj:
    env.toggle_plot_trajectories()
    env.reset()
    env.render()

for i in range(num_episodes):
    obs = env.reset(new_state)
    trajectory = Trajectory()
    
    for j in range(max_steps):
        if render_step:
            env.render()
            
        action = noise.sample()
        
        trajectory.add_node(obs, action)
        
        next_obs, reward, terminal, info = env.step(action)
        
        buffer.add(obs, action, reward, terminal, next_obs)
        
        obs = next_obs
    
    noise.reset()
    
    if render_traj:
        env.add_trajectory(trajectory)
        env.render()
    
    print("episode: ", i, ', time elapsed: ', (time.time() - start))
        
    new_state = buffer.calc_trans_min_density()[0]


episode:  0 , time elapsed:  0.44001126289367676
episode:  1 , time elapsed:  0.6223878860473633
episode:  2 , time elapsed:  0.7391800880432129
episode:  3 , time elapsed:  0.9060502052307129
episode:  4 , time elapsed:  1.1227834224700928
episode:  5 , time elapsed:  1.389697551727295
episode:  6 , time elapsed:  1.7233095169067383
episode:  7 , time elapsed:  2.158417224884033
episode:  8 , time elapsed:  2.657416582107544
episode:  9 , time elapsed:  3.3079681396484375
episode:  10 , time elapsed:  4.041877746582031
episode:  11 , time elapsed:  4.860556125640869
episode:  12 , time elapsed:  5.777691841125488
episode:  13 , time elapsed:  6.844168186187744
episode:  14 , time elapsed:  7.92838454246521
episode:  15 , time elapsed:  9.229430675506592
episode:  16 , time elapsed:  10.54719591140747
episode:  17 , time elapsed:  12.081791400909424
episode:  18 , time elapsed:  13.801276922225952
episode:  19 , time elapsed:  15.534603834152222
episode:  20 , time elapsed:  17.486205577850342
episode:  21 , time elapsed:  19.50451922416687
episode:  22 , time elapsed:  21.67298674583435
episode:  23 , time elapsed:  24.00963830947876
episode:  24 , time elapsed:  26.476879119873047
episode:  25 , time elapsed:  29.114882230758667
episode:  26 , time elapsed:  31.93135643005371
episode:  27 , time elapsed:  34.91714406013489
episode:  28 , time elapsed:  38.06967854499817
episode:  29 , time elapsed:  41.422435998916626
episode:  30 , time elapsed:  44.992023229599
episode:  31 , time elapsed:  48.745118618011475
episode:  32 , time elapsed:  52.698343992233276
episode:  33 , time elapsed:  56.818384408950806
episode:  34 , time elapsed:  61.205305337905884
episode:  35 , time elapsed:  65.84243607521057
episode:  36 , time elapsed:  70.64633369445801
episode:  37 , time elapsed:  75.66714072227478
episode:  38 , time elapsed:  80.95478844642639
episode:  39 , time elapsed:  86.54268836975098
episode:  40 , time elapsed:  92.31406760215759
episode:  41 , time elapsed:  98.33567452430725
episode:  42 , time elapsed:  104.60747647285461
episode:  43 , time elapsed:  111.24623250961304
episode:  44 , time elapsed:  118.03513193130493
episode:  45 , time elapsed:  125.10759162902832
episode:  46 , time elapsed:  132.43022894859314
episode:  47 , time elapsed:  140.06979823112488
episode:  48 , time elapsed:  148.0263364315033
episode:  49 , time elapsed:  156.24972772598267

Results

Surface plot showing the kernel density over the full state space.


In [11]:
x = np.linspace(-env.high_state[0], env.high_state[0])
y = np.linspace(-env.high_state[1], env.high_state[1])
xx, yy = np.meshgrid(x, y)

values = np.zeros((len(x), len(y)))
for i in range(len(x)):
    for j in range(len(y)):
        values[i, j] = buffer.calc_density(np.array([[xx[i, j], yy[i,j]]]))

In [12]:
surface = Surface(x=xx, y=yy, z=values)
data = Data([surface])

layout = Layout(
    title='Smart start',
    scene=scene
)

fig = Figure(data=data, layout=layout)
py.iplot(fig)


Without smart start


In [13]:
env_no = DoubleIntegrator()
start = time.time()
noise = OrnSteinUhlenbeckNoise(env_no.action_dim, mu, sigma, theta)

buffer_no = ReplayBufferTF(sess, env_no.state_dim, env_no.high_state, C, buffer_size)

if render_traj:
    env_no.toggle_plot_trajectories()
    env_no.reset()
    env_no.render()

for i in range(num_episodes):
    obs = env_no.reset()
    trajectory = Trajectory()
    
    for j in range(max_steps):
        if render_step:
            env_no.render()
            
        action = noise.sample()
        
        trajectory.add_node(obs, action)
        
        next_obs, reward, terminal, info = env_no.step(action)
        
        buffer_no.add(obs, action, reward, terminal, next_obs)
        
        obs = next_obs
        
    noise.reset()
    
    if render_traj:
        env_no.add_trajectory(trajectory)
        env_no.render()
    
    print("episode: ", i, ', time elapsed: ', (time.time() - start))


episode:  0 , time elapsed:  0.08132100105285645
episode:  1 , time elapsed:  0.11314678192138672
episode:  2 , time elapsed:  0.14796066284179688
episode:  3 , time elapsed:  0.17990326881408691
episode:  4 , time elapsed:  0.24651646614074707
episode:  5 , time elapsed:  0.2799382209777832
episode:  6 , time elapsed:  0.29745984077453613
episode:  7 , time elapsed:  0.3136453628540039
episode:  8 , time elapsed:  0.36397314071655273
episode:  9 , time elapsed:  0.4139726161956787
episode:  10 , time elapsed:  0.4467930793762207
episode:  11 , time elapsed:  0.48006200790405273
episode:  12 , time elapsed:  0.5135254859924316
episode:  13 , time elapsed:  0.5467574596405029
episode:  14 , time elapsed:  0.5801780223846436
episode:  15 , time elapsed:  0.6135048866271973
episode:  16 , time elapsed:  0.6468708515167236
episode:  17 , time elapsed:  0.6801984310150146
episode:  18 , time elapsed:  0.7135739326477051
episode:  19 , time elapsed:  0.7482626438140869
episode:  20 , time elapsed:  0.796992301940918
episode:  21 , time elapsed:  0.8472168445587158
episode:  22 , time elapsed:  0.8970737457275391
episode:  23 , time elapsed:  0.9471750259399414
episode:  24 , time elapsed:  0.9971723556518555
episode:  25 , time elapsed:  1.0472075939178467
episode:  26 , time elapsed:  1.0973103046417236
episode:  27 , time elapsed:  1.1476597785949707
episode:  28 , time elapsed:  1.1972618103027344
episode:  29 , time elapsed:  1.2473506927490234
episode:  30 , time elapsed:  1.29732346534729
episode:  31 , time elapsed:  1.347376823425293
episode:  32 , time elapsed:  1.39742112159729
episode:  33 , time elapsed:  1.4474704265594482
episode:  34 , time elapsed:  1.5141849517822266
episode:  35 , time elapsed:  1.5809047222137451
episode:  36 , time elapsed:  1.6476359367370605
episode:  37 , time elapsed:  1.7144384384155273
episode:  38 , time elapsed:  1.7810895442962646
episode:  39 , time elapsed:  1.8480517864227295
episode:  40 , time elapsed:  1.91450834274292
episode:  41 , time elapsed:  1.9812326431274414
episode:  42 , time elapsed:  2.047959327697754
episode:  43 , time elapsed:  2.1146740913391113
episode:  44 , time elapsed:  2.2481324672698975
episode:  45 , time elapsed:  2.314842700958252
episode:  46 , time elapsed:  2.398258924484253
episode:  47 , time elapsed:  2.48165225982666
episode:  48 , time elapsed:  2.5650634765625
episode:  49 , time elapsed:  2.6484737396240234

Results

Surface plot showing the kernel densities over the full state space.


In [14]:
x = np.linspace(-env.high_state[0], env.high_state[0])
y = np.linspace(-env.high_state[1], env.high_state[1])
xx, yy = np.meshgrid(x, y)

values_no = np.zeros((len(x), len(y)))
for i in range(len(x)):
    for j in range(len(y)):
        values_no[i, j] = buffer_no.calc_density(np.array([[xx[i, j], yy[i,j]]]))

In [15]:
surface = Surface(x=xx, y=yy, z=values_no)
data = Data([surface])

layout = Layout(
    title='Normal',
    scene=scene
)

fig = Figure(data=data, layout=layout)
py.iplot(fig)


Close environments


In [16]:
env.render(close=True)

In [17]:
env_no.render(close=True)

Discussion

Smart start explores the state space a lot better, as expected. There are two main issues though:

1. We assumed the agent can start at any previously visited state, i.e. at any state stored in the replay buffer, and that previous trajectories can be replayed to get there. In future work the goal is to reach the chosen state using iLQG.
2. The time needed to compute the kernel density estimates on the replay buffer is way too high. The computation does not scale: the number of pairwise kernel evaluations grows quadratically with the number of samples in the replay buffer. Two more efficient ways of choosing an exploration point are:
       a. Generate goals as in 'Automatic Goal Generation for Reinforcement Learning Agents, Held et al.'
       b. Store the temporal difference error for each transition in the replay buffer, order the buffer by TD-error and choose the transition with the highest TD-error as exploration point (a minimal sketch is given below).
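
A minimal sketch of option b, assuming the replay buffer can be extended to track a TD-error per transition; the function name and arguments below are hypothetical and not part of drl.

import numpy as np

def select_exploration_state(states, td_errors):
    """Pick the stored state whose transition has the largest absolute TD-error.

    states: (N, D) array of stored states, td_errors: (N,) array of TD-errors.
    """
    return states[np.argmax(np.abs(td_errors))]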