In [1]:
%matplotlib inline

In [2]:
import gym
import tensorflow as tf
import numpy as np
import math
from tqdm import tqdm, trange
import matplotlib.pyplot as plt
import random
import time

In [3]:
from experiencereplay import ExperienceReplay, PrioritizedExperienceReplay
from experiment import Experiment
from ddpg import DDPG
import nn
from exploration import EpsilonGreedyStrategy, OUStrategy
from ounoise import OUNoise

In [4]:
settings = dict(
    environment = 'MountainCarContinuous-v0',
    timesteps = 8000,
    max_replay_buffer_size = 100000,
    batch_size = 64,
    learning_start = 256,
    discount_factor = 0.99,

    actor_learning_rate=0.0001,
    q_learning_rate=0.001,
    
    actor_l2=None,
    q_l2=None,
    
    actor_target_approach_rate=0.99,
    q_target_approach_rate=0.99,
    
    train_updates_per_step = 10,
    priority_updates_per_step = 100,
    
    actor_net_layers = [256, 128],
    actor_net_activation_fn = tf.nn.elu,
    actor_bounded_output = True,
    q_net_layers = [128],
    q_net_embedding = 128,
    q_net_activation_fn = tf.nn.elu,
 
    environment_seed = 0,
    noise_seed= 0,
    
    gpu_memory_fraction = 0.1,
    
    render_environment = True,
    render_frequency = 10,
    render_start = 3500,
)

settings["experiment_path"] = "experiments/experiment_ddpg_{}_{}".format(settings["environment"], int(time.time()))
settings["actor_tf_optimizer"] = tf_optimizer = tf.train.AdamOptimizer(settings["actor_learning_rate"])
settings["q_tf_optimizer"] = tf_optimizer = tf.train.AdamOptimizer(settings["q_learning_rate"])

In [5]:
def preprocess_state(observation):
    state = np.array(observation)
    if settings["environment"] == "MountainCarContinuous-v0":
        # Rescale the velocity component (range [-0.07, 0.07]) by 10 so it is
        # on a similar scale to the position component (range [-1.2, 0.6]).
        state[1] = state[1] * 10
    return state

def preprocess_reward(reward):
    return reward

In [6]:
env = gym.make(settings["environment"])
env.seed(settings["environment_seed"])

observation = preprocess_state(env.reset())
state = observation


[2017-05-22 01:30:59,685] Making new env: MountainCarContinuous-v0

In [7]:
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

print(state_dim)
print(action_dim)
print(env.observation_space.high)
print(env.observation_space.low)
print(env.action_space.high)
print(env.action_space.low)


2
1
[ 0.6   0.07]
[-1.2  -0.07]
[ 1.]
[-1.]

In [8]:
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=settings["gpu_memory_fraction"])
session = tf.InteractiveSession(config=tf.ConfigProto(gpu_options=gpu_options))

In [9]:
actor_network = nn.fully_connected("Actor", session, [state_dim], settings["actor_net_layers"],
                                   action_dim, settings["actor_net_activation_fn"],
                                   env.action_space if settings["actor_bounded_output"] else None, False)
q_network = nn.fully_connected_with_input_embedding(
    "Q", session, [state_dim, action_dim], settings["q_net_embedding"],
    settings["q_net_layers"], 1, settings["q_net_activation_fn"], None, False)

print(str(actor_network))


[] --> Actor_input_0
['Actor_input_0'] --> hidden_0_256
['hidden_0_256'] --> hidden_1_128
['hidden_1_128'] --> output
['output'] --> bounding
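
The printed graph corresponds to a two-hidden-layer ELU actor whose raw output is squashed into the action bounds by the final "bounding" node. A rough equivalent in plain TensorFlow, assuming tanh scaling into env.action_space (this helper is illustrative, not the actual nn module code):

import tensorflow as tf

def build_actor(state, layer_sizes=(256, 128), action_dim=1,
                action_low=-1.0, action_high=1.0):
    h = state
    for i, units in enumerate(layer_sizes):
        h = tf.layers.dense(h, units, activation=tf.nn.elu, name="hidden_%d" % i)
    raw = tf.layers.dense(h, action_dim, name="output")
    # "bounding": squash to [-1, 1] with tanh, then rescale into [low, high]
    return tf.nn.tanh(raw) * (action_high - action_low) / 2.0 \
           + (action_high + action_low) / 2.0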


In [10]:
agent = DDPG(actor_network, q_network,
             discount_factor=settings["discount_factor"],
             actor_tf_optimizer=settings["actor_tf_optimizer"],
             q_tf_optimizer=settings["q_tf_optimizer"],
             actor_l2=settings["actor_l2"],
             q_l2=settings["q_l2"],
             actor_target_approach_rate=settings["actor_target_approach_rate"],
             q_target_approach_rate=settings["q_target_approach_rate"]
            )
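
For reference, each training step of the agent follows the standard DDPG recipe: the critic regresses toward the bootstrapped target y = r + gamma * Q'(s', mu'(s')) computed with the target networks, and the actor is updated to increase Q(s, mu(s)). A schematic sketch of the two textbook objectives (not the ddpg.py implementation; the network callables are passed in as arguments):

import tensorflow as tf

def ddpg_losses(q, q_target, actor, actor_target,
                state, action, reward, next_state, done, gamma=0.99):
    # Critic target: y = r + gamma * Q'(s', mu'(s')) for non-terminal transitions.
    y = reward + gamma * (1.0 - done) * q_target(next_state, actor_target(next_state))
    critic_loss = tf.reduce_mean(tf.square(q(state, action) - tf.stop_gradient(y)))
    # Deterministic policy gradient: maximize Q(s, mu(s)), i.e. minimize its negative.
    actor_loss = -tf.reduce_mean(q(state, actor(state)))
    return critic_loss, actor_loss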

In [11]:
experience_replay = PrioritizedExperienceReplay(agent, env, settings["max_replay_buffer_size"], False)
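
PrioritizedExperienceReplay keeps a per-transition priority derived from the TD error, so transitions with large |delta| are replayed more often than uniform sampling would allow. A minimal sketch of proportional prioritization (assumed behaviour; the actual class may use a sum tree and importance-sampling weights):

import numpy as np

def sample_indices(td_errors, batch_size, alpha=0.6, eps=1e-6):
    # Proportional prioritization: p_i = (|delta_i| + eps) ** alpha
    priorities = (np.abs(td_errors) + eps) ** alpha
    probs = priorities / priorities.sum()
    return np.random.choice(len(td_errors), size=batch_size, p=probs)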

In [12]:
noise = OUNoise(action_dim, seed=settings["noise_seed"])
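
The Ornstein-Uhlenbeck process produces temporally correlated exploration noise, which suits a momentum task like MountainCar better than uncorrelated Gaussian noise. A minimal sketch of a single update step with commonly used parameters (mu = 0, theta = 0.15, sigma = 0.3; the OUNoise class's defaults may differ):

import numpy as np

def ou_step(x, mu=0.0, theta=0.15, sigma=0.3):
    # x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, 1)
    return x + theta * (mu - x) + sigma * np.random.randn(*np.shape(x))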

In [13]:
exp = Experiment(settings["experiment_path"], session, env, settings,
                 settings["render_environment"], settings["render_frequency"],
                 settings["render_start"])

progress_bar = tqdm(total=settings["timesteps"])

for t in xrange(settings["timesteps"]):
    # Warm-up: act with pure OU noise until enough transitions are collected,
    # then act with the (deterministic) actor network.
    if t < settings["learning_start"]:
        action = noise.noise()
    else:
        action = agent.action(state)
    
    observation, reward, done, info = env.step(action)
    next_state = np.reshape(preprocess_state(observation), (state_dim,))
    
    experience_replay.add_experience(state, action, preprocess_reward(reward), next_state, done)
    td_error = math.fabs(experience_replay.get_last_td_error())
    exp.record(t, state, action, reward, next_state, done, td_error)
    
    state = next_state
    
    if done:
        exp.print_last_episode_info()
        observation = env.reset()
        state = preprocess_state(observation)
        
    if t >= settings["learning_start"]:
        # Sample prioritized minibatches to update actor and critic, then
        # refresh the priorities of the oldest transitions in the buffer.
        experience_replay.train_agent(settings["batch_size"], settings["train_updates_per_step"])
        experience_replay.update_oldest_priorities(settings["priority_updates_per_step"])
            
    progress_bar.set_description('[{}] reward: {:.2f}, reward 100-step MA: {:.2f}, action: {}, td-error: {:.4f}' \
        .format(t, reward, exp.reward_100ma.get_average(), str(action), td_error))
    progress_bar.update()
    
        
progress_bar.close()


Total episode reward: 96.4231978292. Finished in 130 steps.
Total episode reward: 92.7599452618. Finished in 181 steps.
Total episode reward: 8.25764464207. Finished in 1849 steps.
Total episode reward: 48.8485913878. Finished in 680 steps.
Total episode reward: 92.158596952. Finished in 103 steps.
Total episode reward: 91.1983401741. Finished in 126 steps.
Total episode reward: 91.625918678. Finished in 113 steps.
Total episode reward: 91.7372396827. Finished in 115 steps.
Total episode reward: 93.9884714781. Finished in 115 steps.
Total episode reward: 92.5132563255. Finished in 147 steps.
Total episode reward: 92.693000711. Finished in 110 steps.
Total episode reward: 92.2462467649. Finished in 147 steps.
Total episode reward: 95.0193921351. Finished in 98 steps.
Total episode reward: 95.9540731023. Finished in 84 steps.
Total episode reward: 97.2207880179. Finished in 153 steps.
Total episode reward: 95.1885289408. Finished in 98 steps.
Total episode reward: 92.7324281326. Finished in 123 steps.
Total episode reward: 95.8260672685. Finished in 85 steps.
Total episode reward: 93.035235595. Finished in 298 steps.
Total episode reward: 94.8312813484. Finished in 87 steps.
Total episode reward: 96.0140516063. Finished in 84 steps.
Total episode reward: 92.9024497463. Finished in 154 steps.
Total episode reward: 95.4212130095. Finished in 84 steps.
Total episode reward: 95.6067360548. Finished in 83 steps.
Total episode reward: 95.542213514. Finished in 83 steps.
Total episode reward: 95.2691523042. Finished in 83 steps.
Total episode reward: 94.5354219415. Finished in 82 steps.
Total episode reward: 94.4167161511. Finished in 85 steps.
Total episode reward: 95.1464687986. Finished in 79 steps.
Total episode reward: 94.2133339555. Finished in 88 steps.
Total episode reward: 94.9310086949. Finished in 81 steps.
Total episode reward: 94.7047823062. Finished in 80 steps.
Total episode reward: 94.6738877204. Finished in 79 steps.
Total episode reward: 94.5869934792. Finished in 80 steps.
Total episode reward: 94.2203140627. Finished in 83 steps.
Total episode reward: 94.2034539632. Finished in 80 steps.
Total episode reward: 94.2401164427. Finished in 78 steps.
Total episode reward: 94.6304114332. Finished in 77 steps.
Total episode reward: 94.9238486185. Finished in 77 steps.
Total episode reward: 95.0166638492. Finished in 83 steps.
Total episode reward: 94.5468855464. Finished in 80 steps.
Total episode reward: 94.5327636079. Finished in 75 steps.
Total episode reward: 94.0956922018. Finished in 90 steps.
Total episode reward: 95.2610286331. Finished in 79 steps.
Total episode reward: 94.1285489207. Finished in 78 steps.
Total episode reward: 94.5143749322. Finished in 79 steps.
Total episode reward: 94.9412080657. Finished in 73 steps.
Total episode reward: 94.6832196613. Finished in 75 steps.
Total episode reward: 94.5361680119. Finished in 76 steps.
Total episode reward: 94.2408981571. Finished in 77 steps.
Total episode reward: 94.5204475236. Finished in 83 steps.
Total episode reward: 94.8214472012. Finished in 91 steps.
Total episode reward: 94.5517668356. Finished in 75 steps.
Total episode reward: 94.608751134. Finished in 81 steps.
Total episode reward: 94.1926933433. Finished in 80 steps.
Total episode reward: 95.0124570819. Finished in 77 steps.
Total episode reward: 95.0429873847. Finished in 74 steps.
Total episode reward: 94.4267093404. Finished in 76 steps.
[7999] reward: -0.02, reward 100-step MA: 0.92, action: [-0.4674266], td-error: 0.0000: 100%|██████████| 8000/8000 [12:15<00:00, 10.88it/s]

In [14]:
exp.save()
print("Experiment results saved in " + exp.path)


Experiment results saved in experiments/experiment_ddpg_MountainCarContinuous-v0_1495409459

In [15]:
exp.plot_cumulative_reward()


Out[15]:
[<matplotlib.lines.Line2D at 0x7fe0441bda10>]

In [16]:
exp.plot_reward()


Out[16]:
[<matplotlib.lines.Line2D at 0x7fe04416f690>]

In [17]:
exp.plot_td_error()


Out[17]:
[<matplotlib.lines.Line2D at 0x7fe003816850>]

In [18]:
exp.plot_episode_reward()


Out[18]:
[<matplotlib.lines.Line2D at 0x7fdffee8ba90>]

In [19]:
exp.plot_episode_duration()


Out[19]:
[<matplotlib.lines.Line2D at 0x7fdffedbf510>]

In [21]:
if settings["render_environment"]:
    exp.display_frames_as_gif()




In [ ]:
session.close()