In [1]:
import gym
import tensorflow as tf
import numpy as np

In [2]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam


Using TensorFlow backend.

In [3]:
env = gym.make('BipedalWalker-v2')


[2017-09-10 22:12:24,230] Making new env: BipedalWalker-v2

In [4]:
# Box spaces have no .n attribute, so the discrete-style lookups stay commented out
# n_actions = env.action_space.n
# n_states = env.observation_space.shape
print(env.action_space.sample)       # bound method; its repr reveals the action space is Box(4,)
print(env.observation_space.sample)  # bound method; its repr reveals the observation space is Box(24,)
print(env.observation_space.high)
print(env.observation_space.low)


<bound method Box.sample of Box(4,)>
<bound method Box.sample of Box(24,)>
[ inf  inf  inf  inf  inf  inf  inf  inf  inf  inf  inf  inf  inf  inf  inf
  inf  inf  inf  inf  inf  inf  inf  inf  inf]
[-inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf
 -inf -inf -inf -inf -inf -inf -inf -inf -inf]
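
The prints above only show the bound sample methods, but their reprs already reveal the space shapes. Rather than hard-coding 24 and 4 later on, the dimensions can also be read straight from the standard Gym Box attributes (small optional sketch):

n_states = env.observation_space.shape[0]   # 24 for BipedalWalker-v2
n_actions = env.action_space.shape[0]       # 4 continuous joint torques
print(n_states, n_actions)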

In [11]:
# Roll out a purely random policy to sanity-check the environment and rendering
s = env.reset()
for i in range(10000):
    new_s, reward, done, _ = env.step(env.action_space.sample())
    env.render()
    if done:
        break
# print(i)
env.close()
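
For a baseline to compare the trained policy against later, the same random rollout can accumulate its total reward (a small variant of the cell above, not part of the original run):

s = env.reset()
total = 0.0
for i in range(10000):
    s, r, done, _ = env.step(env.action_space.sample())
    total += r
    if done:
        break
env.close()
print('Random-policy reward: {:.2f}'.format(total))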

In [20]:
# Policy network: 24 state inputs -> 4 joint torques; tanh keeps outputs in [-1, 1]
agent = Sequential()
agent.add(Dense(32, input_dim=24, activation='relu'))
agent.add(Dense(32, activation='relu'))
agent.add(Dense(4, activation='tanh'))
agent.compile(loss='mse', optimizer='adam', metrics=['accuracy'])
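
An optional agent.summary() call (standard Keras) would list the layer output shapes and parameter counts of this 24 -> 32 -> 32 -> 4 network before training:

agent.summary()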

In [21]:
s = env.reset()
print(agent.predict(s.reshape(1, 24))[0])


[ 0.76757276  0.67181492 -0.34039173 -0.02565407]
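
The four outputs come from the tanh layer, which keeps them inside BipedalWalker's per-joint action bounds of [-1, 1]; the bounds can be checked directly if in doubt (not part of the original run):

print(env.action_space.low)    # expected: [-1. -1. -1. -1.]
print(env.action_space.high)   # expected: [ 1.  1.  1.  1.]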

In [22]:
t_max = 1000
def generate_sample():
    """Run one episode with the current policy; return its states, actions and total reward."""
    s = env.reset()
    batch_s = []
    batch_a = []
    total_reward = 0

    for i in range(t_max):
        a = agent.predict(s.reshape(1, 24))[0]   # deterministic action from the network
        new_s, r, done, _ = env.step(a)
        batch_s.append(s)
        batch_a.append(a)
        s = new_s
        total_reward = total_reward + r
        if done:
            break
    env.close()
    return batch_s, batch_a, total_reward
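
A quick sanity check of the rollout helper before launching the full training loop (my own check, not in the original notebook):

states, actions, total = generate_sample()
print(len(states), total)   # episode length and the (typically negative) untrained reward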

In [23]:
# Cross-entropy-method-style training: sample episodes, keep those above the
# 70th reward percentile, and fit the network to imitate the elite actions.
iterations = 100
percentile = 70
samples = 250

for i in range(iterations):
    population = [generate_sample() for i in range(samples)]
    batch_states,batch_actions,batch_rewards = map(np.array,zip(*population))
    threshold = np.percentile(batch_rewards, percentile)
    elite_states = batch_states[batch_rewards > threshold]
    elite_actions = batch_actions[batch_rewards > threshold]
    elite_states, elite_actions = map(np.concatenate, [elite_states, elite_actions])
    agent.fit(epochs=1, x=elite_states, y=elite_actions)
    print('Iteration: {0}, Mean Reward: {1:.2f}, Threshold: {2:.2f}'.format(i + 1, np.mean(batch_rewards), threshold))


Epoch 1/1
75000/75000 [==============================] - 8s - loss: 4.3289e-06 - acc: 1.0000       
Iteration: 1, Mean Reward: -36.34, Threshold: -35.02
Epoch 1/1
75000/75000 [==============================] - 8s - loss: 3.6576e-06 - acc: 1.0000      
Iteration: 2, Mean Reward: -35.61, Threshold: -31.31
Epoch 1/1
75000/75000 [==============================] - 8s - loss: 2.2019e-06 - acc: 1.0000      
Iteration: 3, Mean Reward: -38.41, Threshold: -36.46
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-23-4079dcb3ab8a> in <module>()
      4 
      5 for i in range(iterations):
----> 6     population = [generate_sample() for i in range(samples)]
      7     batch_states,batch_actions,batch_rewards = map(np.array,zip(*population))
      8     threshold = np.percentile(batch_rewards, percentile)

<ipython-input-23-4079dcb3ab8a> in <listcomp>(.0)
      4 
      5 for i in range(iterations):
----> 6     population = [generate_sample() for i in range(samples)]
      7     batch_states,batch_actions,batch_rewards = map(np.array,zip(*population))
      8     threshold = np.percentile(batch_rewards, percentile)

<ipython-input-22-6820df146c39> in generate_sample()
      7 
      8     for i in range(t_max):
----> 9         a = agent.predict(s.reshape(1, 24))[0]
     10         new_s, r, done, _ = env.step(a)
     11         batch_s.append(s)

KeyboardInterrupt: 
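
Since the run above was stopped by hand after three iterations, it can be worth checkpointing the network between iterations so progress is not lost; a minimal sketch using Keras's save_weights/load_weights (the filename is hypothetical):

agent.save_weights('bipedal_agent.h5')
# later, after rebuilding the same architecture:
# agent.load_weights('bipedal_agent.h5')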

In [28]:
# Watch the (partially) trained agent act greedily for one episode
s = env.reset()
t_max = 1000
for i in range(t_max):
    env.render()
    a = agent.predict(s.reshape(1, 24))[0]
    new_s, r, done, _ = env.step(a)
    s = new_s
    if done:
        break
env.close()
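
To get a quantitative read on the learned policy rather than just watching the render, the same loop can accumulate the episode reward (a small variant of the cell above, not part of the original run):

s = env.reset()
total_reward = 0.0
for i in range(t_max):
    a = agent.predict(s.reshape(1, 24))[0]
    s, r, done, _ = env.step(a)
    total_reward += r
    if done:
        break
env.close()
print('Evaluation reward: {:.2f}'.format(total_reward))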