Running the DQN model


In [1]:
import numpy as np
import cntk as C
from DriveItMultiGym import *
from belief import *

car = Car()
env = DriveItEnv([car], time_limit=10.0, gamma=0.98, noisy=True)
belief = PositionTracking(car)
stateCnt = belief.observation_space.shape[0]

def run_episode(model, render=False):
    o = env.reset()
    s = belief.reset(o[car])
    s = s / belief.observation_space.high  # normalize the belief state
    R = 0
    actions = {}
    while True:
        if render: env.render()

        # greedy policy: take the action with the highest predicted Q-value
        a = np.argmax(model.eval(s.reshape(1, stateCnt).astype(np.float32)))
        actions[car] = a
        o_, r, done, info = env.step(actions)
        s_ = belief.update(o_[car], env.dt)
        s_ = s_ / belief.observation_space.high

        s = s_
        R += r[car]

        if done:
            return R, info
        

def run_model(path, num_episodes=1):
    model = C.load_model(path)
    reward_sum = 0
    reward_no_crash = 0
    crashes = 0
    for i_episode in range(1, num_episodes + 1):
        # only render when running a single episode
        R, info = run_episode(model, num_episodes < 2)

        if num_episodes <= 10:
            print('Episode %d %s. reward: %f, laps: %f' % \
                  (i_episode, info['done'], R, R / lap_median_length))

        if info['done'] != 'complete':
            crashes += 1
        else:
            reward_no_crash += R
        reward_sum += R

    if num_episodes > 1:
        print('Average reward: %f with %d crashes (%f excl. crashes)' % \
              (reward_sum / num_episodes, crashes, \
               reward_no_crash / max(1, num_episodes - crashes)))  # guard against all-crash runs

In [2]:
modelPath = 'last.mod'

In [2]:
modelPath = 'D:/jc/OneDrive - manoli.net/Courses/CS229/Project/cntk/bigboss/best.mod'

In [18]:
run_model(modelPath, 100)


Average reward: 22.096251 with 0 crashes (22.096251 excl. crashes)

In [19]:
run_model(modelPath, 10)


Episode 1 complete. reward: 22.080785, laps: 4.386076
Episode 2 complete. reward: 22.218435, laps: 4.413418
Episode 3 complete. reward: 22.048827, laps: 4.379728
Episode 4 complete. reward: 22.020093, laps: 4.374020
Episode 5 complete. reward: 22.066715, laps: 4.383281
Episode 6 complete. reward: 22.117833, laps: 4.393435
Episode 7 complete. reward: 22.132000, laps: 4.396249
Episode 8 complete. reward: 22.077661, laps: 4.385455
Episode 9 complete. reward: 22.104040, laps: 4.390695
Episode 10 complete. reward: 22.278536, laps: 4.425357
Average reward: 22.114492 with 0 crashes (22.114492 excl. crashes)

In [3]:
run_model(modelPath, 1)


Episode 1 complete. reward: 0.906890, laps: 0.180143

In [4]:
env.close()

In [26]:
env._reset(random_position=False)


Out[26]:
array([ 0.00200037, -0.03441861,  0.        ,  0.        ])

In [13]:
env.render()

In [20]:
env.step(1)


Out[20]:
(array([ 0.17223266,  0.00231618,  0.2       ,  0.01960784]),
 0.04174948671969142,
 False,
 {'checkpoint': False, 'done': 'unknown', 'lap': False})

In [3]:
lr = [0.01, 0.005, 0.0025, 0.001]
lr_schedule = C.learning_rate_schedule(lr, C.UnitType.minibatch, epoch_size=5000)

In [12]:
lr_schedule[5000]


Out[12]:
0.005
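
Each entry in lr covers epoch_size = 5000 samples, so sample index 5000 already falls in the second interval. A quick sanity check of the boundaries (a sketch, assuming the schedule's indexing extends the same way beyond the interval probed above):

# assumed boundary behaviour of the minibatch-unit schedule above
assert lr_schedule[0] == 0.01        # samples 0..4999
assert lr_schedule[4999] == 0.01
assert lr_schedule[5000] == 0.005    # samples 5000..9999
assert lr_schedule[10000] == 0.0025  # samples 10000..14999
assert lr_schedule[15000] == 0.001   # the last rate persists from 15000 on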

In [15]:
#max penalty
1.0 * 2.5 / 60.0 / (1 - 0.98)


Out[15]:
2.0833333333333313
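
This is the geometric-series bound on the total discounted penalty: assuming the worst-case per-step penalty is 1.0 * 2.5/60 (roughly the per-step reward magnitude seen in the env.step output above), discounting with gamma = 0.98 gives

    (2.5/60) * sum_{t>=0} 0.98^t = (2.5/60) / (1 - 0.98) ≈ 2.083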

In [5]:
checkpoint_median_length


Out[5]:
2.517145867644259
