Drive-it DQN

Model

\begin{equation}
\begin{aligned}
l_1 &= \mathrm{relu}(x W_1 + b_1) \\
l_2 &= \mathrm{relu}(l_1 W_2 + b_2) \\
l_3 &= \mathrm{relu}(l_2 W_3 + b_3) \\
Q(s,a) &= l_3 W_o + b_o
\end{aligned}
\end{equation}
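
The network itself is built inside the agent module; the following is only a minimal NumPy sketch of the forward pass above. The hidden width of 32 is an arbitrary placeholder, while stateCnt = 5 and actionCnt = 9 match the setup cell further down.

import numpy as np

def relu(z):
    return np.maximum(z, 0.0)

def q_values(x, p):
    # forward pass of the three-layer MLP defined above
    l1 = relu(x @ p['W1'] + p['b1'])
    l2 = relu(l1 @ p['W2'] + p['b2'])
    l3 = relu(l2 @ p['W3'] + p['b3'])
    return l3 @ p['Wo'] + p['bo']            # one Q(s,a) per action

# toy parameters for illustration only
rng = np.random.RandomState(0)
stateCnt, hidden, actionCnt = 5, 32, 9
p = {'W1': 0.1 * rng.randn(stateCnt, hidden),  'b1': np.zeros(hidden),
     'W2': 0.1 * rng.randn(hidden, hidden),    'b2': np.zeros(hidden),
     'W3': 0.1 * rng.randn(hidden, hidden),    'b3': np.zeros(hidden),
     'Wo': 0.1 * rng.randn(hidden, actionCnt), 'bo': np.zeros(actionCnt)}
q = q_values(rng.randn(1, stateCnt), p)       # shape (1, 9)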

In [1]:
import numpy as np
import random, math
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
style.use('ggplot')
%matplotlib inline
sns.set()

Visualization

We use PCA decomposition on memory samples to visualize the $Q(s,a)$ values across the state space:


In [2]:
def pca_plot(n=10000, alpha=1.0, size=5):
    """Project sampled memory states onto their first two principal components
    and color them by greedy action and per-action Q-values."""
    _, samples = agent.memory.sample(n)
    states = np.array([ o[0] for o in samples ], dtype=np.float32)
    qsa = agent.brain.predict(states)[0]
    
    from sklearn import decomposition
    pca = decomposition.PCA(n_components=2)
    pca.fit(states)
    X = pca.transform(states)
    
    fig, axes = plt.subplots(nrows=2, ncols=2, sharex=True, sharey=True)
    ax = axes[0,0]; plt.sca(ax);ax.set_title('Action')
    plt.scatter(X[:, 0], X[:, 1], c=np.argmax(qsa, 1), alpha=alpha, s=size, cmap="rainbow")
    ax = axes[0,1]; plt.sca(ax);ax.set_title('Q(s,no-change)')
    plt.scatter(X[:, 0], X[:, 1], c=qsa[:,0], alpha=alpha, s=size, cmap="rainbow")    
    ax = axes[1,0]; plt.sca(ax);ax.set_title('Q(s,left)')
    plt.scatter(X[:, 0], X[:, 1], c=qsa[:,1], alpha=alpha, s=size, cmap="rainbow")    
    ax = axes[1,1]; plt.sca(ax);ax.set_title('Q(s,right)')
    plt.scatter(X[:, 0], X[:, 1], c=qsa[:,2], alpha=alpha, s=size, cmap="rainbow")

For a more meaningful plot, we simply project the samples onto the $x_m$ and $y_m$ state components:


In [3]:
def slice_plot(n=10000, alpha=1.0, size=5):
    """Plot greedy action and per-action Q-values over the first two
    state components, $x_m$ and $y_m$."""
    _, samples = agent.memory.sample(n)
    states = np.array([ o[0] for o in samples ], dtype=np.float32)
    qsa = agent.brain.predict(states)
    v = agent.brain.value.eval(states)
    
    fig, axes = plt.subplots(nrows=2, ncols=2, sharex=True, sharey=True)
    ax = axes[0,0]; plt.sca(ax);ax.set_title('Action')
    plt.scatter(states[:, 0], states[:, 1], c=np.argmax(qsa[:,0], 1), alpha=alpha, s=size, cmap="rainbow")
    ax = axes[0,1]; plt.sca(ax);ax.set_title('Q(s,no-change)')
    plt.scatter(states[:, 0], states[:, 1], c=qsa[:,0,0], alpha=alpha, s=size, cmap="rainbow")    
    ax = axes[1,0]; plt.sca(ax);ax.set_title('Q(s,left)')
    plt.scatter(states[:, 0], states[:, 1], c=qsa[:,0,1], alpha=alpha, s=size, cmap="rainbow")    
    ax = axes[1,1]; plt.sca(ax);ax.set_title('Q(s,right)')
    plt.scatter(states[:, 0], states[:, 1], c=qsa[:,0,2], alpha=alpha, s=size, cmap="rainbow")
    axes[0,0].set_ylabel('$y_m$')
    axes[1,0].set_ylabel('$y_m$')
    axes[1,0].set_xlabel('$x_m$')
    axes[1,1].set_xlabel('$x_m$')

Training


In [4]:
def run_episode(agent, render=False):
    """Run one episode, feeding belief-state transitions to the agent."""
    o = env.reset()
    b = belief.reset(o[car])
    s = belief.normalize(b)
    R = 0
    actions = {}
    while True:
        if render: env.render()

        # choose an action for the current (normalized) belief state
        a = agent.act(s.astype(np.float32))
        actions[car] = a
        o_, r, done, info = env.step(actions)

        # update the position-tracking belief from the new observation
        b_ = belief.update(o_[car], env.dt)
        s_ = belief.normalize(b_)

        if done:
            s_ = None

        # store the transition and accumulate the episode return
        agent.observe((s, a, r[car], s_))
        s = s_
        R += r[car]

        if done:
            agent.endEpisode()
            return R

In [5]:
from DriveItMultiGym import DriveItEnv
from car import Car
from belief import PositionTracking
from agent import Agent

BATCH_SIZE = 20000

car = Car()
env = DriveItEnv([car], time_limit=10.0, noisy=False)
belief = PositionTracking(car)
stateCnt  = belief.observation_space.shape[0]
actionCnt = env.action_space.n
agent = Agent(stateCnt, actionCnt)

episode_number = 0
last_batch_episode = 0
last_batch_steps = 0
episodes = []
reward_sum = 0
reward_best = 18.0

(stateCnt, actionCnt)


Out[5]:
(5, 9)

In [ ]:
while episode_number < 50000 and agent.steps < 190000:
    episode_number += 1
    reward = run_episode(agent, render=False)
    reward_sum += reward
    
    if agent.steps >= last_batch_steps + BATCH_SIZE:
        reward_avg = reward_sum / (episode_number - last_batch_episode)
        last_batch_episode = episode_number
        last_batch_steps = int(agent.steps / BATCH_SIZE) * BATCH_SIZE
        episodes.append((episode_number, agent.steps, reward_avg))

        print('Episode: %d, steps: %d, epsilon: %f, average reward: %f.' \
              % (episode_number, agent.steps, agent.epsilon, reward_avg))
        
        if reward_avg > reward_best:
            reward_best = reward_avg
            agent.brain.model.save_model('best.mod')

        reward_sum = 0
       
agent.brain.model.save_model('last.mod')
print('Done.')


Episode: 390, steps: 20019, epsilon: 0.972763, average reward: -0.304668.
Episode: 770, steps: 40027, epsilon: 0.946365, average reward: -0.335751.

In [ ]:
from numpy import cos, sin, pi
median_radius = 0.375
line_length = 2.0 * median_radius
loop_median_length = 3.0 / 2.0 * pi * median_radius
checkpoint_median_length = line_length + loop_median_length

_, samples = agent.memory.sample(50000)
states = np.array([ o[0] for o in samples ], dtype=np.float32)
qmin = np.array([ o[-1] for o in samples ], dtype=np.float32)
qsa = agent.brain.predict(states)[0]
v = agent.brain.value.eval(states)[0]
a = agent.brain.advantage.eval(states)[0]

In [18]:
plt.plot([e[1]/1000 for e in episodes], [e[2] for e in episodes])
plt.xlabel('steps x 1000');plt.ylabel('reward')


Out[18]:
<matplotlib.text.Text at 0x6da59518>

In [16]:
plt.scatter(states[:, 0], states[:, 1], c=v[:,0], alpha=0.5, s=5, cmap="rainbow")


Out[16]:
<matplotlib.collections.PathCollection at 0x6cf3f080>

In [16]:
plt.scatter(states[:, 0], states[:, 1], c=a[:,0], alpha=0.5, s=5, cmap="rainbow")


Out[16]:
<matplotlib.collections.PathCollection at 0x6cf3f080>

In [17]:
plt.scatter(states[:, 0], states[:, 1], c=qmin, alpha=0.5, s=5, cmap="rainbow")


Out[17]:
<matplotlib.collections.PathCollection at 0x6ce27d30>

In [26]:
coord = [env.map.median_to_cartesian(s[0]*checkpoint_median_length,s[1]*0.225) \
         for s in states]
x = [c[0] for c in coord]
y = [c[1] for c in coord]

In [28]:
plt.scatter(x, y, c=v[:,0], alpha=0.5, s=5, cmap="rainbow")


Out[28]:
<matplotlib.collections.PathCollection at 0x1ae9dd30>

In [29]:
plt.scatter(states[:, 6], states[:, 7], c=v[:,0], alpha=0.5, s=5, cmap="rainbow")


Out[29]:
<matplotlib.collections.PathCollection at 0x45350470>

In [31]:
plt.scatter(states[:, 0], states[:, 1], c=np.argmax(qsa[:,0], 1), alpha=0.5, s=5, cmap="rainbow")


Out[31]:
<matplotlib.collections.PathCollection at 0x5f822668>

In [45]:
for i in range(9):
    print(i)
    plt.scatter(x, y, c=a[:,0,i], alpha=0.5, s=5, cmap="rainbow")
    plt.show()


0
1
2
3
4
5
6
7
8

In [23]:
plt.plot([e[1]/1000 for e in episodes], [e[2] for e in episodes])
plt.xlabel('steps x 1000');plt.ylabel('reward')


Out[23]:
<matplotlib.text.Text at 0x4f17b710>

In [7]:
slice_plot(n=20000, size=5, alpha=0.5)
#plt.savefig('qslice.png', dpi=300)






In [9]:
while episode_number < 30000 and agent.steps < 2980000:
    episode_number += 1
    reward = run_episode(agent, render=False)
    reward_sum += reward
    
    if agent.steps >= last_batch_steps + BATCH_SIZE:
        reward_avg = reward_sum / (episode_number - last_batch_episode)
        last_batch_episode = episode_number
        last_batch_steps = int(agent.steps / BATCH_SIZE) * BATCH_SIZE
        episodes.append((episode_number, agent.steps, reward_avg))

        print('Episode: %d, steps: %d, epsilon: %f, average reward: %f.' \
              % (episode_number, agent.steps, agent.epsilon, reward_avg))
        
        if reward_avg > reward_best:
            reward_best = reward_avg
            agent.brain.model.save_model('best.mod', False)

        reward_sum = 0
       
agent.brain.model.save_model('last.mod', False)
print('Done.')


Episode: 806, steps: 20012, epsilon: 0.964170, average reward: -1.314449.
Episode: 1568, steps: 40005, epsilon: 0.929668, average reward: -1.267090.
Episode: 2316, steps: 60007, epsilon: 0.896400, average reward: -1.244711.
Episode: 3040, steps: 80006, epsilon: 0.864339, average reward: -1.227046.
Episode: 3743, steps: 100024, epsilon: 0.833410, average reward: -1.192490.
Episode: 4381, steps: 120006, epsilon: 0.803653, average reward: -1.094363.
Episode: 5019, steps: 140019, epsilon: 0.774927, average reward: -1.096916.
Episode: 5626, steps: 160011, epsilon: 0.747270, average reward: -1.045155.
Episode: 6252, steps: 180022, epsilon: 0.720588, average reward: -1.066209.
Episode: 6861, steps: 200021, epsilon: 0.694886, average reward: -1.040312.
Episode: 7435, steps: 220058, epsilon: 0.670068, average reward: -0.981508.
Episode: 7956, steps: 240078, epsilon: 0.646169, average reward: -0.860671.
Episode: 8452, steps: 260060, epsilon: 0.623179, average reward: -0.797081.
Episode: 8938, steps: 280012, epsilon: 0.601052, average reward: -0.773937.
Episode: 9422, steps: 300011, epsilon: 0.579674, average reward: -0.749323.
Episode: 9896, steps: 320020, epsilon: 0.559059, average reward: -0.733105.
Episode: 10341, steps: 340046, epsilon: 0.539174, average reward: -0.630904.
Episode: 10776, steps: 360038, epsilon: 0.520041, average reward: -0.615549.
Episode: 11182, steps: 380024, epsilon: 0.501605, average reward: -0.499242.
Episode: 11568, steps: 400020, epsilon: 0.483826, average reward: -0.393527.
Episode: 11928, steps: 420049, epsilon: 0.466663, average reward: -0.274655.
Episode: 12286, steps: 440055, epsilon: 0.450140, average reward: -0.281907.
Episode: 12622, steps: 460048, epsilon: 0.434226, average reward: -0.133299.
Episode: 12945, steps: 480031, epsilon: 0.418894, average reward: -0.087263.
Episode: 13255, steps: 500120, epsilon: 0.404039, average reward: 0.006544.
Episode: 13520, steps: 520101, epsilon: 0.389800, average reward: 0.391778.
Episode: 13776, steps: 540037, epsilon: 0.376105, average reward: 0.473756.
Episode: 14019, steps: 560011, epsilon: 0.362879, average reward: 0.636351.
Episode: 14234, steps: 580016, epsilon: 0.350112, average reward: 0.975623.
Episode: 14452, steps: 600022, epsilon: 0.337807, average reward: 0.954006.
Episode: 14654, steps: 620064, epsilon: 0.325925, average reward: 1.251441.
Episode: 14845, steps: 640087, epsilon: 0.314485, average reward: 1.421935.
Episode: 15027, steps: 660045, epsilon: 0.303494, average reward: 1.566506.
Episode: 15210, steps: 680033, epsilon: 0.292884, average reward: 1.542400.
Episode: 15374, steps: 700256, epsilon: 0.282540, average reward: 2.052173.
Episode: 15517, steps: 720029, epsilon: 0.272792, average reward: 2.586585.
Episode: 15672, steps: 740073, epsilon: 0.263266, average reward: 2.282690.
Episode: 15797, steps: 760040, epsilon: 0.254120, average reward: 3.382139.
Episode: 15923, steps: 780111, epsilon: 0.245259, average reward: 3.388498.
Episode: 16053, steps: 800055, epsilon: 0.236773, average reward: 3.166681.
Episode: 16162, steps: 820040, epsilon: 0.228576, average reward: 4.191718.
Episode: 16284, steps: 840054, epsilon: 0.220665, average reward: 3.485517.
Episode: 16404, steps: 860085, epsilon: 0.213033, average reward: 3.547644.
Episode: 16501, steps: 880105, epsilon: 0.205682, average reward: 4.981430.
Episode: 16614, steps: 900052, epsilon: 0.198623, average reward: 3.881476.
Episode: 16704, steps: 920023, epsilon: 0.191810, average reward: 5.576601.
Episode: 16799, steps: 940390, epsilon: 0.185115, average reward: 5.215137.
Episode: 16887, steps: 960284, epsilon: 0.178814, average reward: 5.675337.
Episode: 16963, steps: 980489, epsilon: 0.172646, average reward: 7.176975.
Episode: 17026, steps: 1000490, epsilon: 0.166763, average reward: 9.227968.
Episode: 17085, steps: 1020379, epsilon: 0.161123, average reward: 9.916854.
Episode: 17146, steps: 1040091, epsilon: 0.155735, average reward: 9.408133.
Episode: 17211, steps: 1060313, epsilon: 0.150406, average reward: 8.969985.
Episode: 17270, steps: 1080478, epsilon: 0.145286, average reward: 9.953835.
Episode: 17328, steps: 1100024, epsilon: 0.140502, average reward: 10.276583.
Episode: 17379, steps: 1120422, epsilon: 0.135689, average reward: 12.669610.
Episode: 17430, steps: 1140314, epsilon: 0.131167, average reward: 12.270737.
Episode: 17482, steps: 1160085, epsilon: 0.126833, average reward: 11.671696.
Episode: 17535, steps: 1180014, epsilon: 0.122622, average reward: 11.975995.
Episode: 17581, steps: 1200383, epsilon: 0.118475, average reward: 14.133583.
Episode: 17626, steps: 1220071, epsilon: 0.114611, average reward: 14.210104.
Episode: 17670, steps: 1240216, epsilon: 0.110800, average reward: 15.101350.
Episode: 17711, steps: 1260307, epsilon: 0.107138, average reward: 16.540284.
Episode: 17751, steps: 1280553, epsilon: 0.103582, average reward: 17.141953.
Episode: 17793, steps: 1300380, epsilon: 0.100226, average reward: 15.744951.
Episode: 17830, steps: 1320375, epsilon: 0.096963, average reward: 18.829577.
Episode: 17867, steps: 1340306, epsilon: 0.093828, average reward: 18.602951.
Episode: 17903, steps: 1360409, epsilon: 0.090781, average reward: 19.541556.
Episode: 17942, steps: 1380253, epsilon: 0.087881, average reward: 17.475310.
Episode: 17983, steps: 1400095, epsilon: 0.085086, average reward: 16.419630.
Episode: 18019, steps: 1420422, epsilon: 0.082327, average reward: 20.030583.
Episode: 18055, steps: 1440499, epsilon: 0.079701, average reward: 19.754224.
Episode: 18092, steps: 1460318, epsilon: 0.077202, average reward: 18.923814.
Episode: 18128, steps: 1480265, epsilon: 0.074777, average reward: 19.340988.
Episode: 18165, steps: 1500297, epsilon: 0.072431, average reward: 19.053294.
Episode: 18199, steps: 1520225, epsilon: 0.070180, average reward: 20.868038.
Episode: 18234, steps: 1540020, epsilon: 0.068026, average reward: 19.991926.
Episode: 18271, steps: 1560569, epsilon: 0.065870, average reward: 19.540105.
Episode: 18305, steps: 1580557, epsilon: 0.063850, average reward: 21.160380.
Episode: 18339, steps: 1600015, epsilon: 0.061955, average reward: 20.350771.
Episode: 18373, steps: 1620415, epsilon: 0.060038, average reward: 21.605614.
Episode: 18409, steps: 1640582, epsilon: 0.058214, average reward: 19.655399.
Episode: 18444, steps: 1660067, epsilon: 0.056514, average reward: 19.807977.
Episode: 18478, steps: 1680290, epsilon: 0.054813, average reward: 21.169176.
Episode: 18512, steps: 1700283, epsilon: 0.053193, average reward: 21.019891.
Episode: 18547, steps: 1720473, epsilon: 0.051616, average reward: 20.724094.
Episode: 18583, steps: 1740471, epsilon: 0.050111, average reward: 19.759975.
Episode: 18616, steps: 1760271, epsilon: 0.048674, average reward: 21.368242.
Episode: 18651, steps: 1780228, epsilon: 0.047278, average reward: 20.304424.
Episode: 18686, steps: 1800406, epsilon: 0.045918, average reward: 20.739795.
Episode: 18722, steps: 1820562, epsilon: 0.044609, average reward: 19.962188.
Episode: 18755, steps: 1840160, epsilon: 0.043382, average reward: 21.648067.
Episode: 18789, steps: 1860060, epsilon: 0.042180, average reward: 21.196176.
Episode: 18823, steps: 1880460, epsilon: 0.040993, average reward: 21.870012.
Episode: 18857, steps: 1900471, epsilon: 0.039872, average reward: 20.845313.
Episode: 18891, steps: 1920341, epsilon: 0.038798, average reward: 20.792393.
Episode: 18925, steps: 1940172, epsilon: 0.037765, average reward: 20.612579.
Episode: 18959, steps: 1960572, epsilon: 0.036741, average reward: 21.402128.
Episode: 18994, steps: 1980475, epsilon: 0.035778, average reward: 20.463964.
Episode: 19027, steps: 2000275, epsilon: 0.034855, average reward: 21.461197.
Episode: 19060, steps: 2020075, epsilon: 0.033965, average reward: 21.361877.
Episode: 19094, steps: 2040475, epsilon: 0.033081, average reward: 21.201151.
Episode: 19129, steps: 2060466, epsilon: 0.032247, average reward: 19.997824.
Episode: 19162, steps: 2080266, epsilon: 0.031450, average reward: 21.078323.
Episode: 19195, steps: 2100066, epsilon: 0.030682, average reward: 21.623385.
Episode: 19229, steps: 2120466, epsilon: 0.029919, average reward: 21.369890.
Episode: 19262, steps: 2140266, epsilon: 0.029205, average reward: 21.502574.
Episode: 19295, steps: 2160066, epsilon: 0.028518, average reward: 21.541886.
Episode: 19329, steps: 2180466, epsilon: 0.027835, average reward: 21.110788.
Episode: 19362, steps: 2200266, epsilon: 0.027196, average reward: 21.301970.
Episode: 19396, steps: 2220137, epsilon: 0.026578, average reward: 20.318614.
Episode: 19430, steps: 2240537, epsilon: 0.025966, average reward: 21.972949.
Episode: 19463, steps: 2260337, epsilon: 0.025395, average reward: 21.822823.
Episode: 19497, steps: 2280207, epsilon: 0.024841, average reward: 21.130402.
Episode: 19532, steps: 2300078, epsilon: 0.024308, average reward: 19.986867.
Episode: 19566, steps: 2320478, epsilon: 0.023780, average reward: 21.988445.
Episode: 19599, steps: 2340278, epsilon: 0.023287, average reward: 21.976736.
Episode: 19633, steps: 2360095, epsilon: 0.022810, average reward: 21.372406.
Episode: 19667, steps: 2380495, epsilon: 0.022338, average reward: 21.668707.
Episode: 19700, steps: 2400295, epsilon: 0.021896, average reward: 20.524818.
Episode: 19733, steps: 2420095, epsilon: 0.021470, average reward: 21.343318.
Episode: 19767, steps: 2440495, epsilon: 0.021047, average reward: 21.967237.
Episode: 19801, steps: 2460442, epsilon: 0.020648, average reward: 21.211278.
Episode: 19834, steps: 2480242, epsilon: 0.020267, average reward: 20.513254.
Episode: 19868, steps: 2500113, epsilon: 0.019898, average reward: 21.083357.
Episode: 19903, steps: 2520583, epsilon: 0.019532, average reward: 20.038888.
Episode: 19937, steps: 2540582, epsilon: 0.019187, average reward: 21.413617.
Episode: 19972, steps: 2560533, epsilon: 0.018855, average reward: 19.901994.
Episode: 20006, steps: 2580436, epsilon: 0.018537, average reward: 21.236916.
Episode: 20039, steps: 2600236, epsilon: 0.018231, average reward: 20.967074.
Episode: 20073, steps: 2620053, epsilon: 0.017936, average reward: 21.078987.
Episode: 20108, steps: 2640524, epsilon: 0.017642, average reward: 20.687864.
Episode: 20141, steps: 2660324, epsilon: 0.017368, average reward: 20.861041.
Episode: 20174, steps: 2680124, epsilon: 0.017105, average reward: 21.678740.
Episode: 20208, steps: 2700524, epsilon: 0.016843, average reward: 21.549884.
Episode: 20241, steps: 2720324, epsilon: 0.016597, average reward: 21.531291.
Episode: 20276, steps: 2740289, epsilon: 0.016359, average reward: 19.984779.
Episode: 20309, steps: 2760089, epsilon: 0.016131, average reward: 20.775554.
Episode: 20344, steps: 2780586, epsilon: 0.015904, average reward: 20.928459.
Episode: 20377, steps: 2800258, epsilon: 0.015694, average reward: 20.921902.
Episode: 20411, steps: 2820130, epsilon: 0.015489, average reward: 20.441248.
Episode: 20445, steps: 2840530, epsilon: 0.015287, average reward: 21.524364.
Episode: 20478, steps: 2860330, epsilon: 0.015098, average reward: 21.496326.
Episode: 20512, steps: 2880229, epsilon: 0.014914, average reward: 20.891398.
Episode: 20545, steps: 2900029, epsilon: 0.014738, average reward: 21.723245.
Episode: 20579, steps: 2920429, epsilon: 0.014563, average reward: 21.604473.
Episode: 20612, steps: 2940229, epsilon: 0.014400, average reward: 21.203717.
Episode: 20645, steps: 2960029, epsilon: 0.014242, average reward: 21.506767.
Episode: 20679, steps: 2980429, epsilon: 0.014086, average reward: 20.863255.
Done.

In [10]:
plt.plot([e[1]/1000 for e in episodes], [e[2] for e in episodes])
plt.xlabel('steps x 1000');plt.ylabel('reward')
plt.savefig('learning.png', dpi=300)



In [11]:
slice_plot(n=20000, size=5, alpha=0.5)
plt.savefig('qslice.png', dpi=300)



In [12]:
pca_plot(n=20000, size=5, alpha=0.5)
plt.savefig('pca.png', dpi=300)


Exploration - exploitation trade-off

Note that $\epsilon$ initially equals 1, so the agent explores entirely at random; as the step count grows we reduce exploration and increasingly exploit the learned Q-values to collect reward.
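
The schedule plotted below decays exponentially with the global step count $t$:

\begin{equation}
\epsilon(t) = \epsilon_{\min} + (\epsilon_{\max} - \epsilon_{\min})\, e^{-\lambda t}
\end{equation}

where $\epsilon_{\min}$, $\epsilon_{\max}$ and $\lambda$ correspond to the agent's MIN_EPSILON, MAX_EPSILON and LAMBDA hyperparameters.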


In [12]:
def epsilon(steps):
    return MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * np.exp(-LAMBDA * steps)

r = range(0,EXPLORATION_STOP,int(EXPLORATION_STOP/100))
plt.plot(r, [min(epsilon(x),1) for x in r], 'r')
#plt.plot(r, [min(epsilon(x),1)**EPSILON_TRAIN_FACTOR for x in r], 'b')
plt.xlabel('step');plt.ylabel('$\epsilon$')


Out[12]:
<matplotlib.text.Text at 0x53bd19b0>

Discounted Reward

We tune $\gamma$ so that the agent looks ahead only over a short time horizon, within which the current action still has a significant effect.
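
For example, with $\gamma = 0.995$ and the 60 simulation steps per second assumed in the plot below, the discount falls to one half after $\ln(0.5) / \ln(0.995) \approx 138$ steps, i.e. roughly 2.3 seconds of effective look-ahead.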


In [15]:
r = range(0,600)
plt.plot([t/60.0 for t in r], [GAMMA ** x for x in r], 'r')
plt.xlabel('time [s]');plt.ylabel('discount')
GAMMA


Out[15]:
0.995
