Drive-it DQN

Model

\begin{align}
l_1 &= \mathrm{relu}(x W_1 + b_1) \\
l_2 &= \mathrm{relu}(l_1 W_2 + b_2) \\
l_3 &= \mathrm{relu}(l_2 W_3 + b_3) \\
Q(s,a) &= l_3 W_o + b_o
\end{align}
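
As a minimal illustration of this forward pass, here is a NumPy sketch (the weight shapes and the function name are illustrative assumptions; the actual network is implemented inside agent.brain):

import numpy as np

def relu(z):
    return np.maximum(z, 0.0)

def q_forward(x, W1, b1, W2, b2, W3, b3, Wo, bo):
    # x: batch of states, shape (batch, state_dim)
    l1 = relu(x @ W1 + b1)       # first hidden layer
    l2 = relu(l1 @ W2 + b2)      # second hidden layer
    l3 = relu(l2 @ W3 + b3)      # third hidden layer
    return l3 @ Wo + bo          # Q(s, a) for every action, shape (batch, n_actions)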

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
style.use('ggplot')
%matplotlib inline
sns.set()

Visualization

We use PCA decomposition on memory samples to visualize the $Q(s,a)$ values across the state space:


In [2]:
def pca_plot(n=10000, alpha=1.0, size=5):
    """Project replay-memory states onto their first two principal components,
    colored by the greedy action and by each action's Q-value."""
    _, samples = agent.memory.sample(n)
    states = np.array([ o[0] for o in samples ], dtype=np.float32)
    qsa = agent.brain.predict(states)[0]
    
    from sklearn import decomposition
    pca = decomposition.PCA(n_components=2)
    pca.fit(states)
    X = pca.transform(states)
    
    fig, axes = plt.subplots(nrows=2, ncols=2, sharex=True, sharey=True)
    ax = axes[0,0]; plt.sca(ax);ax.set_title('Action')
    plt.scatter(X[:, 0], X[:, 1], c=np.argmax(qsa, 1), alpha=alpha, s=size, cmap="rainbow")
    ax = axes[0,1]; plt.sca(ax);ax.set_title('Q(s,no-change)')
    plt.scatter(X[:, 0], X[:, 1], c=qsa[:,0], alpha=alpha, s=size, cmap="rainbow")    
    ax = axes[1,0]; plt.sca(ax);ax.set_title('Q(s,left)')
    plt.scatter(X[:, 0], X[:, 1], c=qsa[:,1], alpha=alpha, s=size, cmap="rainbow")    
    ax = axes[1,1]; plt.sca(ax);ax.set_title('Q(s,right)')
    plt.scatter(X[:, 0], X[:, 1], c=qsa[:,2], alpha=alpha, s=size, cmap="rainbow")

For a more interpretable plot, we can simply project the samples onto the ($x_m$, $y_m$) plane:


In [3]:
def slice_plot(n=10000, alpha=1.0, size=5):
    """Scatter replay-memory states in the (x_m, y_m) plane,
    colored by the greedy action and by each action's Q-value."""
    _, samples = agent.memory.sample(n)
    states = np.array([ o[0] for o in samples ], dtype=np.float32)
    qsa = agent.brain.predict(states)[0]
    
    fig, axes = plt.subplots(nrows=2, ncols=2, sharex=True, sharey=True)
    ax = axes[0,0]; plt.sca(ax);ax.set_title('Action')
    plt.scatter(states[:, 0], states[:, 1], c=np.argmax(qsa, 1), alpha=alpha, s=size, cmap="rainbow")
    ax = axes[0,1]; plt.sca(ax);ax.set_title('Q(s,no-change)')
    plt.scatter(states[:, 0], states[:, 1], c=qsa[:,0], alpha=alpha, s=size, cmap="rainbow")    
    ax = axes[1,0]; plt.sca(ax);ax.set_title('Q(s,left)')
    plt.scatter(states[:, 0], states[:, 1], c=qsa[:,1], alpha=alpha, s=size, cmap="rainbow")    
    ax = axes[1,1]; plt.sca(ax);ax.set_title('Q(s,right)')
    plt.scatter(states[:, 0], states[:, 1], c=qsa[:,2], alpha=alpha, s=size, cmap="rainbow")
    axes[0,0].set_ylabel('$y_m$')
    axes[1,0].set_ylabel('$y_m$')
    axes[1,0].set_xlabel('$x_m$')
    axes[1,1].set_xlabel('$x_m$')

Training


In [4]:
def run_episode(agent, render=False):
    """Run a single episode, feeding each transition (s, a, r, s_) to the agent,
    and return the accumulated reward."""
    s = env.reset()
    R = 0
    while True:            
        if render: env.render()

        a = agent.act(s.astype(np.float32))
        s_, r, done, info = env.step(a)
        
        if done:
            s_ = None

        agent.observe((s, a, r, s_))
        s = s_
        R += r

        if done:
            agent.endEpisode()
            return R

In [5]:
from DriveItGym import DriveItEnv
from agent import Agent

BATCH_SIZE = 20000

env = DriveItEnv(time_limit=10.0, throttle_limit=1.0)
stateCnt  = env.observation_space.shape[0]
actionCnt = env.action_space.n
agent = Agent(stateCnt, actionCnt)

episode_number = 0
last_batch_episode = 0
last_batch_steps = 0
episodes = []
reward_sum = 0
reward_best = 18.0

(stateCnt, actionCnt)


Out[5]:
(5, 9)

In [6]:
while episode_number < 50000 and agent.steps < 1980000:
    episode_number += 1
    reward = run_episode(agent, render=False)
    reward_sum += reward
    
    if agent.steps >= last_batch_steps + BATCH_SIZE:
        reward_avg = reward_sum / (episode_number - last_batch_episode)
        last_batch_episode = episode_number
        last_batch_steps = int(agent.steps / BATCH_SIZE) * BATCH_SIZE
        episodes.append((episode_number, agent.steps, reward_avg))

        print('Episode: %d, steps: %d, epsilon: %f, average reward: %f.' \
              % (episode_number, agent.steps, agent.epsilon, reward_avg))
        
        if reward_avg > reward_best:
            reward_best = reward_avg
            agent.brain.model.save_model('best.mod')

        reward_sum = 0
       
agent.brain.model.save_model('last.mod')
print('Done.')


Episode: 718, steps: 20035, epsilon: 0.972742, average reward: -1.224206.
Episode: 1457, steps: 40017, epsilon: 0.946378, average reward: -1.251165.
Episode: 2182, steps: 60001, epsilon: 0.920808, average reward: -1.238136.
Episode: 2875, steps: 80012, epsilon: 0.895978, average reward: -1.200988.
Episode: 3548, steps: 100006, epsilon: 0.871919, average reward: -1.157299.
Episode: 4207, steps: 120032, epsilon: 0.848551, average reward: -1.132718.
Episode: 4819, steps: 140004, epsilon: 0.825950, average reward: -1.058965.
Episode: 5430, steps: 160005, epsilon: 0.804000, average reward: -1.062778.
Episode: 6037, steps: 180017, epsilon: 0.782702, average reward: -1.038461.
Episode: 6611, steps: 200012, epsilon: 0.762066, average reward: -0.992688.
Episode: 7181, steps: 220008, epsilon: 0.742052, average reward: -0.982236.
Episode: 7723, steps: 240044, epsilon: 0.722606, average reward: -0.923957.
Episode: 8263, steps: 260031, epsilon: 0.703794, average reward: -0.911821.
Episode: 8786, steps: 280017, epsilon: 0.685551, average reward: -0.870779.
Episode: 9298, steps: 300008, epsilon: 0.667855, average reward: -0.836974.
Episode: 9776, steps: 320036, epsilon: 0.650662, average reward: -0.758708.
Episode: 10249, steps: 340041, epsilon: 0.634009, average reward: -0.732370.
Episode: 10721, steps: 360090, epsilon: 0.617824, average reward: -0.736512.
Episode: 11176, steps: 380019, epsilon: 0.602223, average reward: -0.690797.
Episode: 11629, steps: 400128, epsilon: 0.586957, average reward: -0.666337.
Episode: 12073, steps: 420002, epsilon: 0.572325, average reward: -0.667183.
Episode: 12535, steps: 440022, epsilon: 0.558031, average reward: -0.720751.
Episode: 12967, steps: 460056, epsilon: 0.544159, average reward: -0.608047.
Episode: 13385, steps: 480028, epsilon: 0.530749, average reward: -0.556201.
Episode: 13789, steps: 500035, epsilon: 0.517721, average reward: -0.506930.
Episode: 14182, steps: 520081, epsilon: 0.505062, average reward: -0.470994.
Episode: 14557, steps: 540103, epsilon: 0.492802, average reward: -0.381951.
Episode: 14934, steps: 560001, epsilon: 0.480986, average reward: -0.387621.
Episode: 15288, steps: 580042, epsilon: 0.469443, average reward: -0.291349.
Episode: 15633, steps: 600007, epsilon: 0.458293, average reward: -0.263519.
Episode: 15964, steps: 620091, epsilon: 0.447415, average reward: -0.179316.
Episode: 16292, steps: 640007, epsilon: 0.436954, average reward: -0.146795.
Episode: 16626, steps: 660033, epsilon: 0.426754, average reward: -0.180636.
Episode: 16913, steps: 680091, epsilon: 0.416846, average reward: 0.155630.
Episode: 17209, steps: 700074, epsilon: 0.407274, average reward: 0.099739.
Episode: 17485, steps: 720044, epsilon: 0.397998, average reward: 0.239702.
Episode: 17754, steps: 740035, epsilon: 0.388992, average reward: 0.312114.
Episode: 18034, steps: 760041, epsilon: 0.380252, average reward: 0.178870.
Episode: 18298, steps: 780067, epsilon: 0.371768, average reward: 0.327785.
Episode: 18584, steps: 800076, epsilon: 0.363547, average reward: 0.176827.
Episode: 18863, steps: 820172, epsilon: 0.355541, average reward: 0.240584.
Episode: 19106, steps: 840011, epsilon: 0.347876, average reward: 0.524671.
Episode: 19294, steps: 860118, epsilon: 0.340342, average reward: 1.418729.
Episode: 19482, steps: 880082, epsilon: 0.333089, average reward: 1.440568.
Episode: 19653, steps: 900133, epsilon: 0.326024, average reward: 1.825130.
Episode: 19863, steps: 920022, epsilon: 0.319227, average reward: 0.954244.
Episode: 20020, steps: 940351, epsilon: 0.312492, average reward: 2.220774.
Episode: 20201, steps: 960042, epsilon: 0.306165, average reward: 1.462366.
Episode: 20390, steps: 980141, epsilon: 0.299901, average reward: 1.349605.
Episode: 20528, steps: 1000004, epsilon: 0.293898, average reward: 2.602040.
Episode: 20682, steps: 1020065, epsilon: 0.288018, average reward: 2.161909.
Episode: 20833, steps: 1040119, epsilon: 0.282318, average reward: 2.350221.
Episode: 20946, steps: 1060230, epsilon: 0.276776, average reward: 3.956100.
Episode: 21120, steps: 1080008, epsilon: 0.271489, average reward: 1.701919.
Episode: 21264, steps: 1100103, epsilon: 0.266280, average reward: 2.632423.
Episode: 21414, steps: 1120009, epsilon: 0.261276, average reward: 2.354048.
Episode: 21550, steps: 1140047, epsilon: 0.256391, average reward: 2.765858.
Episode: 21665, steps: 1160104, epsilon: 0.251649, average reward: 3.782466.
Episode: 21789, steps: 1180039, epsilon: 0.247079, average reward: 3.386539.
Episode: 21866, steps: 1200042, epsilon: 0.242631, average reward: 7.136366.
Episode: 21944, steps: 1220501, epsilon: 0.238221, average reward: 6.887497.
Episode: 22019, steps: 1240069, epsilon: 0.234131, average reward: 6.895148.
Episode: 22091, steps: 1260590, epsilon: 0.229972, average reward: 7.997184.
Episode: 22161, steps: 1280460, epsilon: 0.226067, average reward: 7.776642.
Episode: 22237, steps: 1300250, epsilon: 0.222295, average reward: 7.200248.
Episode: 22295, steps: 1320142, epsilon: 0.218617, average reward: 10.295665.
Episode: 22361, steps: 1340565, epsilon: 0.214956, average reward: 8.899904.
Episode: 22442, steps: 1360096, epsilon: 0.211561, average reward: 6.417967.
Episode: 22517, steps: 1380060, epsilon: 0.208194, average reward: 7.474270.
Episode: 22586, steps: 1400012, epsilon: 0.204930, average reward: 8.183188.
Episode: 22643, steps: 1420371, epsilon: 0.201702, average reward: 10.896847.
Episode: 22700, steps: 1440572, epsilon: 0.198596, average reward: 10.729821.
Episode: 22745, steps: 1460418, epsilon: 0.195638, average reward: 14.358912.
Episode: 22800, steps: 1480309, epsilon: 0.192762, average reward: 11.245888.
Episode: 22852, steps: 1500002, epsilon: 0.190000, average reward: 11.856634.
Episode: 22921, steps: 1520024, epsilon: 0.187276, average reward: 8.477440.
Episode: 23064, steps: 1540195, epsilon: 0.184615, average reward: 2.860961.
Episode: 23185, steps: 1560474, epsilon: 0.182021, average reward: 3.688232.
Episode: 23246, steps: 1580136, epsilon: 0.179583, average reward: 9.714348.
Episode: 23329, steps: 1600387, epsilon: 0.177147, average reward: 6.449905.
Episode: 23393, steps: 1620020, epsilon: 0.174856, average reward: 8.797979.
Episode: 23461, steps: 1640030, epsilon: 0.172592, average reward: 8.293289.
Episode: 23532, steps: 1660199, epsilon: 0.170379, average reward: 7.849850.
Episode: 23717, steps: 1680041, epsilon: 0.168268, average reward: 1.562792.
Episode: 23901, steps: 1700068, epsilon: 0.166201, average reward: 1.509919.
Episode: 24028, steps: 1720311, epsilon: 0.164175, average reward: 3.394587.
Episode: 24171, steps: 1740097, epsilon: 0.162256, average reward: 2.581383.
Episode: 24349, steps: 1760098, epsilon: 0.160373, average reward: 1.620501.
Episode: 24453, steps: 1780394, epsilon: 0.158521, average reward: 4.655454.
Episode: 24530, steps: 1800096, epsilon: 0.156778, average reward: 6.993054.
Episode: 24574, steps: 1820009, epsilon: 0.155068, average reward: 14.689843.
Episode: 24617, steps: 1840490, epsilon: 0.153364, average reward: 15.524565.
Episode: 24656, steps: 1860571, epsilon: 0.151744, average reward: 17.150088.
Episode: 24720, steps: 1880287, epsilon: 0.150202, average reward: 9.120932.
Episode: 24780, steps: 1900189, epsilon: 0.148691, average reward: 9.953159.
Episode: 24818, steps: 1920218, epsilon: 0.147217, average reward: 18.060605.
Episode: 24858, steps: 1940099, epsilon: 0.145798, average reward: 16.593031.
Episode: 24904, steps: 1960178, epsilon: 0.144408, average reward: 13.978157.
Episode: 24953, steps: 1980114, epsilon: 0.143069, average reward: 12.740658.
Done.

In [7]:
plt.plot([e[1]/1000 for e in episodes], [e[2] for e in episodes])
plt.xlabel('steps x 1000');plt.ylabel('reward')


Out[7]:
<matplotlib.text.Text at 0x6b0780b8>


In [9]:
while episode_number < 30000 and agent.steps < 2980000:
    episode_number += 1
    reward = run_episode(agent, render=False)
    reward_sum += reward
    
    if agent.steps >= last_batch_steps + BATCH_SIZE:
        reward_avg = reward_sum / (episode_number - last_batch_episode)
        last_batch_episode = episode_number
        last_batch_steps = int(agent.steps / BATCH_SIZE) * BATCH_SIZE
        episodes.append((episode_number, agent.steps, reward_avg))

        print('Episode: %d, steps: %d, epsilon: %f, average reward: %f.' \
              % (episode_number, agent.steps, agent.epsilon, reward_avg))
        
        if reward_avg > reward_best:
            reward_best = reward_avg
            agent.brain.model.save_model('best.mod', False)

        reward_sum = 0
       
agent.brain.model.save_model('last.mod', False)
print('Done.')


Episode: 806, steps: 20012, epsilon: 0.964170, average reward: -1.314449.
Episode: 1568, steps: 40005, epsilon: 0.929668, average reward: -1.267090.
Episode: 2316, steps: 60007, epsilon: 0.896400, average reward: -1.244711.
Episode: 3040, steps: 80006, epsilon: 0.864339, average reward: -1.227046.
Episode: 3743, steps: 100024, epsilon: 0.833410, average reward: -1.192490.
Episode: 4381, steps: 120006, epsilon: 0.803653, average reward: -1.094363.
Episode: 5019, steps: 140019, epsilon: 0.774927, average reward: -1.096916.
Episode: 5626, steps: 160011, epsilon: 0.747270, average reward: -1.045155.
Episode: 6252, steps: 180022, epsilon: 0.720588, average reward: -1.066209.
Episode: 6861, steps: 200021, epsilon: 0.694886, average reward: -1.040312.
Episode: 7435, steps: 220058, epsilon: 0.670068, average reward: -0.981508.
Episode: 7956, steps: 240078, epsilon: 0.646169, average reward: -0.860671.
Episode: 8452, steps: 260060, epsilon: 0.623179, average reward: -0.797081.
Episode: 8938, steps: 280012, epsilon: 0.601052, average reward: -0.773937.
Episode: 9422, steps: 300011, epsilon: 0.579674, average reward: -0.749323.
Episode: 9896, steps: 320020, epsilon: 0.559059, average reward: -0.733105.
Episode: 10341, steps: 340046, epsilon: 0.539174, average reward: -0.630904.
Episode: 10776, steps: 360038, epsilon: 0.520041, average reward: -0.615549.
Episode: 11182, steps: 380024, epsilon: 0.501605, average reward: -0.499242.
Episode: 11568, steps: 400020, epsilon: 0.483826, average reward: -0.393527.
Episode: 11928, steps: 420049, epsilon: 0.466663, average reward: -0.274655.
Episode: 12286, steps: 440055, epsilon: 0.450140, average reward: -0.281907.
Episode: 12622, steps: 460048, epsilon: 0.434226, average reward: -0.133299.
Episode: 12945, steps: 480031, epsilon: 0.418894, average reward: -0.087263.
Episode: 13255, steps: 500120, epsilon: 0.404039, average reward: 0.006544.
Episode: 13520, steps: 520101, epsilon: 0.389800, average reward: 0.391778.
Episode: 13776, steps: 540037, epsilon: 0.376105, average reward: 0.473756.
Episode: 14019, steps: 560011, epsilon: 0.362879, average reward: 0.636351.
Episode: 14234, steps: 580016, epsilon: 0.350112, average reward: 0.975623.
Episode: 14452, steps: 600022, epsilon: 0.337807, average reward: 0.954006.
Episode: 14654, steps: 620064, epsilon: 0.325925, average reward: 1.251441.
Episode: 14845, steps: 640087, epsilon: 0.314485, average reward: 1.421935.
Episode: 15027, steps: 660045, epsilon: 0.303494, average reward: 1.566506.
Episode: 15210, steps: 680033, epsilon: 0.292884, average reward: 1.542400.
Episode: 15374, steps: 700256, epsilon: 0.282540, average reward: 2.052173.
Episode: 15517, steps: 720029, epsilon: 0.272792, average reward: 2.586585.
Episode: 15672, steps: 740073, epsilon: 0.263266, average reward: 2.282690.
Episode: 15797, steps: 760040, epsilon: 0.254120, average reward: 3.382139.
Episode: 15923, steps: 780111, epsilon: 0.245259, average reward: 3.388498.
Episode: 16053, steps: 800055, epsilon: 0.236773, average reward: 3.166681.
Episode: 16162, steps: 820040, epsilon: 0.228576, average reward: 4.191718.
Episode: 16284, steps: 840054, epsilon: 0.220665, average reward: 3.485517.
Episode: 16404, steps: 860085, epsilon: 0.213033, average reward: 3.547644.
Episode: 16501, steps: 880105, epsilon: 0.205682, average reward: 4.981430.
Episode: 16614, steps: 900052, epsilon: 0.198623, average reward: 3.881476.
Episode: 16704, steps: 920023, epsilon: 0.191810, average reward: 5.576601.
Episode: 16799, steps: 940390, epsilon: 0.185115, average reward: 5.215137.
Episode: 16887, steps: 960284, epsilon: 0.178814, average reward: 5.675337.
Episode: 16963, steps: 980489, epsilon: 0.172646, average reward: 7.176975.
Episode: 17026, steps: 1000490, epsilon: 0.166763, average reward: 9.227968.
Episode: 17085, steps: 1020379, epsilon: 0.161123, average reward: 9.916854.
Episode: 17146, steps: 1040091, epsilon: 0.155735, average reward: 9.408133.
Episode: 17211, steps: 1060313, epsilon: 0.150406, average reward: 8.969985.
Episode: 17270, steps: 1080478, epsilon: 0.145286, average reward: 9.953835.
Episode: 17328, steps: 1100024, epsilon: 0.140502, average reward: 10.276583.
Episode: 17379, steps: 1120422, epsilon: 0.135689, average reward: 12.669610.
Episode: 17430, steps: 1140314, epsilon: 0.131167, average reward: 12.270737.
Episode: 17482, steps: 1160085, epsilon: 0.126833, average reward: 11.671696.
Episode: 17535, steps: 1180014, epsilon: 0.122622, average reward: 11.975995.
Episode: 17581, steps: 1200383, epsilon: 0.118475, average reward: 14.133583.
Episode: 17626, steps: 1220071, epsilon: 0.114611, average reward: 14.210104.
Episode: 17670, steps: 1240216, epsilon: 0.110800, average reward: 15.101350.
Episode: 17711, steps: 1260307, epsilon: 0.107138, average reward: 16.540284.
Episode: 17751, steps: 1280553, epsilon: 0.103582, average reward: 17.141953.
Episode: 17793, steps: 1300380, epsilon: 0.100226, average reward: 15.744951.
Episode: 17830, steps: 1320375, epsilon: 0.096963, average reward: 18.829577.
Episode: 17867, steps: 1340306, epsilon: 0.093828, average reward: 18.602951.
Episode: 17903, steps: 1360409, epsilon: 0.090781, average reward: 19.541556.
Episode: 17942, steps: 1380253, epsilon: 0.087881, average reward: 17.475310.
Episode: 17983, steps: 1400095, epsilon: 0.085086, average reward: 16.419630.
Episode: 18019, steps: 1420422, epsilon: 0.082327, average reward: 20.030583.
Episode: 18055, steps: 1440499, epsilon: 0.079701, average reward: 19.754224.
Episode: 18092, steps: 1460318, epsilon: 0.077202, average reward: 18.923814.
Episode: 18128, steps: 1480265, epsilon: 0.074777, average reward: 19.340988.
Episode: 18165, steps: 1500297, epsilon: 0.072431, average reward: 19.053294.
Episode: 18199, steps: 1520225, epsilon: 0.070180, average reward: 20.868038.
Episode: 18234, steps: 1540020, epsilon: 0.068026, average reward: 19.991926.
Episode: 18271, steps: 1560569, epsilon: 0.065870, average reward: 19.540105.
Episode: 18305, steps: 1580557, epsilon: 0.063850, average reward: 21.160380.
Episode: 18339, steps: 1600015, epsilon: 0.061955, average reward: 20.350771.
Episode: 18373, steps: 1620415, epsilon: 0.060038, average reward: 21.605614.
Episode: 18409, steps: 1640582, epsilon: 0.058214, average reward: 19.655399.
Episode: 18444, steps: 1660067, epsilon: 0.056514, average reward: 19.807977.
Episode: 18478, steps: 1680290, epsilon: 0.054813, average reward: 21.169176.
Episode: 18512, steps: 1700283, epsilon: 0.053193, average reward: 21.019891.
Episode: 18547, steps: 1720473, epsilon: 0.051616, average reward: 20.724094.
Episode: 18583, steps: 1740471, epsilon: 0.050111, average reward: 19.759975.
Episode: 18616, steps: 1760271, epsilon: 0.048674, average reward: 21.368242.
Episode: 18651, steps: 1780228, epsilon: 0.047278, average reward: 20.304424.
Episode: 18686, steps: 1800406, epsilon: 0.045918, average reward: 20.739795.
Episode: 18722, steps: 1820562, epsilon: 0.044609, average reward: 19.962188.
Episode: 18755, steps: 1840160, epsilon: 0.043382, average reward: 21.648067.
Episode: 18789, steps: 1860060, epsilon: 0.042180, average reward: 21.196176.
Episode: 18823, steps: 1880460, epsilon: 0.040993, average reward: 21.870012.
Episode: 18857, steps: 1900471, epsilon: 0.039872, average reward: 20.845313.
Episode: 18891, steps: 1920341, epsilon: 0.038798, average reward: 20.792393.
Episode: 18925, steps: 1940172, epsilon: 0.037765, average reward: 20.612579.
Episode: 18959, steps: 1960572, epsilon: 0.036741, average reward: 21.402128.
Episode: 18994, steps: 1980475, epsilon: 0.035778, average reward: 20.463964.
Episode: 19027, steps: 2000275, epsilon: 0.034855, average reward: 21.461197.
Episode: 19060, steps: 2020075, epsilon: 0.033965, average reward: 21.361877.
Episode: 19094, steps: 2040475, epsilon: 0.033081, average reward: 21.201151.
Episode: 19129, steps: 2060466, epsilon: 0.032247, average reward: 19.997824.
Episode: 19162, steps: 2080266, epsilon: 0.031450, average reward: 21.078323.
Episode: 19195, steps: 2100066, epsilon: 0.030682, average reward: 21.623385.
Episode: 19229, steps: 2120466, epsilon: 0.029919, average reward: 21.369890.
Episode: 19262, steps: 2140266, epsilon: 0.029205, average reward: 21.502574.
Episode: 19295, steps: 2160066, epsilon: 0.028518, average reward: 21.541886.
Episode: 19329, steps: 2180466, epsilon: 0.027835, average reward: 21.110788.
Episode: 19362, steps: 2200266, epsilon: 0.027196, average reward: 21.301970.
Episode: 19396, steps: 2220137, epsilon: 0.026578, average reward: 20.318614.
Episode: 19430, steps: 2240537, epsilon: 0.025966, average reward: 21.972949.
Episode: 19463, steps: 2260337, epsilon: 0.025395, average reward: 21.822823.
Episode: 19497, steps: 2280207, epsilon: 0.024841, average reward: 21.130402.
Episode: 19532, steps: 2300078, epsilon: 0.024308, average reward: 19.986867.
Episode: 19566, steps: 2320478, epsilon: 0.023780, average reward: 21.988445.
Episode: 19599, steps: 2340278, epsilon: 0.023287, average reward: 21.976736.
Episode: 19633, steps: 2360095, epsilon: 0.022810, average reward: 21.372406.
Episode: 19667, steps: 2380495, epsilon: 0.022338, average reward: 21.668707.
Episode: 19700, steps: 2400295, epsilon: 0.021896, average reward: 20.524818.
Episode: 19733, steps: 2420095, epsilon: 0.021470, average reward: 21.343318.
Episode: 19767, steps: 2440495, epsilon: 0.021047, average reward: 21.967237.
Episode: 19801, steps: 2460442, epsilon: 0.020648, average reward: 21.211278.
Episode: 19834, steps: 2480242, epsilon: 0.020267, average reward: 20.513254.
Episode: 19868, steps: 2500113, epsilon: 0.019898, average reward: 21.083357.
Episode: 19903, steps: 2520583, epsilon: 0.019532, average reward: 20.038888.
Episode: 19937, steps: 2540582, epsilon: 0.019187, average reward: 21.413617.
Episode: 19972, steps: 2560533, epsilon: 0.018855, average reward: 19.901994.
Episode: 20006, steps: 2580436, epsilon: 0.018537, average reward: 21.236916.
Episode: 20039, steps: 2600236, epsilon: 0.018231, average reward: 20.967074.
Episode: 20073, steps: 2620053, epsilon: 0.017936, average reward: 21.078987.
Episode: 20108, steps: 2640524, epsilon: 0.017642, average reward: 20.687864.
Episode: 20141, steps: 2660324, epsilon: 0.017368, average reward: 20.861041.
Episode: 20174, steps: 2680124, epsilon: 0.017105, average reward: 21.678740.
Episode: 20208, steps: 2700524, epsilon: 0.016843, average reward: 21.549884.
Episode: 20241, steps: 2720324, epsilon: 0.016597, average reward: 21.531291.
Episode: 20276, steps: 2740289, epsilon: 0.016359, average reward: 19.984779.
Episode: 20309, steps: 2760089, epsilon: 0.016131, average reward: 20.775554.
Episode: 20344, steps: 2780586, epsilon: 0.015904, average reward: 20.928459.
Episode: 20377, steps: 2800258, epsilon: 0.015694, average reward: 20.921902.
Episode: 20411, steps: 2820130, epsilon: 0.015489, average reward: 20.441248.
Episode: 20445, steps: 2840530, epsilon: 0.015287, average reward: 21.524364.
Episode: 20478, steps: 2860330, epsilon: 0.015098, average reward: 21.496326.
Episode: 20512, steps: 2880229, epsilon: 0.014914, average reward: 20.891398.
Episode: 20545, steps: 2900029, epsilon: 0.014738, average reward: 21.723245.
Episode: 20579, steps: 2920429, epsilon: 0.014563, average reward: 21.604473.
Episode: 20612, steps: 2940229, epsilon: 0.014400, average reward: 21.203717.
Episode: 20645, steps: 2960029, epsilon: 0.014242, average reward: 21.506767.
Episode: 20679, steps: 2980429, epsilon: 0.014086, average reward: 20.863255.
Done.

In [10]:
plt.plot([e[1]/1000 for e in episodes], [e[2] for e in episodes])
plt.xlabel('steps x 1000');plt.ylabel('reward')
plt.savefig('learning.png', dpi=300)



In [11]:
slice_plot(n=20000, size=5, alpha=0.5)
plt.savefig('qslice.png', dpi=300)



In [12]:
pca_plot(n=20000, size=5, alpha=0.5)
plt.savefig('pca.png', dpi=300)


Exploration - exploitation trade-off

Note that $\epsilon$ starts at 1, so the agent initially explores exclusively; as the step count grows, we reduce $\epsilon$ and the agent increasingly exploits the learned value function to collect rewards.
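
For reference, a minimal sketch of an $\epsilon$-greedy action choice (the actual selection logic lives in agent.act; the helper below is purely illustrative):

import numpy as np

def epsilon_greedy_action(q_values, epsilon):
    # q_values: 1-D array of Q(s, a) for the current state
    if np.random.rand() < epsilon:
        return np.random.randint(len(q_values))   # explore: uniform random action
    return int(np.argmax(q_values))               # exploit: greedy action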


In [9]:
def epsilon(steps):
    # MIN_EPSILON, MAX_EPSILON and LAMBDA are the agent's exploration
    # hyperparameters, defined outside this notebook cell.
    return MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * np.exp(-LAMBDA * steps)

r = range(0,EXPLORATION_STOP,int(EXPLORATION_STOP/100))
plt.plot(r, [min(epsilon(x),1) for x in r], 'r')
#plt.plot(r, [min(epsilon(x),1)**EPSILON_TRAIN_FACTOR for x in r], 'b')
plt.xlabel('step');plt.ylabel('$\epsilon$')


Out[9]:
<matplotlib.text.Text at 0xfe817b8>

Discounted Reward

We tune $\gamma$ so that the agent looks ahead only over a short time horizon, within which the current action still has a significant influence.
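
As a worked example (assuming the 60 simulation steps per second implied by the plot below), with $\gamma = 0.98$ a reward one second ahead is weighted by about 0.30 and a reward five seconds ahead by about 0.002, so the agent effectively optimizes over only a few seconds of driving:

\begin{equation} 0.98^{60} \approx 0.298, \qquad 0.98^{300} \approx 0.0023 \end{equation}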


In [10]:
r = range(0, 600)  # 600 steps, i.e. about 10 s at 60 steps per second
plt.plot([t/60.0 for t in r], [GAMMA ** x for x in r], 'r')  # GAMMA is the agent's discount factor
plt.xlabel('time [s]');plt.ylabel('discount')
GAMMA


Out[10]:
0.98
