In [1]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
style.use('ggplot')
%matplotlib inline
sns.set()
In [2]:
def pca_plot(n=10000, alpha=1.0, size=5):
    # Sample states from the replay memory and query the network's Q-values.
    _, samples = agent.memory.sample(n)
    states = np.array([o[0] for o in samples], dtype=np.float32)
    qsa = agent.brain.predict(states)[0]
    # Project the states onto their first two principal components.
    from sklearn import decomposition
    pca = decomposition.PCA(n_components=2)
    pca.fit(states)
    X = pca.transform(states)
    # One panel for the greedy action, one for each action's Q-value.
    fig, axes = plt.subplots(nrows=2, ncols=2, sharex=True, sharey=True)
    ax = axes[0, 0]; plt.sca(ax); ax.set_title('Action')
    plt.scatter(X[:, 0], X[:, 1], c=np.argmax(qsa, 1), alpha=alpha, s=size, cmap="rainbow")
    ax = axes[0, 1]; plt.sca(ax); ax.set_title('Q(s,no-change)')
    plt.scatter(X[:, 0], X[:, 1], c=qsa[:, 0], alpha=alpha, s=size, cmap="rainbow")
    ax = axes[1, 0]; plt.sca(ax); ax.set_title('Q(s,left)')
    plt.scatter(X[:, 0], X[:, 1], c=qsa[:, 1], alpha=alpha, s=size, cmap="rainbow")
    ax = axes[1, 1]; plt.sca(ax); ax.set_title('Q(s,right)')
    plt.scatter(X[:, 0], X[:, 1], c=qsa[:, 2], alpha=alpha, s=size, cmap="rainbow")
For a more meaningful plot, project the samples directly onto the first two state components, $x_m$ and $y_m$, instead of the principal components.
In [3]:
def slice_plot(n=10000, alpha=1.0, size=5):
    # Sample states from the replay memory and query the network's Q-values.
    _, samples = agent.memory.sample(n)
    states = np.array([o[0] for o in samples], dtype=np.float32)
    qsa = agent.brain.predict(states)[0]
    # Plot directly in the (x_m, y_m) plane: greedy action plus each Q-value.
    fig, axes = plt.subplots(nrows=2, ncols=2, sharex=True, sharey=True)
    ax = axes[0, 0]; plt.sca(ax); ax.set_title('Action')
    plt.scatter(states[:, 0], states[:, 1], c=np.argmax(qsa, 1), alpha=alpha, s=size, cmap="rainbow")
    ax = axes[0, 1]; plt.sca(ax); ax.set_title('Q(s,no-change)')
    plt.scatter(states[:, 0], states[:, 1], c=qsa[:, 0], alpha=alpha, s=size, cmap="rainbow")
    ax = axes[1, 0]; plt.sca(ax); ax.set_title('Q(s,left)')
    plt.scatter(states[:, 0], states[:, 1], c=qsa[:, 1], alpha=alpha, s=size, cmap="rainbow")
    ax = axes[1, 1]; plt.sca(ax); ax.set_title('Q(s,right)')
    plt.scatter(states[:, 0], states[:, 1], c=qsa[:, 2], alpha=alpha, s=size, cmap="rainbow")
    axes[0, 0].set_ylabel('$y_m$')
    axes[1, 0].set_ylabel('$y_m$')
    axes[1, 0].set_xlabel('$x_m$')
    axes[1, 1].set_xlabel('$x_m$')
In [4]:
def run_episode(agent, render=False):
    # Play a single episode, feeding every transition to the agent.
    s = env.reset()
    R = 0
    while True:
        if render: env.render()
        a = agent.act(s.astype(np.float32))
        s_, r, done, info = env.step(a)
        if done:
            s_ = None  # mark the terminal state
        agent.observe((s, a, r, s_))
        s = s_
        R += r
        if done:
            agent.endEpisode()
            return R
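run_episode only touches a small surface of the Agent class: act, observe and endEpisode, plus the steps and epsilon counters read by the training loop below. As a minimal sketch of that assumed interface (not the real agent.py, which trains a Q-network), a random-action stand-in can be used to smoke-test the episode loop:

import random

class RandomAgent:
    """Drop-in stub exposing the Agent surface used in this notebook."""
    def __init__(self, stateCnt, actionCnt):
        self.actionCnt = actionCnt
        self.steps = 0        # incremented by act(), read by the training loop
        self.epsilon = 1.0    # a real agent decays this; the stub always explores

    def act(self, s):
        self.steps += 1
        return random.randint(0, self.actionCnt - 1)

    def observe(self, sample):
        pass  # a real agent stores (s, a, r, s_) and trains on replayed batches

    def endEpisode(self):
        pass  # per-episode bookkeeping (e.g. epsilon decay) would go here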
In [5]:
from DriveItGym import DriveItEnv
from agent import Agent
BATCH_SIZE = 20000
env = DriveItEnv(time_limit=10.0, throttle_limit=1.0)
stateCnt = env.observation_space.shape[0]
actionCnt = env.action_space.n
agent = Agent(stateCnt, actionCnt)
episode_number = 0
last_batch_episode = 0
last_batch_steps = 0
episodes = []
reward_sum = 0
reward_best = 18.0
(stateCnt, actionCnt)
Out[5]:
In [6]:
while episode_number < 50000 and agent.steps < 1980000:
    episode_number += 1
    reward = run_episode(agent, render=False)
    reward_sum += reward
    # Report and checkpoint once per batch of BATCH_SIZE agent steps.
    if agent.steps >= last_batch_steps + BATCH_SIZE:
        reward_avg = reward_sum / (episode_number - last_batch_episode)
        last_batch_episode = episode_number
        last_batch_steps = int(agent.steps / BATCH_SIZE) * BATCH_SIZE
        episodes.append((episode_number, agent.steps, reward_avg))
        print('Episode: %d, steps: %d, epsilon: %f, average reward: %f.'
              % (episode_number, agent.steps, agent.epsilon, reward_avg))
        if reward_avg > reward_best:
            reward_best = reward_avg
            agent.brain.model.save_model('best.mod')
        reward_sum = 0

agent.brain.model.save_model('last.mod')
print('Done.')
In [7]:
plt.plot([e[1]/1000 for e in episodes], [e[2] for e in episodes])
plt.xlabel('steps x 1000');plt.ylabel('reward')
Out[7]:
In [9]:
while episode_number < 30000 and agent.steps < 2980000:
    episode_number += 1
    reward = run_episode(agent, render=False)
    reward_sum += reward
    # Same reporting and checkpointing scheme as the previous training run.
    if agent.steps >= last_batch_steps + BATCH_SIZE:
        reward_avg = reward_sum / (episode_number - last_batch_episode)
        last_batch_episode = episode_number
        last_batch_steps = int(agent.steps / BATCH_SIZE) * BATCH_SIZE
        episodes.append((episode_number, agent.steps, reward_avg))
        print('Episode: %d, steps: %d, epsilon: %f, average reward: %f.'
              % (episode_number, agent.steps, agent.epsilon, reward_avg))
        if reward_avg > reward_best:
            reward_best = reward_avg
            agent.brain.model.save_model('best.mod', False)
        reward_sum = 0

agent.brain.model.save_model('last.mod', False)
print('Done.')
In [10]:
plt.plot([e[1]/1000 for e in episodes], [e[2] for e in episodes])
plt.xlabel('steps x 1000');plt.ylabel('reward')
plt.savefig('learning.png', dpi=300)
In [11]:
slice_plot(n=20000, size=5, alpha=0.5)
plt.savefig('qslice.png', dpi=300)
In [12]:
pca_plot(n=20000, size=5, alpha=0.5)
plt.savefig('pca.png', dpi=300)
In [9]:
def epsilon(steps):
    return MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * np.exp(-LAMBDA * steps)

r = range(0, EXPLORATION_STOP, int(EXPLORATION_STOP / 100))
plt.plot(r, [min(epsilon(x), 1) for x in r], 'r')
#plt.plot(r, [min(epsilon(x),1)**EPSILON_TRAIN_FACTOR for x in r], 'b')
plt.xlabel('step'); plt.ylabel(r'$\epsilon$')
Out[9]:
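The schedule above is the standard exponential exploration decay: as a function of the agent's step count $s$,

$$\epsilon(s) = \epsilon_{\min} + (\epsilon_{\max} - \epsilon_{\min})\, e^{-\lambda s},$$

so exploration starts near MAX_EPSILON and decays towards MIN_EPSILON at a rate set by LAMBDA. These constants (along with EXPLORATION_STOP) are the agent's exploration hyper-parameters and are assumed to be in scope here, e.g. imported from the agent module.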
In [10]:
r = range(0,600)
plt.plot([t/60.0 for t in r], [GAMMA ** x for x in r], 'r')
plt.xlabel('time [s]');plt.ylabel('discount')
GAMMA
Out[10]:
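As plotted, the per-step discount GAMMA is converted to wall-clock time at 60 simulation steps per second, so a reward $t$ seconds in the future is weighted by $\gamma^{60t}$; the curve shows how quickly future rewards fade over the 10-second episode at the chosen GAMMA.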