Frameworks - we'll accept this homework in any deep learning framework. For example, it translates to TensorFlow almost line for line. However, we recommend that you stick to Theano/Lasagne unless you're confident in your skills with the framework of your choice.
%env THEANO_FLAGS = 'floatX=float32'
import os
if type(os.environ.get("DISPLAY")) is not str or len(os.environ.get("DISPLAY")) == 0:
!bash ../xvfb start
os.environ['DISPLAY'] = ':1'
import gym
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Build the CartPole-v0 environment and keep the object stored in its `.env` attribute.
# NOTE(review): presumably this unwraps gym's outer wrapper (e.g. the episode time limit) — confirm.
env = gym.make("CartPole-v0").env
# number of discrete actions; used below as the number of network outputs
n_actions = env.action_space.n
# shape of a single observation, as a tuple (it is concatenated with (None,) for the input layer)
state_dim = env.observation_space.shape
The first step is to initialize the input variables.
import theano
import theano.tensor as T
# create input variables. We'll support multiple states at once
# (each variable is batched: one entry/row per transition, as its name string indicates)
# states the agent acted in
current_states = T.matrix("states[batch,units]")
# integer ids of the actions taken in those states
actions = T.ivector("action_ids[batch]")
# rewards received for those actions
rewards = T.vector("rewards[batch]")
# states observed after acting
next_states = T.matrix("next states[batch,units]")
# integer flags: 1 if the session ended on this transition
is_end = T.ivector("vector[batch] where 1 means that session just ended")
import lasagne
from lasagne.layers import *
# input layer: shape is (None,) + state_dim
# (in Lasagne a None dimension is presumably a variable batch size — confirm)
l_states = InputLayer((None,)+state_dim)
# student placeholder: hidden layer(s) between l_states and the output layer go here
<Your architecture. Please start with a single-layer network>
# output layer: one unit per action, with no nonlinearity applied (nonlinearity=None)
l_qvalues = DenseLayer( <previous_layer> , num_units=n_actions, nonlinearity=None)
# get q-values for ALL actions in current_states
# (the dict feeds the symbolic current_states into the input layer)
predicted_qvalues = get_output(l_qvalues, {l_states: current_states})
# compiling agent's "GetQValues" function
get_qvalues = <compile a function that takes current_states and returns predicted_qvalues>
# select q-values for chosen actions:
# for every batch row i, pick the entry in column actions[i]
predicted_qvalues_for_actions = predicted_qvalues[T.arange(
actions.shape[0]), actions]
# predict q-values for next states
predicted_next_qvalues = get_output(l_qvalues, {l_states: <theano input for next states>})
# Compute target q-values for the chosen actions (gamma is the discount factor)
# constant for the target computation below
# NOTE(review): presumably the discount factor for future rewards — confirm once the placeholder is filled in
gamma = 0.99
# student placeholder: the target is built from rewards and predicted_next_qvalues
target_qvalues_for_actions = <target Q-values using rewards and predicted_next_qvalues>
# zero-out q-values at the end:
# (1-is_end) is 0 where is_end is 1 (session just ended) and 1 where is_end is 0
# NOTE(review): this multiplies the WHOLE target, so any reward term inside it is
# zeroed on terminal transitions as well — confirm this is intended.
target_qvalues_for_actions = (1-is_end)*target_qvalues_for_actions
# don't compute gradient over target q-values (consider constant):
# disconnected_grad returns the same expression but blocks backpropagation
# through it, so only predicted_qvalues_for_actions receives gradient.
target_qvalues_for_actions = theano.gradient.disconnected_grad(
    target_qvalues_for_actions)
# mean squared error loss function
# (student placeholder: should reduce to a single scalar over the batch)
loss = <mean squared between target_qvalues_for_actions and predicted_qvalues_for_actions>
# all network weights: trainable parameters of every layer feeding l_qvalues
all_weights = get_all_params(l_qvalues, trainable=True)
# network updates. Note the small learning rate (for stability)
# plain SGD update rules for all_weights with respect to loss
updates = lasagne.updates.sgd(loss, all_weights, learning_rate=1e-4)
# Training function that resembles agent.update(state,action,reward,next_state)
# with 1 more argument meaning is_end.
# It has no outputs: each call only applies the SGD `updates` to the network weights.
# allow_input_downcast=True lets callers pass plain Python lists / numpy arrays
# (float64 observations, Python ints, bool `done` flags) even though the
# symbolic inputs are declared with narrower dtypes (floatX matrices, int32 vectors).
train_step = theano.function([current_states, actions, rewards, next_states, is_end],
                             updates=updates,
                             allow_input_downcast=True)
epsilon = 0.25 # initial epsilon
def generate_session(t_max=1000):
"""play env with approximate q-learning agent and train it at the same time"""
total_reward = 0
s = env.reset()
for t in range(t_max):
# get action q-values from the network
q_values = get_qvalues([s])[0]
a = <YOUR CODE: epsilon-greedily selected action>
new_s, r, done, info = env.step(a)
# train agent one step. Note that we use one-element arrays instead of scalars
# because that's what function accepts.
train_step([s], [a], [r], [new_s], [done])
total_reward += r
s = new_s
if done:
return total_reward
# Train for up to 100 epochs of 100 sessions each, decaying exploration after every epoch.
for i in range(100):
    # generate new sessions (each session also trains the agent step by step).
    # A separate name is used so the symbolic `rewards` input variable
    # defined earlier is not overwritten by a list of floats.
    session_rewards = [generate_session() for _ in range(100)]
    mean_reward = np.mean(session_rewards)

    epsilon *= 0.95

    print("mean reward:%.3f\tepsilon:%.5f" % (mean_reward, epsilon))

    if mean_reward > 300:
        print("You Win!")
        # the goal is reached: stop instead of running the remaining epochs
        break

    assert epsilon != 0, "Please explore environment"
epsilon = 0  # Don't forget to reset epsilon back to initial value if you want to go on training

# record sessions
import gym.wrappers
# generate_session reads the global `env`, so rebinding it makes the sessions
# below run inside the Monitor wrapper, which writes into ./videos
# (force=True: existing recordings in that directory are presumably cleared — confirm).
env = gym.wrappers.Monitor(gym.make("CartPole-v0"),
                           directory="videos", force=True)
sessions = [generate_session() for _ in range(100)]
# close the monitor so that pending video files are finalized on disk
env.close()
# show video
from IPython.display import HTML
import os

# os.listdir returns names in arbitrary order; sorting makes the index
# used below reproducible between runs.
video_names = sorted(
    name for name in os.listdir("./videos/") if name.endswith(".mp4"))

# Embed the chosen recording in the notebook output as an HTML5 video player.
HTML("""
<video width="640" height="480" controls>
  <source src="{}" type="video/mp4">
</video>
""".format("./videos/" + video_names[-1]))  # this may or may not be _last_ video. Try other indices
