We are using VW (Vowpal Wabbit) for the value estimate. The environment returns only +20 for a win, -20 for a loss, or -1 otherwise (no continuous/shaped reward).
The design matrix is a simple sparse representation of the state (the objects and their positions), interacted with the action; a rough sketch of this encoding is shown below. GridWorld is a 4x4 grid with 4 objects (player, wall, win, pit), and when all objects can be initialized randomly the number of possible states runs into the hundreds of thousands (24P4 ≈ 255K).
Summary so far: the trained model wins about 99% of games, versus about 30% for a random policy.
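To make the design-matrix idea concrete, here is a minimal sketch of how one (state, action) pair could be serialized as a sparse VW example line. The make_vw_example helper, the namespace letters s/a, and the feature names are hypothetical illustrations, not the actual feature builder used in this repo; the idea is simply indicator features for each object's cell, crossed with the chosen action (e.g. via -q sa or --lrq).

def make_vw_example(label, positions, action):
    """Serialize one (state, action) pair as a sparse VW example line.

    positions: dict like {'player': (3, 3), 'wall': (1, 1), ...} giving each
               object's (row, col) on the 4x4 grid
    action:    e.g. 'up', 'down', 'left', 'right'
    Putting state features in namespace 's' and the action in namespace 'a'
    lets VW build state x action interactions (-q sa or --lrq sa<rank>).
    """
    state_feats = ' '.join('{}_{}_{}'.format(obj, r, c)
                           for obj, (r, c) in sorted(positions.items()))
    return '{} |s {} |a {}'.format(label, state_feats, action)

# A winning transition would carry the +20 target as its label
print(make_vw_example(20, {'player': (3, 3), 'wall': (1, 1), 'win': (3, 3), 'pit': (0, 2)}, 'right'))
# 20 |s pit_0_2 player_3_3 wall_1_1 win_3_3 |a right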
In [1]:
# Need /game_playing in path
import os, sys, inspect
# Add the directory two levels up from this notebook to sys.path
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(os.path.dirname(currentdir))
sys.path.insert(0, parentdir)
In [2]:
from rl_learning import RLAgent
from bandits import BanditAlgorithm
In [16]:
from environments.gridworld import GridWorld
env = GridWorld()
# Most of these params only matter during training; they are not really required for testing
rl_params = {'experience_replay_size': 200, 'batchsize': 20, 'gamma': 0.9, 'skip_frames': 1, 'max_steps': 30,
             'minibatch_method': 'random', 'train_model_after_samples': 1}
model_params = {'class': 'vw_python', 'base_folder_name': env.base_folder_name, 'loss_function': 'squared',
                'l2': 0.0000000001, 'lrq': 'sdsd300', 'b': 20, 'l': 0.5}
bandit_params = 0.9
# Initialize RL agent
rl_agent = RLAgent(experience_replay_size=rl_params['experience_replay_size'], batchsize=rl_params['batchsize'],
                   gamma=rl_params['gamma'], skip_frames=rl_params['skip_frames'], max_steps=rl_params['max_steps'])
rl_agent.initialize(model_params, bandit_params, test=True)
stat = rl_agent.test_q_function(env, test_games=2, render=True)
print(stat)
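For reference, bandit_params = 0.9 is handed to the BanditAlgorithm; a plausible reading is an epsilon-style exploration setting, which a test run would presumably keep near zero so the learned Q-function is evaluated greedily. Below is a minimal, self-contained epsilon-greedy sketch for illustration only; the helper name and signature are hypothetical and not the BanditAlgorithm API from this repo.

import random

def epsilon_greedy(q_values, epsilon):
    """Pick a random action with probability epsilon, otherwise the best one.

    q_values: dict mapping action -> estimated value (e.g. vw predictions)
    epsilon:  exploration rate; set it to 0 for pure greedy evaluation
    """
    if random.random() < epsilon:
        return random.choice(list(q_values.keys()))
    return max(q_values, key=q_values.get)

# Greedy evaluation of one state's action values
q = {'up': -1.2, 'down': 0.4, 'left': -0.8, 'right': 2.7}
print(epsilon_greedy(q, epsilon=0.0))   # -> right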