In [1]:
%load_ext autoreload
%autoreload 2
In [2]:
import gym
import numpy as np
In [3]:
env = gym.make('FrozenLake-v0')
# env = gym.make('')
The action-value function $Q(s,a)$ measures how good it is to take action $a$ in state $s$. When the agent takes action $a$ in state $s$, receives reward $r$, and lands in state $s'$, the table entry is moved toward the TD target: $Q(s,a) \leftarrow Q(s,a) + \alpha\big(r + \gamma \max_{a'} Q(s',a') - Q(s,a)\big)$. This is exactly the update used in the training cell below, with `lr` as $\alpha$ and `y` as $\gamma$.
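As a minimal illustration of that update on its own (a sketch; `alpha` and `gamma` here correspond to `lr` and `y` used below):
In [ ]:
def q_update(Q, s, a, r, s_next, alpha=0.85, gamma=0.99):
    # One tabular Q-learning step: move Q[s, a] toward the TD target r + gamma * max_a' Q(s', a')
    td_target = r + gamma * np.max(Q[s_next, :])
    Q[s, a] += alpha * (td_target - Q[s, a])
    return Q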
In [4]:
env = gym.make('Acrobot-v1')
In [5]:
env.observation_space.sample()
Out[5]:
In [6]:
# Tabular Q-learning needs discrete state/action spaces, so use FrozenLake here
env = gym.make('FrozenLake-v0')
# Initialize table with all zeros
Q = np.zeros([env.observation_space.n, env.action_space.n])
# Set learning parameters
lr = .85            # learning rate
y = .99             # discount factor
num_episodes = 2000
# create lists to contain total rewards and steps per episode
# jList = []
rList = []
for i in range(num_episodes):
    # Reset environment and get first new observation
    s = env.reset()
    rAll = 0
    d = False
    j = 0
    # The Q-Table learning algorithm
    while j < 99:
        j += 1
        # Choose an action by greedily (with noise) picking from Q table
        a = np.argmax(Q[s, :] + np.random.randn(1, env.action_space.n) * (1. / (i + 1)))
        # Get new state and reward from environment
        s1, r, d, _ = env.step(a)
        # Update Q-Table with new knowledge
        Q[s, a] = Q[s, a] + lr * (r + y * np.max(Q[s1, :]) - Q[s, a])
        rAll += r
        s = s1
        if d:
            break
    # jList.append(j)
    rList.append(rAll)
In [ ]:
print("Final Q-Table Values")
print(Q)
In [ ]:
print("Score over time: " + str(sum(rList)/num_episodes))
In [22]:
cd ..
In [20]:
import gym
import gym.envs
import numpy as np
In [23]:
import gym_bs
In [ ]:
gym.spaces.Tuple?
In [18]:
gym.envs.register(id='bs-v3',
entry_point='gym_bs.envs:EuropeanOptionEnv',
kwargs={'t': 1000})
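For context, an environment registered this way only has to implement the standard gym.Env interface; the sketch below shows the general shape, not the actual EuropeanOptionEnv (its spaces, payoff and hedging logic are not shown in this notebook, and every name and value in the sketch is a placeholder):
In [ ]:
class OptionEnvSketch(gym.Env):
    # Placeholder skeleton only -- NOT the real gym_bs.envs:EuropeanOptionEnv
    def __init__(self, t=1000):
        self.t = t
        self.action_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(1,))       # e.g. hedge ratio
        self.observation_space = gym.spaces.Box(low=0.0, high=10.0, shape=(2,))  # e.g. price, time left

    def reset(self):
        self.step_count = 0
        return self.observation_space.sample()

    def step(self, action):
        self.step_count += 1
        obs = self.observation_space.sample()
        reward = 0.0                        # the real env would compute a hedging P&L here
        done = self.step_count >= self.t
        return obs, reward, done, {}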
In [ ]:
gym.wrappers.Monitor?
In [24]:
env = gym.make('bs-v3')
env = gym.wrappers.Monitor(env, "/tmp/gym-results/bs-v3", video_callable=False, write_upon_reset=True, force=True)
In [ ]:
env.observation_space.sample()
# env.action_space.contains(1)
In [ ]:
env.observation_space.sample()
In [ ]:
# Q = np.zeros([ env.observation_space.n, env.action_space.n])
# Set learning parameters
lr = .85
y = .99
num_episodes = 2000
# create lists to contain total rewards and steps per episode
# jList = []
rList = []
for i in range(num_episodes):
    # Reset environment and get first new observation
    s = env.reset()
    done = False
    while not done:
        a = env.action_space.sample()
        state, reward, done, _ = env.step(a)
        print("=======", reward)
    # if i > 2: break
In [ ]:
state, reward
In [14]:
import seaborn as sns
In [39]:
import json, pandas as pd
import pickle
#
In [ ]:
with open('/tmp/gym-results/bs-v3/openaigym.episode_batch.2.8920.stats.json') as f: x = json.load(f)
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(env.underlying)
In [16]:
def do_episode(policy, env, num_steps, render=False):
    total_rew = 0
    ob = env.reset()
    for t in range(num_steps):
        a = policy.act(ob)
        (ob, reward, done, _info) = env.step(a)
        total_rew += reward
        if render and t % 3 == 0:
            env.render()
        if done:
            # print("the game has been done.")
            break
    return total_rew
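do_episode only needs an object with an act(ob) method, so it can be exercised with a throwaway random policy (a sketch, assuming env is the wrapped bs-v3 environment created above; RandomPolicy is just a stand-in for this check):
In [ ]:
class RandomPolicy(object):
    # Stand-in policy: ignores the observation and samples a random action
    def __init__(self, action_space):
        self.action_space = action_space
    def act(self, ob):
        return self.action_space.sample()

do_episode(RandomPolicy(env.action_space), env, num_steps=100)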
In [ ]:
env.stock * s1[0] + env.cash
In [ ]:
env.option.calc(0, 0.02)
In [17]:
from blackscholes import blackScholes
blackScholes(0, 1, 1, 0.1)
Out[17]:
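blackScholes comes from a local module, so its argument order isn't documented in this notebook; for reference, the standard Black-Scholes European call price can be written out directly (a sketch with explicitly named parameters, not necessarily matching the local function's signature):
In [ ]:
from scipy.stats import norm

def bs_call_price(S, K, T, sigma, r=0.0):
    # Standard Black-Scholes European call price
    if T <= 0:
        return max(S - K, 0.0)
    d1 = (np.log(S / K) + (r + 0.5 * sigma ** 2) * T) / (sigma * np.sqrt(T))
    d2 = d1 - sigma * np.sqrt(T)
    return S * norm.cdf(d1) - K * np.exp(-r * T) * norm.cdf(d2)

bs_call_price(S=1.0, K=1.0, T=1.0, sigma=0.1)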
In [26]:
class DeterministicContinuousActionLinearPolicy(object):
    """
    Taken from https://gym.openai.com/evaluations/eval_sXJlX4GVQouaTYTkWemOA
    """
    def __init__(self, theta, ob_space, ac_space):
        """
        dim_ob: dimension of observations
        dim_ac: dimension of action vector
        theta: flat vector of parameters
        """
        self.ac_space = ac_space
        dim_ob = ob_space.shape[0]
        dim_ac = ac_space.shape[0]
        print(theta, len(theta), dim_ob, dim_ac)
        assert len(theta) == (dim_ob + 1) * dim_ac
        self.W = theta[0 : dim_ob * dim_ac].reshape(dim_ob, dim_ac)
        self.b = theta[dim_ob * dim_ac :]

    def act(self, ob):
        # Linear policy clipped to the action bounds
        a = np.clip(ob.dot(self.W) + self.b, self.ac_space.low, self.ac_space.high)
        return a
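As a sanity check, the policy can be built from a random parameter vector and applied to a sampled observation (assuming env is the bs-v3 environment above and that both of its spaces are Box spaces, which the code below requires):
In [ ]:
dim_ob = env.observation_space.shape[0]
dim_ac = env.action_space.shape[0]
theta = np.random.randn((dim_ob + 1) * dim_ac)   # flat parameter vector: weights + bias
policy = DeterministicContinuousActionLinearPolicy(theta, env.observation_space, env.action_space)
policy.act(env.observation_space.sample())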
In [27]:
import json, sys, os, pickle
from os import path

def cem(f, th_mean, batch_size, n_iter, elite_frac, initial_std=1.0):
    """
    Generic implementation of the cross-entropy method for maximizing a black-box function
    f: a function mapping from vector -> scalar
    th_mean: initial mean over input distribution
    batch_size: number of samples of theta to evaluate per batch
    n_iter: number of batches
    elite_frac: each batch, select this fraction of the top-performing samples
    initial_std: initial standard deviation over parameter vectors
    """
    n_elite = int(np.round(batch_size * elite_frac))
    th_std = np.ones_like(th_mean) * initial_std
    for _ in range(n_iter):
        ths = np.array([th_mean + dth for dth in th_std[None, :] * np.random.randn(batch_size, th_mean.size)])
        ys = np.array([f(th) for th in ths])
        elite_inds = ys.argsort()[::-1][:n_elite]
        elite_ths = ths[elite_inds]
        th_mean = elite_ths.mean(axis=0)
        th_std = elite_ths.std(axis=0)
        yield {'ys': ys, 'theta_mean': th_mean, 'y_mean': ys.mean()}

def do_rollout(agent, env, num_steps, render=False):
    total_rew, t = 0, 0
    ob = env.reset()
    done = False
    while not done:
        # for t in range(num_steps):
        a = agent.act(ob)
        (ob, reward, done, _info) = env.step(a)
        total_rew += reward
        t += 1
        # if render and t % 3 == 0: env.render()
        # if done: break
    return total_rew, t + 1

# env = gym.make('bs-v3')
# env.seed(0)
# np.random.seed(0)
params = dict(n_iter=10, batch_size=25, elite_frac=0.2)
num_steps = 200

# You provide the directory to write to (can be an existing
# directory, but can't contain previous monitor results. You can
# also dump to a tempdir if you'd like: tempfile.mkdtemp().
# outdir = '/tmp/cem-agent-results'
# env = gym.wrappers.Monitor(env, outdir, force=True)

# Prepare snapshotting
# ----------------------------------------
def writefile(fname, s):
    with open(path.join('/tmp/cem-agent-results/', fname), 'w') as fh:
        fh.write(s)

info = {}
info['params'] = params
info['env_id'] = env.spec.id
# ------------------------------------------

def noisy_evaluation(theta):
    print(theta)
    agent = DeterministicContinuousActionLinearPolicy(theta, env.observation_space, env.action_space)
    rew, T = do_rollout(agent, env, num_steps)
    return rew

# Train the agent, and snapshot each stage
for (i, iterdata) in enumerate(cem(noisy_evaluation, np.zeros(env.observation_space.shape[0] + 1), **params)):
    print('Iteration %2i. Episode mean reward: %7.3f' % (i, iterdata['y_mean']))
    print(iterdata['theta_mean'])
    agent = DeterministicContinuousActionLinearPolicy(iterdata['theta_mean'][0], env.observation_space, env.action_space)
    do_rollout(agent, env, 200, render=False)
    # Pickles are binary, so write the snapshot in binary mode rather than via writefile()
    with open(path.join('/tmp/cem-agent-results/', 'agent-%.4i.pkl' % i), 'wb') as fh:
        pickle.dump(agent, fh, -1)

# Write out the env at the end so we store the parameters of this
# environment.
writefile('info.json', json.dumps(info))
env.close()
# logger.info("Successfully ran cross-entropy method. Now trying to upload results to the scoreboard. If it breaks, you can always just try re-uploading the same results.")
# gym.upload(outdir)
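cem itself can be sanity-checked in isolation on a simple analytic objective whose maximizer is known (a toy example, independent of the environment; target and toy_f exist only for this check):
In [ ]:
# Maximize -||theta - target||^2; the sampled mean should drift toward `target`.
target = np.array([1.0, -2.0, 0.5])
toy_f = lambda th: -np.sum((th - target) ** 2)
for out in cem(toy_f, np.zeros(3), batch_size=50, n_iter=20, elite_frac=0.2):
    pass
print(out['theta_mean'])  # should end up close to target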
In [43]:
iterdata['ys'].flatten()[iterdata['ys'].flatten().argsort()]
Out[43]:
In [39]:
iterdata['ys'].flatten().argsort()
Out[39]:
In [ ]:
iterdata['ys'].argsort
In [48]:
len(iterdata['theta_mean'][0]) == (env.observation_space.shape[0] + 1) * env.action_space.shape[0]
Out[48]:
In [35]:
env.observation_space.shape[0] + env.action_space.shape[0]
Out[35]:
In [32]:
import pickle
In [38]:
with open('/tmp/cem-agent-results/agent-0099.pkl', 'rb') as f: x = pickle.loads(f.read())
In [14]:
np.mean(np.random.randn(3+1, 2), axis=0)
Out[14]: