%load_ext autoreload
%autoreload 2

import gym
import numpy as np

env = gym.make('FrozenLake-v0')
# env = gym.make('')

  • Initialize $V(s)$ arbitrarily
  • Repeat for each episode
  • Initialize s
  • Repeat (for each step of episode)
    • $a \leftarrow$ action given by $\pi$ for $s$
    • Take action a, observe reward r, and next state s'
    • $V(s) \leftarrow V(s) + \alpha [r + \gamma V(s') - V(s)]$
    • $s \leftarrow s'$
  • until $s$ is terminal

Value Function

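The action-value function $Q(s,a)$ measures how good it is to take action $a$ when in state $s$: it is the expected discounted return obtained by taking $a$ in $s$ and following the policy afterwards.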


env = gym.make('Acrobot-v1')

-0.91876417]

# Tabular Q-learning. The environment bound to `env` must have discrete
# observation and action spaces, because both are sized through `.n` below.
# Initialize table with all zeros: one row per state, one column per action
Q = np.zeros([env.observation_space.n, env.action_space.n])
# Set learning parameters
lr = .85  # step size of each Q update
y = .99   # discount applied to the best Q value of the next state
num_episodes = 2000
# Total reward collected in each episode
#jList = []
rList = []
for i in range(num_episodes):
    # Reset environment and get first new observation
    s = env.reset()
    rAll = 0
    d = False
    j = 0
    # The Q-Table learning algorithm
    while j < 99:
        # Count the step; without this the 99-step cap would never trigger
        j += 1
        # Choose an action by greedily (with noise) picking from Q table.
        # The noise is scaled by 1/(i+1), so it shrinks as episodes go by.
        a = np.argmax(Q[s, :] + np.random.randn(1, env.action_space.n) * (1. / (i + 1)))
        # Get new state and reward from environment
        s1, r, d, _ = env.step(a)
        # Update Q-Table with new knowledge
        Q[s, a] = Q[s, a] + lr * (r + y * np.max(Q[s1, :]) - Q[s, a])
        rAll += r
        s = s1
        if d:
            # The environment reported the end of the episode
            break
    # Record the episode return so the final score reflects what was earned
    rList.append(rAll)

print("Final Q-Table Values")
print(Q)

print("Score over time: " + str(sum(rList) / num_episodes))

cd ..


import gym
import gym.envs
import numpy as np

import gym_bs

In [ ]:

                  kwargs={'t': 1000})

In [ ]:

env = gym.make('bs-v3')
env = gym.wrappers.Monitor(env, "/tmp/gym-results/bs-v3", video_callable=False, write_upon_reset=True, force=True)

# env.action_space.contains(1)

In [ ]:

# Baseline: play `num_episodes` episodes with uniformly sampled actions and
# print the reward returned by the last step of each one.
# (No Q table is built here: Q = np.zeros([env.observation_space.n, env.action_space.n]))
# Learning parameters (assigned but not read by the loop below)
lr, y = .85, .99
num_episodes = 2000
# Placeholder for per-episode rewards; nothing is appended to it here
rList = []
for i in range(num_episodes):
    # Start a fresh episode
    s = env.reset()
    done = False
    while True:
        # Sample an action from the action space and apply it
        a = env.action_space.sample()
        state, reward, done, _ = env.step(a)
        if done:
            break
    print("=======", reward)
    # Uncomment to stop after a few episodes while debugging:
    # if i > 2: break

state, reward

import seaborn as sns

import json, pandas as pd
import pickle

with open('/tmp/gym-results/bs-v3/openaigym.episode_batch.2.8920.stats.json') as f: x = json.load(f)

%matplotlib inline

def do_episode(policy, env, num_steps, render=False):
    """Run a single episode and return the summed reward.

    policy: object with an ``act(ob)`` method returning the action to take
    env: environment exposing ``reset()``, ``step(action)`` and ``render()``
    num_steps: maximum number of steps to take in the episode
    render: when true, ``env.render()`` is called on every third step

    The episode ends as soon as ``env.step`` reports ``done`` or after
    ``num_steps`` steps, whichever comes first.
    """
    total_rew = 0
    ob = env.reset()
    for t in range(num_steps):
        a = policy.act(ob)
        ob, reward, done, _info = env.step(a)
        total_rew += reward
        if render and t % 3 == 0:
            env.render()
        if done:
            # The episode is over; do not keep stepping a finished environment.
            break
    return total_rew

env.stock * s1[0]

env.option.calc(0, 0.02)

from blackscholes import blackScholes
blackScholes(0, 1, 1, 0.1)

{'delta': 0.5, 'gamma': 0, 'npv': 0.0, 'theta': 0, 'vega': 0.0}

class DeterministicContinuousActionLinearPolicy(object):
    """Deterministic linear policy for continuous action vectors.

    The action for an observation ``ob`` is ``ob . W + b`` clipped to the
    bounds of the action space, where ``W`` and ``b`` are unpacked from one
    flat parameter vector.

    NOTE(review): the notebook's attribution line ("Taken from ...") was
    truncated during export; restore the source link if it is known.
    """

    def __init__(self, theta, ob_space, ac_space):
        """
        theta: flat vector of parameters, of length (dim_ob + 1) * dim_ac;
            the first dim_ob * dim_ac entries are the weights, the rest the bias
        ob_space: observation space; only ``shape[0]`` (dim_ob) is read
        ac_space: action space; ``shape[0]`` (dim_ac), ``low`` and ``high`` are read

        dim_ob: dimension of observations
        dim_ac: dimension of action vector
        """
        self.ac_space = ac_space
        dim_ob = ob_space.shape[0]
        dim_ac = ac_space.shape[0]
        # Debug output kept from the notebook
        print(theta, len(theta), dim_ob, dim_ac)
        assert len(theta) == (dim_ob + 1) * dim_ac
        # Accept any flat sequence, not only an ndarray
        theta = np.asarray(theta)
        self.W = theta[0 : dim_ob * dim_ac].reshape(dim_ob, dim_ac)
        self.b = theta[dim_ob * dim_ac : None]

    def act(self, ob):
        """Return the clipped linear action for observation ``ob``."""
        # The weight term must be included; without it the action would not
        # depend on the observation at all.
        a = np.clip(np.dot(ob, self.W) + self.b, self.ac_space.low, self.ac_space.high)
        return a

import json, sys, os
from os import path

def cem(f, th_mean, batch_size, n_iter, elite_frac, initial_std=1.0):
    """
    Generic implementation of the cross-entropy method for maximizing a black-box function

    f: a function mapping from vector -> scalar
    th_mean: initial mean over input distribution (1-D vector)
    batch_size: number of samples of theta to evaluate per batch
    n_iter: number of batches
    elite_frac: each batch, select this fraction of the top-performing samples
    initial_std: initial standard deviation over parameter vectors

    This is a generator. After each batch it yields a dict with
    'ys' (the scores of every sample in the batch), 'theta_mean' (the mean of
    the elite samples, used as the next sampling mean) and 'y_mean' (the mean
    score of the whole batch).
    """
    th_mean = np.asarray(th_mean, dtype=float)
    # Keep at least one elite: rounding batch_size * elite_frac down to zero
    # would make the refit average an empty selection and produce NaNs.
    n_elite = max(1, int(np.round(batch_size * elite_frac)))
    th_std = np.ones_like(th_mean) * initial_std

    for _ in range(n_iter):
        # One row per sample; mean and std broadcast across the batch
        ths = th_mean + th_std * np.random.randn(batch_size, th_mean.size)
        ys = np.array([f(th) for th in ths])
        # Indices of the n_elite highest-scoring samples, best first
        elite_inds = ys.argsort()[::-1][:n_elite]
        elite_ths = ths[elite_inds]
        # Refit the sampling distribution to the elite set
        th_mean = elite_ths.mean(axis=0)
        th_std = elite_ths.std(axis=0)
        yield {'ys': ys, 'theta_mean': th_mean, 'y_mean': ys.mean()}

def do_rollout(agent, env, num_steps, render=False):
    """Run one episode to completion and return ``(total_reward, steps)``.

    agent: object with an ``act(ob)`` method returning the action to take
    env: environment exposing ``reset()`` and ``step(action)``
    num_steps: accepted for compatibility with callers but not used; the
        rollout runs until ``env.step`` reports ``done``
    render: accepted for compatibility with callers but not used

    ``steps`` is the number of ``env.step`` calls made during the episode.
    """
    total_rew, t = 0, 0
    ob = env.reset()
    done = False
    while not done:
        a = agent.act(ob)
        ob, reward, done, _info = env.step(a)
        total_rew += reward
        t += 1
    # t is incremented once per step, so it already equals the step count;
    # adding one more would over-report the episode length.
    return total_rew, t

# env = gym.make('bs-v3')
# env.seed(0)
# np.random.seed(0)
params = dict(n_iter=10, batch_size=25, elite_frac = 0.2)
num_steps = 200

# You provide the directory to write to (can be an existing
# directory, but can't contain previous monitor results. You can
# also dump to a tempdir if you'd like: tempfile.mkdtemp().
# outdir = '/tmp/cem-agent-results'
# env = gym.wrappers.Monitor(env, outdir, force=True)

# Prepare snapshotting
# ----------------------------------------
def writefile(fname, s):
    """Write the string ``s`` to the file ``fname`` under /tmp/cem-agent-results/.

    The directory is created first if it does not exist, so the call also
    works when nothing else has set up the output directory.
    """
    out_dir = '/tmp/cem-agent-results/'
    os.makedirs(out_dir, exist_ok=True)
    with open(path.join(out_dir, fname), 'w') as fh:
        fh.write(s)
# Run metadata, serialized to info.json once training has finished
info = {}
info['params'] = params
# NOTE(review): the right-hand side of this assignment was lost in the
# notebook export; the registered id of the environment is assumed here.
# Confirm that this is the value that was originally stored.
info['env_id'] = env.spec.id
# ------------------------------------------

def noisy_evaluation(theta):
    """Return the total reward of one rollout of the linear policy built from ``theta``.

    The policy is built for the module-level ``env`` and rolled out with the
    module-level ``num_steps``; the step count reported by the rollout is
    discarded.
    """
    policy = DeterministicContinuousActionLinearPolicy(
        theta, env.observation_space, env.action_space
    )
    total_reward, _steps = do_rollout(policy, env, num_steps)
    return total_reward

# Train the agent, and snapshot each stage
snapshot_dir = '/tmp/cem-agent-results/'
os.makedirs(snapshot_dir, exist_ok=True)
# The policy needs one weight per (observation, action) pair plus one bias
# per action dimension, so size the initial mean accordingly.
n_params = (env.observation_space.shape[0] + 1) * env.action_space.shape[0]
for (i, iterdata) in enumerate(cem(noisy_evaluation, np.zeros(n_params), **params)):
    print('Iteration %2i. Episode mean reward: %7.3f'%(i, iterdata['y_mean']))
    # theta_mean is already the flat parameter vector; indexing into it
    # would hand the policy a single scalar.
    agent = DeterministicContinuousActionLinearPolicy(iterdata['theta_mean'], env.observation_space, env.action_space)
    do_rollout(agent, env, 200, render=False)
    # Pickle data is bytes: write it in binary mode so the snapshot can be
    # loaded again (the str() of a bytes object cannot be unpickled).
    with open(path.join(snapshot_dir, 'agent-%.4i.pkl'%i), 'wb') as fh:
        pickle.dump(agent, fh, -1)

# Write out the env at the end so we store the parameters of this
# environment.
writefile('info.json', json.dumps(info))


#"Successfully ran cross-entropy method. Now trying to upload results to the scoreboard. If it breaks, you can always just try re-uploading the same results.")
#     gym.upload(outdir)

Iteration  0. Episode mean reward: -1193543.655
[[-0.37030113  0.24325921 -1.28778306 -2.97953413]]
[-0.37030113  0.24325921 -1.28778306 -2.97953413] 4 3 1
array([ -2.11486329e+06,  -2.11043706e+06,  -1.98097114e+06,
        -1.96668867e+06,  -1.85924434e+06,  -1.83225317e+06,
        -1.78758212e+06,  -1.76561868e+06,  -1.76444708e+06,
        -1.46047481e+06,  -1.45153023e+06,  -1.40418020e+06,
        -1.38885617e+06,  -1.08677060e+06,  -9.93212842e+05,
        -8.49137387e+05,  -8.43171633e+05,  -7.90175585e+05,
        -7.70819880e+05,  -5.80217897e+05,  -5.44619892e+05,
        -5.11679655e+05,  -2.66302286e+05,   1.34923719e+02]

5,  9,  4,  3,  2, 24,  0,  1]
        5,  9,  4,  3,  2, 24,  0,  1])

len(iterdata['theta_mean'][0]) == (env.observation_space.shape[0] + 1) * env.action_space.shape[0]


env.observation_space.shape[0] + env.action_space.shape[0]


import pickle

# Load a saved agent snapshot back into memory.
# NOTE: unpickling can execute arbitrary code; only load files that this
# notebook wrote itself.
with open('/tmp/cem-agent-results/agent-0099.pkl', 'rb') as f:
    x = pickle.loads(f.read())

np.mean(np.random.randn(3+1, 2), axis=0)

array([ 0.43051544, -0.20901409])