In [1]:
%load_ext autoreload
%autoreload 2
In [2]:
import gym
import numpy as np
In [3]:
env = gym.make('FrozenLake-v0')
# env = gym.make('')
The action-value function $Q(s,a)$ measures how good it is to take action $a$ in state $s$. When the agent takes action $a$ in state $s$, receives reward $r$, and lands in state $s'$, the table entry is moved toward the TD target: $Q(s,a) \leftarrow Q(s,a) + \alpha\big(r + \gamma \max_{a'} Q(s',a') - Q(s,a)\big)$. This is exactly the update used in the training cell below, with `lr` as $\alpha$ and `y` as $\gamma$.
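As a minimal illustration of that update on its own (a sketch; `alpha` and `gamma` here correspond to `lr` and `y` used below):
In [ ]:
def q_update(Q, s, a, r, s_next, alpha=0.85, gamma=0.99):
    # One tabular Q-learning step: move Q[s, a] toward the TD target r + gamma * max_a' Q(s', a')
    td_target = r + gamma * np.max(Q[s_next, :])
    Q[s, a] += alpha * (td_target - Q[s, a])
    return Q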
In [4]:
env = gym.make('Acrobot-v1')
In [5]:
env.observation_space.sample()
Out[5]:
In [6]:
# Tabular Q-learning needs discrete state/action spaces, so use FrozenLake here
env = gym.make('FrozenLake-v0')
# Initialize table with all zeros
Q = np.zeros([env.observation_space.n, env.action_space.n])
# Set learning parameters
lr = .85            # learning rate
y = .99             # discount factor
num_episodes = 2000
# create lists to contain total rewards and steps per episode
# jList = []
rList = []
for i in range(num_episodes):
    # Reset environment and get first new observation
    s = env.reset()
    rAll = 0
    d = False
    j = 0
    # The Q-Table learning algorithm
    while j < 99:
        j += 1
        # Choose an action by greedily (with noise) picking from Q table
        a = np.argmax(Q[s, :] + np.random.randn(1, env.action_space.n) * (1. / (i + 1)))
        # Get new state and reward from environment
        s1, r, d, _ = env.step(a)
        # Update Q-Table with new knowledge
        Q[s, a] = Q[s, a] + lr * (r + y * np.max(Q[s1, :]) - Q[s, a])
        rAll += r
        s = s1
        if d:
            break
    # jList.append(j)
    rList.append(rAll)
In [ ]:
print("Final Q-Table Values")
print(Q)
In [ ]:
print("Score over time: " + str(sum(rList)/num_episodes))
In [22]:
cd ..
In [20]:
import gym
import gym.envs
import numpy as np
In [23]:
import gym_bs
In [ ]:
gym.spaces.Tuple?
In [18]:
gym.envs.register(id='bs-v3',
entry_point='gym_bs.envs:EuropeanOptionEnv',
kwargs={'t': 1000})
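For context, an environment registered this way only has to implement the standard gym.Env interface; the sketch below shows the general shape, not the actual EuropeanOptionEnv (its spaces, payoff and hedging logic are not shown in this notebook, and every name and value in the sketch is a placeholder):
In [ ]:
class OptionEnvSketch(gym.Env):
    # Placeholder skeleton only -- NOT the real gym_bs.envs:EuropeanOptionEnv
    def __init__(self, t=1000):
        self.t = t
        self.action_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(1,))       # e.g. hedge ratio
        self.observation_space = gym.spaces.Box(low=0.0, high=10.0, shape=(2,))  # e.g. price, time left

    def reset(self):
        self.step_count = 0
        return self.observation_space.sample()

    def step(self, action):
        self.step_count += 1
        obs = self.observation_space.sample()
        reward = 0.0                        # the real env would compute a hedging P&L here
        done = self.step_count >= self.t
        return obs, reward, done, {}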
In [ ]:
gym.wrappers.Monitor?
In [24]:
env = gym.make('bs-v3')
env = gym.wrappers.Monitor(env, "/tmp/gym-results/bs-v3", video_callable=False, write_upon_reset=True, force=True)
In [ ]:
env.observation_space.sample()
# env.action_space.contains(1)
In [ ]:
env.observation_space.sample()
In [ ]:
# Q = np.zeros([ env.observation_space.n, env.action_space.n])
# Set learning parameters
lr = .85
y = .99
num_episodes = 2000
# create lists to contain total rewards and steps per episode
# jList = []
rList = []
for i in range(num_episodes):
    # Reset environment and get first new observation
    s = env.reset()
    done = False
    while not done:
        a = env.action_space.sample()
        state, reward, done, _ = env.step(a)
        print("=======", reward)
    # if i > 2: break
In [ ]:
state, reward
In [14]:
import seaborn as sns
In [39]:
import json, pandas as pd
import pickle
#
In [ ]:
with open('/tmp/gym-results/bs-v3/openaigym.episode_batch.2.8920.stats.json') as f: x = json.load(f)
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(env.underlying)
In [16]:
def do_episode(policy, env, num_steps, render=False):
    total_rew = 0
    ob = env.reset()
    for t in range(num_steps):
        a = policy.act(ob)
        (ob, reward, done, _info) = env.step(a)
        total_rew += reward
        if render and t % 3 == 0:
            env.render()
        if done:
            # print("the game has been done.")
            break
    return total_rew
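do_episode only needs an object with an act(ob) method, so it can be exercised with a throwaway random policy (a sketch, assuming env is the wrapped bs-v3 environment created above; RandomPolicy is just a stand-in for this check):
In [ ]:
class RandomPolicy(object):
    # Stand-in policy: ignores the observation and samples a random action
    def __init__(self, action_space):
        self.action_space = action_space
    def act(self, ob):
        return self.action_space.sample()

do_episode(RandomPolicy(env.action_space), env, num_steps=100)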
In [ ]:
env.stock * s1[0] + env.cash
In [ ]:
env.option.calc(0, 0.02)
In [17]:
from blackscholes import blackScholes
blackScholes(0, 1, 1, 0.1)
Out[17]:
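blackScholes comes from a local module, so its argument order isn't documented in this notebook; for reference, the standard Black-Scholes European call price can be written out directly (a sketch with explicitly named parameters, not necessarily matching the local function's signature):
In [ ]:
from scipy.stats import norm

def bs_call_price(S, K, T, sigma, r=0.0):
    # Standard Black-Scholes European call price
    if T <= 0:
        return max(S - K, 0.0)
    d1 = (np.log(S / K) + (r + 0.5 * sigma ** 2) * T) / (sigma * np.sqrt(T))
    d2 = d1 - sigma * np.sqrt(T)
    return S * norm.cdf(d1) - K * np.exp(-r * T) * norm.cdf(d2)

bs_call_price(S=1.0, K=1.0, T=1.0, sigma=0.1)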
In [26]:
class DeterministicContinuousActionLinearPolicy(object):
    """
    Taken from https://gym.openai.com/evaluations/eval_sXJlX4GVQouaTYTkWemOA
    """
    def __init__(self, theta, ob_space, ac_space):
        """
        dim_ob: dimension of observations
        dim_ac: dimension of action vector
        theta: flat vector of parameters
        """
        self.ac_space = ac_space
        dim_ob = ob_space.shape[0]
        dim_ac = ac_space.shape[0]
        print(theta, len(theta), dim_ob, dim_ac)
        assert len(theta) == (dim_ob + 1) * dim_ac
        self.W = theta[0 : dim_ob * dim_ac].reshape(dim_ob, dim_ac)
        self.b = theta[dim_ob * dim_ac :]

    def act(self, ob):
        # Linear policy clipped to the action bounds
        a = np.clip(ob.dot(self.W) + self.b, self.ac_space.low, self.ac_space.high)
        return a
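As a sanity check, the policy can be built from a random parameter vector and applied to a sampled observation (assuming env is the bs-v3 environment above and that both of its spaces are Box spaces, which the code below requires):
In [ ]:
dim_ob = env.observation_space.shape[0]
dim_ac = env.action_space.shape[0]
theta = np.random.randn((dim_ob + 1) * dim_ac)   # flat parameter vector: weights + bias
policy = DeterministicContinuousActionLinearPolicy(theta, env.observation_space, env.action_space)
policy.act(env.observation_space.sample())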
In [27]:
import json, sys, os, pickle
from os import path

def cem(f, th_mean, batch_size, n_iter, elite_frac, initial_std=1.0):
    """
    Generic implementation of the cross-entropy method for maximizing a black-box function
    f: a function mapping from vector -> scalar
    th_mean: initial mean over input distribution
    batch_size: number of samples of theta to evaluate per batch
    n_iter: number of batches
    elite_frac: each batch, select this fraction of the top-performing samples
    initial_std: initial standard deviation over parameter vectors
    """
    n_elite = int(np.round(batch_size * elite_frac))
    th_std = np.ones_like(th_mean) * initial_std
    for _ in range(n_iter):
        ths = np.array([th_mean + dth for dth in th_std[None, :] * np.random.randn(batch_size, th_mean.size)])
        ys = np.array([f(th) for th in ths])
        elite_inds = ys.argsort()[::-1][:n_elite]
        elite_ths = ths[elite_inds]
        th_mean = elite_ths.mean(axis=0)
        th_std = elite_ths.std(axis=0)
        yield {'ys': ys, 'theta_mean': th_mean, 'y_mean': ys.mean()}

def do_rollout(agent, env, num_steps, render=False):
    total_rew, t = 0, 0
    ob = env.reset()
    done = False
    while not done:
        # for t in range(num_steps):
        a = agent.act(ob)
        (ob, reward, done, _info) = env.step(a)
        total_rew += reward
        t += 1
        # if render and t % 3 == 0: env.render()
        # if done: break
    return total_rew, t + 1

# env = gym.make('bs-v3')
# env.seed(0)
# np.random.seed(0)
params = dict(n_iter=10, batch_size=25, elite_frac=0.2)
num_steps = 200

# You provide the directory to write to (can be an existing
# directory, but can't contain previous monitor results. You can
# also dump to a tempdir if you'd like: tempfile.mkdtemp().
# outdir = '/tmp/cem-agent-results'
# env = gym.wrappers.Monitor(env, outdir, force=True)

# Prepare snapshotting
# ----------------------------------------
def writefile(fname, s):
    with open(path.join('/tmp/cem-agent-results/', fname), 'w') as fh:
        fh.write(s)

info = {}
info['params'] = params
info['env_id'] = env.spec.id
# ------------------------------------------

def noisy_evaluation(theta):
    print(theta)
    agent = DeterministicContinuousActionLinearPolicy(theta, env.observation_space, env.action_space)
    rew, T = do_rollout(agent, env, num_steps)
    return rew

# Train the agent, and snapshot each stage
for (i, iterdata) in enumerate(cem(noisy_evaluation, np.zeros(env.observation_space.shape[0] + 1), **params)):
    print('Iteration %2i. Episode mean reward: %7.3f' % (i, iterdata['y_mean']))
    print(iterdata['theta_mean'])
    agent = DeterministicContinuousActionLinearPolicy(iterdata['theta_mean'][0], env.observation_space, env.action_space)
    do_rollout(agent, env, 200, render=False)
    # Pickles are binary, so write the snapshot in binary mode rather than via writefile()
    with open(path.join('/tmp/cem-agent-results/', 'agent-%.4i.pkl' % i), 'wb') as fh:
        pickle.dump(agent, fh, -1)

# Write out the env at the end so we store the parameters of this
# environment.
writefile('info.json', json.dumps(info))
env.close()
# logger.info("Successfully ran cross-entropy method. Now trying to upload results to the scoreboard. If it breaks, you can always just try re-uploading the same results.")
# gym.upload(outdir)
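cem itself can be sanity-checked in isolation on a simple analytic objective whose maximizer is known (a toy example, independent of the environment; target and toy_f exist only for this check):
In [ ]:
# Maximize -||theta - target||^2; the sampled mean should drift toward `target`.
target = np.array([1.0, -2.0, 0.5])
toy_f = lambda th: -np.sum((th - target) ** 2)
for out in cem(toy_f, np.zeros(3), batch_size=50, n_iter=20, elite_frac=0.2):
    pass
print(out['theta_mean'])  # should end up close to target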
In [43]:
iterdata['ys'].flatten()[iterdata['ys'].flatten().argsort()]
Out[43]:
In [39]:
iterdata['ys'].flatten().argsort()
Out[39]:
In [ ]:
iterdata['ys'].argsort
In [48]:
len(iterdata['theta_mean'][0]) == (env.observation_space.shape[0] + 1) * env.action_space.shape[0]
Out[48]:
In [35]:
env.observation_space.shape[0] + env.action_space.shape[0]
Out[35]:
In [32]:
import pickle
In [38]:
with open('/tmp/cem-agent-results/agent-0099.pkl', 'rb') as f: x = pickle.loads(f.read())
In [14]:
np.mean(np.random.randn(3+1, 2), axis=0)
Out[14]: