In [1]:
import matplotlib.pyplot as plt

In [2]:
cd ..


/Users/miguel/Jottacloud/HedgingRL

Create environment


In [3]:
import json
from os import path
import pandas as pd

import gym.envs
import numpy as np

num_steps = 100
gym.envs.register(id='obs-v2',
                  entry_point='gym_bs.envs:EuropeanOptionEnv',
                  kwargs={'t': num_steps,
                          'n': 1,
                          's0': 49,
                          'k': 50,
                          'max_stock': 1,
                          'sigma': .1})

params = dict(n_iter=10000, batch_size=50, elite_frac=0.3)

env = gym.make('obs-v2')
env = gym.wrappers.Monitor(env, "/tmp/gym-results/obs-v2", video_callable=False, write_upon_reset=True, force=True)


[2017-08-02 22:10:50,136] Making new env: obs-v2
[2017-08-02 22:10:50,371] Clearing 2 monitor files from previous run (because force=True was provided)

In [4]:
observation = env.reset()
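
Quick sanity check on the registered environment (an illustrative sketch relying only on the standard Gym observation_space/action_space attributes; the exact space definitions of EuropeanOptionEnv are not shown in this notebook):

In [ ]:
# Illustrative only: inspect the spaces and the freshly reset state.
print(env.observation_space)   # later cells treat the observation as (underlying, tau, stocks)
print(env.action_space)        # actions are passed as np.array([x]) below
print(observation)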

Random action


In [5]:
%%time
df = pd.DataFrame.from_dict({'reward': [], 'observation': []})
for _ in range(10):
    observation = env.reset()
    done = False
    while not done:
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
    df = df.append( pd.DataFrame.from_dict({'reward': reward, 'observation': [observation]}))


CPU times: user 61.1 ms, sys: 6.48 ms, total: 67.5 ms
Wall time: 66.7 ms

In [6]:
%matplotlib inline
df.reward.clip_lower(-15).hist(bins=100)


Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a1f3cf8>

No action


In [7]:
%%time
df = pd.DataFrame.from_dict({'reward': [], 'underlying': [], 'tau': [], 'stocks': []})
action = np.array([0.])
for _ in range(1000):
    observation = env.reset()
    done = False
    while not done:
#         action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
    df = df.append( pd.DataFrame.from_dict({'reward': reward,
                                            'underlying': observation[0],
                                            'tau': observation[1],
                                            'stocks': observation[2]}))


CPU times: user 7.06 s, sys: 517 ms, total: 7.57 s
Wall time: 7.6 s

In [12]:
%matplotlib inline
df.reward.clip_lower(-1500).hist(bins=100)


Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x11dca45f8>

In [11]:
%matplotlib inline
# fig = plt.Figure()
df.underlying.hist(bins=20, figsize=(10, 6))


Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x11d9f2908>
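
For comparison with the unhedged runs above, a classical baseline is to rebalance to the Black-Scholes delta at every step. The sketch below is illustrative only: it assumes a zero risk-free rate, that tau is a year fraction, that the action is interpreted as the target stock position, and a sign convention of holding +delta shares (flip the sign if the environment treats the option position as long).

In [ ]:
from scipy.stats import norm

def bs_call_delta(s, k, sigma, tau, r=0.0):
    """Black-Scholes delta of a European call; r=0 is an assumption."""
    if tau <= 0:
        return float(s > k)
    d1 = (np.log(s / k) + (r + 0.5 * sigma ** 2) * tau) / (sigma * np.sqrt(tau))
    return norm.cdf(d1)

delta_rewards = []
for _ in range(100):
    observation = env.reset()
    done = False
    while not done:
        s, tau = float(observation[0]), float(observation[1])
        action = np.array([bs_call_delta(s, k=50, sigma=.1, tau=tau)])
        observation, reward, done, info = env.step(action)
    delta_rewards.append(float(reward))

pd.Series(delta_rewards).hist(bins=100)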


In [10]:
observation = env.reset()
done = False
df = pd.DataFrame.from_dict({'reward': [], 'observation': []})
while not done:
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    df = df.append(pd.DataFrame.from_dict({'reward': [reward], 'observation': [observation]}))

Different reward functions

Black-Scholes - three different reward functions

  • $r_T = U(V_T)$ where $U$ is a utility function

  • $r_T = - (V_T - E[V_0])^2$

  • $V_T$ with hedging costs and the utility function as in the first case (see the sketch after this list)
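
As code, the three terminal rewards could look roughly as follows. This is only a sketch: the notes do not fix the utility $U$, so an exponential utility with risk aversion lam is used as an example, and ev_0 (for $E[V_0]$) and the hedging-cost term are placeholders.

In [ ]:
# Illustrative sketches; the choice of U, lam and the hedging-cost term are assumptions.

def utility_reward(v_T, lam=1.0):
    """r_T = U(V_T), here with the exponential utility U(v) = -exp(-lam * v)."""
    return -np.exp(-lam * v_T)

def quadratic_reward(v_T, ev_0):
    """r_T = -(V_T - E[V_0])**2."""
    return -(v_T - ev_0) ** 2

def utility_reward_with_costs(v_T, hedging_costs, lam=1.0):
    """As the first reward, applied to the terminal value net of hedging costs."""
    return utility_reward(v_T - hedging_costs, lam)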

Portfolio:

  • 1 option
  • 0 cash
  • 0 stocks

Variations:

  • can only hedge once / X times (see the wrapper sketch below)
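
A minimal sketch of such a variation, assuming the action is the target stock position (so repeating the previous action leaves the portfolio unchanged):

In [ ]:
class LimitedHedgingWrapper(gym.Wrapper):
    """Allow at most max_trades rebalances per episode (illustrative sketch)."""

    def __init__(self, env, max_trades=1):
        super().__init__(env)
        self.max_trades = max_trades

    def reset(self, **kwargs):
        self.trades = 0
        self.last_action = np.zeros(self.action_space.shape)
        return self.env.reset(**kwargs)

    def step(self, action):
        # Pass a new target position through only while trades remain.
        if self.trades < self.max_trades and not np.allclose(action, self.last_action):
            self.last_action = np.asarray(action, dtype=float)
            self.trades += 1
        return self.env.step(self.last_action)

# e.g. limited_env = LimitedHedgingWrapper(gym.make('obs-v2'), max_trades=1)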

Interesting

How to learn the value function?
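
One simple starting point (an illustrative sketch, not something implemented in this notebook) is to estimate the value function of a fixed policy by regressing Monte Carlo returns-to-go on the visited states; the fitted model could then serve as a baseline or feed a policy-improvement step.

In [ ]:
from sklearn.linear_model import Ridge

# Monte Carlo value estimation under a fixed policy (here: never hedge).
states, returns = [], []
for _ in range(200):
    obs = env.reset()
    done = False
    episode_states, episode_rewards = [], []
    while not done:
        episode_states.append(np.asarray(obs, dtype=float).ravel())
        obs, reward, done, info = env.step(np.array([0.]))
        episode_rewards.append(float(reward))
    rtg = np.cumsum(episode_rewards[::-1])[::-1]  # undiscounted return-to-go
    states.extend(episode_states)
    returns.extend(rtg)

value_fn = Ridge(alpha=1.0).fit(np.stack(states), np.array(returns))

The n_iter / batch_size / elite_frac parameters defined at the top look like cross-entropy-method settings, which would sidestep an explicit value function entirely; the regression above is just one alternative angle on the question.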