In [1]:
import matplotlib.pyplot as plt

In [2]:
cd ..


/Users/miguel/Jottacloud/HedgingRL

Create environment


In [3]:
import json
from os import path
import pandas as pd

import gym.envs
import numpy as np

num_steps = 100
gym.envs.register(id='obs-v2',
                  entry_point='gym_bs.envs:EuropeanOptionEnv',
                  kwargs={'t': num_steps,
                          'n': 1,
                          's0': 49,
                          'k': 50,
                          'max_stock': 1,
                          'sigma': .1})

params = dict(n_iter=10000, batch_size=50, elite_frac=0.3)

env = gym.make('obs-v2')
env = gym.wrappers.Monitor(env, "/tmp/gym-results/obs-v2", video_callable=False, write_upon_reset=True, force=True)


[2017-08-02 22:10:50,136] Making new env: obs-v2
[2017-08-02 22:10:50,371] Clearing 2 monitor files from previous run (because force=True was provided)

In [4]:
observation = env.reset()
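
Quick sanity check on the registered environment (an illustrative sketch relying only on the standard Gym observation_space/action_space attributes; the exact space definitions of EuropeanOptionEnv are not shown in this notebook):

In [ ]:
# Illustrative only: inspect the spaces and the freshly reset state.
print(env.observation_space)   # later cells treat the observation as (underlying, tau, stocks)
print(env.action_space)        # actions are passed as np.array([x]) below
print(observation)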

Random action


In [5]:
%%time
df = pd.DataFrame.from_dict({'reward': [], 'observation': []})
for _ in range(10):
    observation = env.reset()
    done = False
    while not done:
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
    df = df.append( pd.DataFrame.from_dict({'reward': reward, 'observation': [observation]}))


CPU times: user 61.1 ms, sys: 6.48 ms, total: 67.5 ms
Wall time: 66.7 ms

In [6]:
%matplotlib inline
df.reward.clip_lower(-15).hist(bins=100)


Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a1f3cf8>

No action


In [7]:
%%time
df = pd.DataFrame.from_dict({'reward': [], 'underlying': [], 'tau': [], 'stocks': []})
action = np.array([0.])
for _ in range(1000):
    observation = env.reset()
    done = False
    while not done:
#         action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
    df = df.append( pd.DataFrame.from_dict({'reward': reward,
                                            'underlying': observation[0],
                                            'tau': observation[1],
                                            'stocks': observation[2]}))


CPU times: user 7.06 s, sys: 517 ms, total: 7.57 s
Wall time: 7.6 s

In [12]:
%matplotlib inline
df.reward.clip_lower(-1500).hist(bins=100)


Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x11dca45f8>

In [11]:
%matplotlib inline
# fig = plt.Figure()
df.underlying.hist(bins=20, figsize=(10, 6))


Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x11d9f2908>
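
For comparison with the unhedged runs above, a classical baseline is to rebalance to the Black-Scholes delta at every step. The sketch below is illustrative only: it assumes a zero risk-free rate, that tau is a year fraction, that the action is interpreted as the target stock position, and a sign convention of holding +delta shares (flip the sign if the environment treats the option position as long).

In [ ]:
from scipy.stats import norm

def bs_call_delta(s, k, sigma, tau, r=0.0):
    """Black-Scholes delta of a European call; r=0 is an assumption."""
    if tau <= 0:
        return float(s > k)
    d1 = (np.log(s / k) + (r + 0.5 * sigma ** 2) * tau) / (sigma * np.sqrt(tau))
    return norm.cdf(d1)

delta_rewards = []
for _ in range(100):
    observation = env.reset()
    done = False
    while not done:
        s, tau = float(observation[0]), float(observation[1])
        action = np.array([bs_call_delta(s, k=50, sigma=.1, tau=tau)])
        observation, reward, done, info = env.step(action)
    delta_rewards.append(float(reward))

pd.Series(delta_rewards).hist(bins=100)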


In [10]:
observation = env.reset()
done = False
df = pd.DataFrame.from_dict({'reward': [], 'observation': []})
while not done:
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    df = df.append(pd.DataFrame.from_dict({'reward': [reward], 'observation': [observation]}))

Different reward functions

Black-Scholes - three different reward functions

  • $r_T = U(V_T)$ where $U$ is a utility function

  • $r_T = - (V_T - E[V_0])^2$

  • $V_T$ with hedging costs and the utility function as in the first case (see the sketch after this list)
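
As code, the three terminal rewards could look roughly as follows. This is only a sketch: the notes do not fix the utility $U$, so an exponential utility with risk aversion lam is used as an example, and ev_0 (for $E[V_0]$) and the hedging-cost term are placeholders.

In [ ]:
# Illustrative sketches; the choice of U, lam and the hedging-cost term are assumptions.

def utility_reward(v_T, lam=1.0):
    """r_T = U(V_T), here with the exponential utility U(v) = -exp(-lam * v)."""
    return -np.exp(-lam * v_T)

def quadratic_reward(v_T, ev_0):
    """r_T = -(V_T - E[V_0])**2."""
    return -(v_T - ev_0) ** 2

def utility_reward_with_costs(v_T, hedging_costs, lam=1.0):
    """As the first reward, applied to the terminal value net of hedging costs."""
    return utility_reward(v_T - hedging_costs, lam)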

Portfolio:

  • 1 option
  • 0 cash
  • 0 stocks

Variations:

  • can only hedge once / X times (see the wrapper sketch below)
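
A minimal sketch of such a variation, assuming the action is the target stock position (so repeating the previous action leaves the portfolio unchanged):

In [ ]:
class LimitedHedgingWrapper(gym.Wrapper):
    """Allow at most max_trades rebalances per episode (illustrative sketch)."""

    def __init__(self, env, max_trades=1):
        super().__init__(env)
        self.max_trades = max_trades

    def reset(self, **kwargs):
        self.trades = 0
        self.last_action = np.zeros(self.action_space.shape)
        return self.env.reset(**kwargs)

    def step(self, action):
        # Pass a new target position through only while trades remain.
        if self.trades < self.max_trades and not np.allclose(action, self.last_action):
            self.last_action = np.asarray(action, dtype=float)
            self.trades += 1
        return self.env.step(self.last_action)

# e.g. limited_env = LimitedHedgingWrapper(gym.make('obs-v2'), max_trades=1)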

Interesting

How to learn the value function?
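
One simple starting point (an illustrative sketch, not something implemented in this notebook) is to estimate the value function of a fixed policy by regressing Monte Carlo returns-to-go on the visited states; the fitted model could then serve as a baseline or feed a policy-improvement step.

In [ ]:
from sklearn.linear_model import Ridge

# Monte Carlo value estimation under a fixed policy (here: never hedge).
states, returns = [], []
for _ in range(200):
    obs = env.reset()
    done = False
    episode_states, episode_rewards = [], []
    while not done:
        episode_states.append(np.asarray(obs, dtype=float).ravel())
        obs, reward, done, info = env.step(np.array([0.]))
        episode_rewards.append(float(reward))
    rtg = np.cumsum(episode_rewards[::-1])[::-1]  # undiscounted return-to-go
    states.extend(episode_states)
    returns.extend(rtg)

value_fn = Ridge(alpha=1.0).fit(np.stack(states), np.array(returns))

The n_iter / batch_size / elite_frac parameters defined at the top look like cross-entropy-method settings, which would sidestep an explicit value function entirely; the regression above is just one alternative angle on the question.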