In [1]:
import os
import gym
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import interactive
interactive(True)
from omstang_lib.trade_gym import trade_gym
In [2]:
env = trade_gym.MarketEnv(os.path.join('Data','daily_sp500_1998-2013'), filename="table_goog.csv", scale=False)
In [3]:
observation = env.reset()
done = False
navs = []
# env.info()
while not done:
    action = 1  # stay flat
    observation, reward, done, info = env.step(action)
    navs.append((info['nav'], info['base']))
    if done:
        print('Annualized return: NAV[{}] BASE[{}]'.format(navs[-1][0] - 1, navs[-1][1] - 1))
data_plot = pd.DataFrame(navs)
data_plot.columns = ['nav', 'base']
data_plot.plot()
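For comparison, here is a minimal sketch of the same manual loop but holding a long position on every step (this assumes action 2 means "long", as the buyandhold lambda in the next cell suggests):
# Sketch: same loop as above, but always long (assumes action 2 == long).
observation = env.reset()
done = False
long_navs = []
while not done:
    observation, reward, done, info = env.step(2)  # go/stay long each day
    long_navs.append((info['nav'], info['base']))
pd.DataFrame(long_navs, columns=['nav', 'base']).plot(title='always long vs. market')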
In [4]:
stayflat = lambda o,e: 1 # stand pat
buyandhold = lambda o,e: 2 # buy on day #1 and hold
randomtrader = lambda o,e: e.action_space.sample() # retail trader
# To run a single simulation we call run_strategy; it returns a DataFrame
# containing all steps of the sim.
bhdf = env.run_strategy(buyandhold)
print(bhdf.head())
# we can easily plot our NAV over time:
bhdf.loc[:,['navs','mkt_nav']].plot(title='buy & hold nav')
Out[4]:
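Since run_strategy returns a DataFrame with a navs column (as bhdf above shows), one run of each of the three toy strategies can be compared side by side. A sketch:
# Sketch: one run of each toy strategy, NAVs plotted together.
comparison = pd.DataFrame({
    'stayflat': env.run_strategy(stayflat).navs,
    'buyandhold': env.run_strategy(buyandhold).navs,
    'randomtrader': env.run_strategy(randomtrader).navs,
})
comparison.plot(title='one run of each toy strategy')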
In [5]:
env.run_strategy(buyandhold).navs.plot(title='same strategy, different results')
env.run_strategy(buyandhold).navs.plot()
env.run_strategy(buyandhold).navs.plot()
Out[5]:
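The same strategy produces a different NAV path on each call, presumably because each simulation covers a different slice of the data. One rough way to quantify that spread, sketched below, is to repeat the run and summarize the final NAVs (the 50-run count is arbitrary):
# Sketch: repeat buy-and-hold and summarize the final NAVs.
final_navs = [env.run_strategy(buyandhold).navs.iloc[-1] for _ in range(50)]
print('final NAV over 50 runs: mean {:.3f}, std {:.3f}'.format(
    np.mean(final_navs), np.std(final_navs)))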
In [6]:
import time
from tensorforce import Configuration
from tensorforce.agents import Agent
from tensorforce.execution import Runner
from tensorforce.contrib.openai_gym import OpenAIGym
from tensorforce.agents import DDQNAgent
class OpenAIGymCustom(OpenAIGym):
    def __init__(self, env):
        self.gym = env
env_train = trade_gym.MarketEnv(os.path.join('Data','daily_sp500_1998-2013'), filename="table_goog.csv", scale=True)
environment = OpenAIGymCustom(env_train)
config_dict = {
    "preprocessing": None,
    "exploration": {
        "type": "epsilon_decay",
        "epsilon": 1.0,
        "epsilon_final": 0.1,
        "epsilon_timesteps": 1e6
    },
    "reward_preprocessing": [
        {
            "type": "clip",
            "min": -1,
            "max": 1
        }
    ],
    "batch_size": 32,
    "memory_capacity": 10000,
    "memory": {
        "type": "replay",
        "random_sampling": True
    },
    "update_frequency": 4,
    "first_update": 50000,
    "repeat_update": 1,
    "target_update_frequency": 10000,
    "discount": 0.97,
    "learning_rate": 0.00025,
    "optimizer": {
        "learning_rate": 0.00025,
        "type": "rmsprop",
        "momentum": 0.95,
        "epsilon": 0.01
    },
    "tf_summary": None,
    "log_level": "info",
    "update_target_weight": 1.0,
    "double_dqn": False,
    "clip_loss": 0.0,
    "huber_loss": None
}
config = Configuration(**config_dict)
# Create a double DQN agent
agent = DDQNAgent(
    states_spec=dict(type='float', shape=(env_train.src.data.shape[1],)),
    actions_spec=dict(type='int', num_actions=len(env_train.actions)),
    network_spec=[
        dict(type='dense', size=64),
        dict(type='dense', size=64)
    ],
    config=config
)
runner = Runner(
    agent=agent,
    environment=environment,
    repeat_actions=1
)
report_episodes = 1000
def episode_finished(r):
    if r.episode % report_episodes == 0:
        steps_per_second = r.timestep / (time.time() - r.start_time)
        print("=============================")
        print("Finished episode {} after {} timesteps. Steps Per Second {}".format(
            r.episode, r.episode_timestep, steps_per_second
        ))
        print("Episode reward: {}".format(r.episode_rewards[-1]))
    return True

runner.run(
    episodes=10000,
    max_episode_timesteps=300,
    episode_finished=episode_finished
)
print("Learning finished. Total episodes: {ep}".format(ep=runner.episode))
print("Average of last 500 rewards: {}".format(sum(runner.episode_rewards[-500:]) / 500))
print("Average of last 100 rewards: {}".format(sum(runner.episode_rewards[-100:]) / 100))
In [ ]: