In this notebook a Q-learner with Dyna-Q will be trained and evaluated. The Q-learner recommends when to buy or sell shares of one particular stock, and in which quantity (more precisely, it determines the desired fraction of the total portfolio value to hold in shares of that stock).
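For illustration, a chosen target fraction can be translated into a number of shares to hold; the following is a minimal sketch with a hypothetical helper, not the simulator's actual order logic:

def shares_for_fraction(fraction, portfolio_value, price):
    # Shares to hold so that `fraction` of the total portfolio value is in the stock.
    return int(fraction * portfolio_value / price)

# Example: with a 10,000 USD portfolio and the stock at 125 USD,
# shares_for_fraction(1.0, 10000.0, 125.0) -> 80 shares; fraction 0.0 means all cash.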


In [1]:
# Basic imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt
import scipy.optimize as spo
import sys
from time import time
from sklearn.metrics import r2_score, median_absolute_error
from multiprocessing import Pool

%matplotlib inline

%pylab inline
pylab.rcParams['figure.figsize'] = (20.0, 10.0)

%load_ext autoreload
%autoreload 2

sys.path.append('../../')

import recommender.simulator as sim
from utils.analysis import value_eval
from recommender.agent import Agent
from functools import partial


Populating the interactive namespace from numpy and matplotlib

In [11]:
NUM_THREADS = 1
LOOKBACK = 252*2 + 28
STARTING_DAYS_AHEAD = 20
POSSIBLE_FRACTIONS = [0.0, 1.0]
DYNA = 20

# Get the data
SYMBOL = 'SPY'
total_data_train_df = pd.read_pickle('../../data/data_train_val_df.pkl').stack(level='feature')
data_train_df = total_data_train_df[SYMBOL].unstack()
total_data_test_df = pd.read_pickle('../../data/data_test_df.pkl').stack(level='feature')
data_test_df = total_data_test_df[SYMBOL].unstack()
if LOOKBACK == -1:
    total_data_in_df = total_data_train_df
    data_in_df = data_train_df
else:
    data_in_df = data_train_df.iloc[-LOOKBACK:]
    total_data_in_df = total_data_train_df.loc[data_in_df.index[0]:]

# Create many agents
index = np.arange(NUM_THREADS).tolist()
env, num_states, num_actions = sim.initialize_env(total_data_in_df, 
                                                  SYMBOL, 
                                                  starting_days_ahead=STARTING_DAYS_AHEAD,
                                                  possible_fractions=POSSIBLE_FRACTIONS)
agents = [Agent(num_states=num_states, 
                num_actions=num_actions, 
                random_actions_rate=0.98, 
                random_actions_decrease=0.999,
                dyna_iterations=DYNA,
                name='Agent_{}'.format(i)) for i in index]
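DYNA = 20 means that, after each real interaction with the environment, the agent runs 20 additional planning updates using a model built from transitions it has already observed. A minimal tabular sketch of that planning step, assuming a deterministic "last seen outcome" model and illustrative alpha/gamma values (the actual Agent implementation may differ):

import random

def dyna_planning(Q, model, dyna_iterations=20, alpha=0.2, gamma=0.9):
    # Q: (num_states x num_actions) array; model maps a visited (state, action)
    # pair to the last observed (reward, next_state).
    for _ in range(dyna_iterations):
        s, a = random.choice(list(model.keys()))   # replay a previously visited pair
        r, s_prime = model[(s, a)]
        Q[s, a] = (1 - alpha) * Q[s, a] + alpha * (r + gamma * Q[s_prime].max())
    return Q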

In [3]:
def show_results(results_list, data_in_df, graph=False):
    """Print the evaluation metrics for each simulated portfolio and, optionally,
    plot its value against the symbol's close price (both normalized to 1.0)."""
    for values in results_list:
        total_value = values.sum(axis=1)
        print('Sharpe ratio: {}\nCum. Ret.: {}\nAVG_DRET: {}\nSTD_DRET: {}\nFinal value: {}'.format(
            *value_eval(pd.DataFrame(total_value))))
        print('-'*100)
        initial_date = total_value.index[0]
        compare_results = data_in_df.loc[initial_date:, 'Close'].copy()
        compare_results.name = SYMBOL
        compare_results_df = pd.DataFrame(compare_results)
        compare_results_df['portfolio'] = total_value
        std_comp_df = compare_results_df / compare_results_df.iloc[0]  # normalize both series to 1.0
        if graph:
            plt.figure()
            std_comp_df.plot()

Let's look at the symbol's data to see how well the recommender has to perform.


In [4]:
print('Sharpe ratio: {}\nCum. Ret.: {}\nAVG_DRET: {}\nSTD_DRET: {}\nFinal value: {}'.format(*value_eval(pd.DataFrame(data_in_df['Close'].iloc[STARTING_DAYS_AHEAD:]))))


Sharpe ratio: 0.8869740266324404
Cum. Ret.: 0.6466912353789458
AVG_DRET: 0.0005379574180719433
STD_DRET: 0.009628026310294057
Final value: 205.54
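These figures follow the usual definitions over the daily close prices; a minimal sketch of how such metrics could be computed (value_eval's actual implementation may differ, e.g. in how the Sharpe ratio is annualized):

def evaluate(prices, samples_per_year=252):
    # prices: pandas Series of daily portfolio values (or close prices).
    daily_returns = prices.pct_change().dropna()
    avg_dret = daily_returns.mean()
    std_dret = daily_returns.std()
    sharpe = np.sqrt(samples_per_year) * avg_dret / std_dret  # annualized, zero risk-free rate
    cum_ret = prices.iloc[-1] / prices.iloc[0] - 1
    return sharpe, cum_ret, avg_dret, std_dret, prices.iloc[-1]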

In [5]:
# Simulate (with new envs, each time)
n_epochs = 4

for i in range(n_epochs):
    tic = time()
    env.reset(STARTING_DAYS_AHEAD)
    results_list = sim.simulate_period(total_data_in_df, 
                                       SYMBOL,
                                       agents[0],
                                       starting_days_ahead=STARTING_DAYS_AHEAD,
                                       possible_fractions=POSSIBLE_FRACTIONS,
                                       verbose=False,
                                       other_env=env)
    toc = time()
    print('Epoch: {}'.format(i))
    print('Elapsed time: {} seconds.'.format((toc-tic)))
    print('Random Actions Rate: {}'.format(agents[0].random_actions_rate))
    show_results([results_list], data_in_df)


Starting simulation for agent: Agent_0. 1036 days of simulation to go.
Date 2014-12-18 00:00:00 (simulating until 2014-12-31 00:00:00).  Time: 0.6086065769195557s.  Value: 14547.709999999935..Epoch: 0
Elapsed time: 284.6141257286072 seconds.
Random Actions Rate: 0.003998027990937788
Sharpe ratio: 0.24676776055735977
Cum. Ret.: 0.4547709999999936
AVG_DRET: 8.221453577492392e-05
STD_DRET: 0.005288840373404707
Final value: 14547.709999999935
----------------------------------------------------------------------------------------------------
Starting simulation for agent: Agent_0. 1036 days of simulation to go.
Date 2014-12-18 00:00:00 (simulating until 2014-12-31 00:00:00).  Time: 0.5835316181182861s.  Value: 18871.720000000012..Epoch: 1
Elapsed time: 274.6330382823944 seconds.
Random Actions Rate: 1.6310436547267375e-05
Sharpe ratio: 0.9931526333183366
Cum. Ret.: 0.8871720000000012
AVG_DRET: 0.00011724670971876491
STD_DRET: 0.001874066234431467
Final value: 18871.720000000012
----------------------------------------------------------------------------------------------------
Starting simulation for agent: Agent_0. 1036 days of simulation to go.
Date 2014-12-18 00:00:00 (simulating until 2014-12-31 00:00:00).  Time: 0.6539068222045898s.  Value: 18098.120000000014..Epoch: 2
Elapsed time: 275.2099859714508 seconds.
Random Actions Rate: 6.654038965345852e-08
Sharpe ratio: 1.0030289904718677
Cum. Ret.: 0.8098120000000013
AVG_DRET: 0.00010937916417306911
STD_DRET: 0.0017310969259895783
Final value: 18098.120000000014
----------------------------------------------------------------------------------------------------
Starting simulation for agent: Agent_0. 1036 days of simulation to go.
Date 2014-12-18 00:00:00 (simulating until 2014-12-31 00:00:00).  Time: 0.5924780368804932s.  Value: 18511.550000000002..Epoch: 3
Elapsed time: 287.39360070228577 seconds.
Random Actions Rate: 2.714595309820745e-10
Sharpe ratio: 1.0712387802473649
Cum. Ret.: 0.8511549999999999
AVG_DRET: 0.00011339995809055928
STD_DRET: 0.00168045496480338
Final value: 18511.55
----------------------------------------------------------------------------------------------------

In [6]:
env.reset(STARTING_DAYS_AHEAD)
results_list = sim.simulate_period(total_data_in_df, 
                                   SYMBOL, agents[0], 
                                   learn=False, 
                                   starting_days_ahead=STARTING_DAYS_AHEAD,
                                   possible_fractions=POSSIBLE_FRACTIONS,
                                   other_env=env)
show_results([results_list], data_in_df, graph=True)


Starting simulation for agent: Agent_0. 1036 days of simulation to go.
Date 2014-12-18 00:00:00 (simulating until 2014-12-31 00:00:00).  Time: 0.6854684352874756s.  Value: 18511.550000000002..Sharpe ratio: 1.0712387802473649
Cum. Ret.: 0.8511549999999999
AVG_DRET: 0.00011339995809055928
STD_DRET: 0.00168045496480338
Final value: 18511.55
----------------------------------------------------------------------------------------------------
[Figure: normalized portfolio value vs. SPY close, training period]

Let's run the trained agent on the test set.

First, a non-learning test. This scenario is worse than what is actually achievable: the Q-learner could keep learning from past samples in the test set without compromising causality.
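Conceptually, such an online test only updates the Q-table with transitions that have already been observed, so no future information leaks into today's decision. A minimal sketch of that loop with hypothetical environment/agent method names (the actual recommender API differs):

state = env.reset(TEST_DAYS_AHEAD)
action = agent.act(state)                            # first decision, nothing learned yet
while not env.is_done():                             # hypothetical methods, for illustration only
    reward, next_state = env.step(action)            # outcome of the day that just closed
    agent.update(state, action, reward, next_state)  # learn only from the completed transition
    state = next_state
    action = agent.act(state)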


In [12]:
TEST_DAYS_AHEAD = 20

env.set_test_data(total_data_test_df, TEST_DAYS_AHEAD)
tic = time()
results_list = sim.simulate_period(total_data_test_df, 
                                    SYMBOL,
                                    agents[0],
                                    learn=False,
                                    starting_days_ahead=TEST_DAYS_AHEAD,
                                    possible_fractions=POSSIBLE_FRACTIONS,
                                    verbose=False,
                                    other_env=env)
toc = time()
print('Epoch: {}'.format(i))
print('Elapsed time: {} seconds.'.format((toc-tic)))
print('Random Actions Rate: {}'.format(agents[0].random_actions_rate))
show_results([results_list], data_test_df, graph=True)


Starting simulation for agent: Agent_0. 504 days of simulation to go.
Date 2016-12-28 00:00:00 (simulating until 2016-12-30 00:00:00).  Time: 0.46126818656921387s.  Value: 9390.339999999998.Epoch: 3
Elapsed time: 18.92051362991333 seconds.
Random Actions Rate: 0.98
Sharpe ratio: -0.28109471430005456
Cum. Ret.: -0.06432800000000027
AVG_DRET: -0.00011630382002899302
STD_DRET: 0.006568127438961283
Final value: 9356.719999999998
----------------------------------------------------------------------------------------------------
[Figure: normalized portfolio value vs. SPY close, test period (no learning)]

And now a "realistic" test, in which the learner continues to learn from past samples in the test set (it even makes some random moves, though very few).


In [13]:
env.set_test_data(total_data_test_df, TEST_DAYS_AHEAD)
tic = time()
results_list = sim.simulate_period(total_data_test_df, 
                                    SYMBOL,
                                    agents[0],
                                    learn=True,
                                    starting_days_ahead=TEST_DAYS_AHEAD,
                                    possible_fractions=POSSIBLE_FRACTIONS,
                                    verbose=False,
                                    other_env=env)
toc = time()
print('Epoch: {}'.format(i))
print('Elapsed time: {} seconds.'.format((toc-tic)))
print('Random Actions Rate: {}'.format(agents[0].random_actions_rate))
show_results([results_list], data_test_df, graph=True)


Starting simulation for agent: Agent_0. 504 days of simulation to go.
Date 2016-12-28 00:00:00 (simulating until 2016-12-30 00:00:00).  Time: 0.5135505199432373s.  Value: 9709.180000000008.Epoch: 3
Elapsed time: 20.417168140411377 seconds.
Random Actions Rate: 0.6044451383651991
Sharpe ratio: -0.11302234275553102
Cum. Ret.: -0.02929699999999935
AVG_DRET: -4.3242017535291385e-05
STD_DRET: 0.0060735402468805955
Final value: 9707.030000000006
----------------------------------------------------------------------------------------------------
[Figure: normalized portfolio value vs. SPY close, test period (with learning)]

What are the metrics for "holding the position"?


In [9]:
print('Sharpe ratio: {}\nCum. Ret.: {}\nAVG_DRET: {}\nSTD_DRET: {}\nFinal value: {}'.format(*value_eval(pd.DataFrame(data_test_df['Close'].iloc[STARTING_DAYS_AHEAD:]))))


Sharpe ratio: 0.44271542660031676
Cum. Ret.: 0.1070225832012679
AVG_DRET: 0.00025103195406808796
STD_DRET: 0.009001287260690292
Final value: 223.53

Conclusion: the agent comfortably beats buy-and-hold on the training period, but on the test set both runs end with a negative cumulative return (about -6.4% without learning, -2.9% with online learning) versus roughly +10.7% for simply holding SPY, so the learned policy does not generalize well to the test period.


In [10]:
import pickle
with open('../../data/simple_q_learner_fast_learner_full_training.pkl', 'wb') as best_agent:
    pickle.dump(agents[0], best_agent)
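The pickled agent can later be restored in another notebook or script, for example:

with open('../../data/simple_q_learner_fast_learner_full_training.pkl', 'rb') as f:
    best_agent = pickle.load(f)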

In [ ]: