In this notebook a simple Q-learner is trained and evaluated. The Q-learner recommends when to buy or sell shares of one particular stock, and in what quantity (more precisely, it determines the desired fraction of the total portfolio value to be held in shares). An initial attempt was made to train the Q-learner with multiple processes, but it was unsuccessful.
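
For reference, the core of any tabular Q-learner is the one-step value update sketched below. This is only an illustrative sketch; the function and hyperparameter names are assumptions and do not reflect the actual internals of recommender.agent.Agent.


In [ ]:
# Minimal sketch of the tabular Q-learning update (illustrative only;
# the real Agent may differ in learning rate, discount and decay details).
import numpy as np

def q_update(Q, state, action, reward, next_state, alpha=0.2, gamma=0.95):
    # Move Q[state, action] toward the bootstrapped target
    # reward + gamma * max_a' Q[next_state, a'].
    target = reward + gamma * Q[next_state].max()
    Q[state, action] += alpha * (target - Q[state, action])
    return Q

# Example: 10 discretized market states, 2 actions (0% or 100% invested).
Q = np.zeros((10, 2))
Q = q_update(Q, state=3, action=1, reward=0.01, next_state=4)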


In [1]:
# Basic imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt
import scipy.optimize as spo
import sys
from time import time
from sklearn.metrics import r2_score, median_absolute_error
from multiprocessing import Pool

%matplotlib inline

%pylab inline
pylab.rcParams['figure.figsize'] = (20.0, 10.0)

%load_ext autoreload
%autoreload 2

sys.path.append('../../')

import recommender.simulator as sim
from utils.analysis import value_eval
from recommender.agent import Agent
from functools import partial


Populating the interactive namespace from numpy and matplotlib

In [2]:
NUM_THREADS = 1
LOOKBACK = 252*2 + 28
STARTING_DAYS_AHEAD = 20
POSSIBLE_FRACTIONS = [0.0, 1.0]

# Get the data
SYMBOL = 'SPY'
total_data_train_df = pd.read_pickle('../../data/data_train_val_df.pkl').stack(level='feature')
data_train_df = total_data_train_df[SYMBOL].unstack()
total_data_test_df = pd.read_pickle('../../data/data_test_df.pkl').stack(level='feature')
data_test_df = total_data_test_df[SYMBOL].unstack()
if LOOKBACK == -1:
    total_data_in_df = total_data_train_df
    data_in_df = data_train_df
else:
    data_in_df = data_train_df.iloc[-LOOKBACK:]
    total_data_in_df = total_data_train_df.loc[data_in_df.index[0]:]

# Create the agents (one per thread; just one here, since NUM_THREADS = 1)
index = np.arange(NUM_THREADS).tolist()
env, num_states, num_actions = sim.initialize_env(total_data_in_df, 
                                                  SYMBOL, 
                                                  starting_days_ahead=STARTING_DAYS_AHEAD,
                                                  possible_fractions=POSSIBLE_FRACTIONS)
agents = [Agent(num_states=num_states, 
                num_actions=num_actions, 
                random_actions_rate=0.98, 
                random_actions_decrease=0.999,
                dyna_iterations=0,
                name='Agent_{}'.format(i)) for i in index]

In [3]:
def show_results(results_list, data_in_df, graph=False):
    for values in results_list:
        total_value = values.sum(axis=1)
        print('Sharpe ratio: {}\nCum. Ret.: {}\nAVG_DRET: {}\nSTD_DRET: {}\nFinal value: {}'.format(*value_eval(pd.DataFrame(total_value))))
        print('-'*100)
        initial_date = total_value.index[0]
        compare_results = data_in_df.loc[initial_date:, 'Close'].copy()
        compare_results.name = SYMBOL
        compare_results_df = pd.DataFrame(compare_results)
        compare_results_df['portfolio'] = total_value
        std_comp_df = compare_results_df / compare_results_df.iloc[0]
        if graph:
            std_comp_df.plot()  # DataFrame.plot creates its own figure; no extra plt.figure() needed

Let's look at the symbol's data to see how good the recommender has to be.


In [4]:
print('Sharpe ratio: {}\nCum. Ret.: {}\nAVG_DRET: {}\nSTD_DRET: {}\nFinal value: {}'.format(*value_eval(pd.DataFrame(data_in_df['Close'].iloc[STARTING_DAYS_AHEAD:]))))


Sharpe ratio: 1.601691549431671
Cum. Ret.: 0.4244923418116293
AVG_DRET: 0.0007179294312480581
STD_DRET: 0.00711546265440581
Final value: 205.54

In [5]:
# Simulate (the environment is reset at the start of each epoch)
n_epochs = 4

for i in range(n_epochs):
    tic = time()
    env.reset(STARTING_DAYS_AHEAD)
    results_list = sim.simulate_period(total_data_in_df, 
                                       SYMBOL,
                                       agents[0],
                                       starting_days_ahead=STARTING_DAYS_AHEAD,
                                       possible_fractions=POSSIBLE_FRACTIONS,
                                       verbose=False,
                                       other_env=env)
    toc = time()
    print('Epoch: {}'.format(i))
    print('Elapsed time: {} seconds.'.format((toc-tic)))
    print('Random Actions Rate: {}'.format(agents[0].random_actions_rate))
    show_results([results_list], data_in_df)


Starting simulation for agent: Agent_0
Epoch: 0
Elapsed time: 61.33071255683899 seconds.
Random Actions Rate: 0.06845808665396963
Sharpe ratio: 1.6166918533402772
Cum. Ret.: 0.30129899999999843
AVG_DRET: 0.0005300292047664219
STD_DRET: 0.005204425792766552
Final value: 13012.989999999985
----------------------------------------------------------------------------------------------------
Starting simulation for agent: Agent_0
Epoch: 1
Elapsed time: 62.5613374710083 seconds.
Random Actions Rate: 0.004782152681961647
Sharpe ratio: 3.0987464363932657
Cum. Ret.: 0.5437169999999996
AVG_DRET: 0.0008614046111994861
STD_DRET: 0.004412872933399741
Final value: 15437.169999999995
----------------------------------------------------------------------------------------------------
Starting simulation for agent: Agent_0
Epoch: 2
Elapsed time: 58.34930729866028 seconds.
Random Actions Rate: 0.00033405818642269795
Sharpe ratio: 3.540663107916942
Cum. Ret.: 0.6325260000000001
AVG_DRET: 0.0009709191680926959
STD_DRET: 0.00435310095926684
Final value: 16325.260000000002
----------------------------------------------------------------------------------------------------
Starting simulation for agent: Agent_0
Epoch: 3
Elapsed time: 59.530198097229004 seconds.
Random Actions Rate: 2.3335698238360173e-05
Sharpe ratio: 3.6713911941383337
Cum. Ret.: 0.649601000000001
AVG_DRET: 0.000991045870588251
STD_DRET: 0.004285123713790579
Final value: 16496.01000000001
----------------------------------------------------------------------------------------------------

In [8]:
env.reset(STARTING_DAYS_AHEAD)
results_list = sim.simulate_period(total_data_in_df, 
                                   SYMBOL, agents[0], 
                                   learn=False, 
                                   starting_days_ahead=STARTING_DAYS_AHEAD,
                                   possible_fractions=POSSIBLE_FRACTIONS,
                                   other_env=env)
show_results([results_list], data_in_df, graph=True)


Starting simulation for agent: Agent_0
Sharpe ratio: 3.6713911941383337
Cum. Ret.: 0.649601000000001
AVG_DRET: 0.000991045870588251
STD_DRET: 0.004285123713790579
Final value: 16496.01000000001
----------------------------------------------------------------------------------------------------
[Figure: normalized portfolio value vs. SPY over the training period]

Let's run the trained agent on the test set.

First, a non-learning test. This scenario is worse than what is actually achievable: the Q-learner could keep learning from past samples in the test set without compromising causality.


In [15]:
TEST_DAYS_AHEAD = 20

env.set_test_data(total_data_test_df, TEST_DAYS_AHEAD)
tic = time()
results_list = sim.simulate_period(total_data_test_df, 
                                    SYMBOL,
                                    agents[0],
                                    learn=False,
                                    starting_days_ahead=TEST_DAYS_AHEAD,
                                    possible_fractions=POSSIBLE_FRACTIONS,
                                    verbose=False,
                                    other_env=env)
toc = time()
print('Epoch: {}'.format(i))
print('Elapsed time: {} seconds.'.format((toc-tic)))
print('Random Actions Rate: {}'.format(agents[0].random_actions_rate))
show_results([results_list], data_test_df, graph=True)


Starting simulation for agent: Agent_0
Epoch: 3
Elapsed time: 55.592562198638916 seconds.
Random Actions Rate: 1.8752152647281067e-06
Sharpe ratio: 0.7870295592337418
Cum. Ret.: 0.12176800000000121
AVG_DRET: 0.0002512456865369278
STD_DRET: 0.005067664334246275
Final value: 11217.680000000011
----------------------------------------------------------------------------------------------------
[Figure: normalized portfolio value vs. SPY over the test period (no learning)]

And now a "realistic" test, in which the learner continues to learn from past samples in the test set (it even makes some random moves, though very few).


In [16]:
env.set_test_data(total_data_test_df, TEST_DAYS_AHEAD)
tic = time()
results_list = sim.simulate_period(total_data_test_df, 
                                    SYMBOL,
                                    agents[0],
                                    learn=True,
                                    starting_days_ahead=TEST_DAYS_AHEAD,
                                    possible_fractions=POSSIBLE_FRACTIONS,
                                    verbose=False,
                                    other_env=env)
toc = time()
print('Epoch: {}'.format(i))
print('Elapsed time: {} seconds.'.format((toc-tic)))
print('Random Actions Rate: {}'.format(agents[0].random_actions_rate))
show_results([results_list], data_test_df, graph=True)


Starting simulation for agent: Agent_0
Epoch: 3
Elapsed time: 57.57186007499695 seconds.
Random Actions Rate: 1.5068896817018496e-07
Sharpe ratio: 0.6240379120840598
Cum. Ret.: 0.09198600000000079
AVG_DRET: 0.00019485477359206172
STD_DRET: 0.00495678800324824
Final value: 10919.860000000008
----------------------------------------------------------------------------------------------------
[Figure: normalized portfolio value vs. SPY over the test period (learning enabled)]

What are the metrics for "holding the position"?


In [19]:
print('Sharpe ratio: {}\nCum. Ret.: {}\nAVG_DRET: {}\nSTD_DRET: {}\nFinal value: {}'.format(*value_eval(pd.DataFrame(data_test_df['Close'].iloc[TEST_DAYS_AHEAD:]))))


Sharpe ratio: 0.44271542660031676
Cum. Ret.: 0.1070225832012679
AVG_DRET: 0.00025103195406808796
STD_DRET: 0.009001287260690292
Final value: 223.53

Conclusion: the Sharpe ratio is clearly better than the benchmark's. The cumulative return is similar (slightly better without learning, slightly worse with learning).
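
For easier comparison, the test-set figures from the runs above:

                        Sharpe ratio    Cum. return    Final value
Buy and hold (SPY)      0.443           0.107          223.53 (price)
Agent, learn=False      0.787           0.122          11217.68
Agent, learn=True       0.624           0.092          10919.86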


In [18]:
import pickle
with open('../../data/simple_q_learner_fast_learner.pkl', 'wb') as best_agent:
    pickle.dump(agents[0], best_agent)
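
For later reuse, the saved agent can be loaded back with the standard pickle API (the path matches the one used above):

In [ ]:
# Load the trained agent back from disk (counterpart of the dump above).
import pickle
with open('../../data/simple_q_learner_fast_learner.pkl', 'rb') as f:
    best_agent = pickle.load(f)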
