In this notebook, a simple Q-learner is trained and evaluated. The Q-learner recommends when to buy or sell shares of one particular stock, and in what quantity (more precisely, it determines the desired fraction of the total portfolio value to be held in shares of that stock). An initial attempt was made to train the Q-learner with multiple processes, but it was unsuccessful.
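
For reference, the core of a tabular Q-learner is the one-step update sketched below. This is a generic illustration, not necessarily what the Agent class does internally; the learning rate alpha, the discount gamma and the array-based Q table are illustrative assumptions.

import numpy as np

def q_update(Q, state, action, reward, new_state, alpha=0.2, gamma=0.95):
    # Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a'))
    best_next = np.max(Q[new_state])
    Q[state, action] = (1 - alpha) * Q[state, action] + alpha * (reward + gamma * best_next)
    return Q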


In [1]:
# Basic imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt
import scipy.optimize as spo
import sys
from time import time
from sklearn.metrics import r2_score, median_absolute_error
from multiprocessing import Pool

%matplotlib inline

%pylab inline
pylab.rcParams['figure.figsize'] = (20.0, 10.0)

%load_ext autoreload
%autoreload 2

sys.path.append('../../')

import recommender.simulator as sim
from utils.analysis import value_eval
from recommender.agent import Agent
from functools import partial


Populating the interactive namespace from numpy and matplotlib

In [2]:
NUM_THREADS = 1
LOOKBACK = 252*2 + 28
STARTING_DAYS_AHEAD = 20
POSSIBLE_FRACTIONS = [0.0, 1.0]

# Get the data
SYMBOL = 'SPY'
total_data_train_df = pd.read_pickle('../../data/data_train_val_df.pkl').stack(level='feature')
data_train_df = total_data_train_df[SYMBOL].unstack()
total_data_test_df = pd.read_pickle('../../data/data_test_df.pkl').stack(level='feature')
data_test_df = total_data_test_df[SYMBOL].unstack()
if LOOKBACK == -1:
    total_data_in_df = total_data_train_df
    data_in_df = data_train_df
else:
    data_in_df = data_train_df.iloc[-LOOKBACK:]
    total_data_in_df = total_data_train_df.loc[data_in_df.index[0]:]

# Create one agent per thread (just one here, since the multiprocessing attempt was abandoned)
index = np.arange(NUM_THREADS).tolist()
env, num_states, num_actions = sim.initialize_env(total_data_train_df, 
                                                  SYMBOL, 
                                                  starting_days_ahead=STARTING_DAYS_AHEAD,
                                                  possible_fractions=POSSIBLE_FRACTIONS)
agents = [Agent(num_states=num_states, 
                num_actions=num_actions, 
                random_actions_rate=0.98, 
                random_actions_decrease=0.9999,
                dyna_iterations=0,
                name='Agent_{}'.format(i)) for i in index]

In [3]:
def show_results(results_list, data_in_df, graph=False):
    """Print the evaluation metrics for each simulation result and, optionally,
    plot the portfolio value against the benchmark symbol."""
    for values in results_list:
        total_value = values.sum(axis=1)
        print('Sharpe ratio: {}\nCum. Ret.: {}\nAVG_DRET: {}\nSTD_DRET: {}\nFinal value: {}'.format(
            *value_eval(pd.DataFrame(total_value))))
        print('-'*100)
        initial_date = total_value.index[0]
        compare_results = data_in_df.loc[initial_date:, 'Close'].copy()
        compare_results.name = SYMBOL
        compare_results_df = pd.DataFrame(compare_results)
        compare_results_df['portfolio'] = total_value
        # Normalize both series to 1.0 at the start date so they are directly comparable.
        std_comp_df = compare_results_df / compare_results_df.iloc[0]
        if graph:
            plt.figure()
            std_comp_df.plot()

Let's show the symbol's data, to see how good the recommender has to be.
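
Judging by the labels printed below, value_eval seems to report the Sharpe ratio, cumulative return, mean and standard deviation of the daily returns, and the final value. The sketch below shows how such metrics are usually computed from a series of closing prices; it is an assumption consistent with the printed labels, not the actual value_eval code.

import numpy as np

def basic_metrics(close_prices, periods_per_year=252):
    # close_prices: a pandas Series of closing prices, in chronological order.
    daily_returns = close_prices.pct_change().dropna()
    avg_dret = daily_returns.mean()
    std_dret = daily_returns.std()
    sharpe = np.sqrt(periods_per_year) * avg_dret / std_dret  # annualized, zero risk-free rate
    cum_ret = close_prices.iloc[-1] / close_prices.iloc[0] - 1.0
    return sharpe, cum_ret, avg_dret, std_dret, close_prices.iloc[-1]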


In [4]:
print('Sharpe ratio: {}\nCum. Ret.: {}\nAVG_DRET: {}\nSTD_DRET: {}\nFinal value: {}'.format(*value_eval(pd.DataFrame(data_in_df['Close'].iloc[STARTING_DAYS_AHEAD:]))))


Sharpe ratio: 1.601691549431671
Cum. Ret.: 0.4244923418116293
AVG_DRET: 0.0007179294312480581
STD_DRET: 0.00711546265440581
Final value: 205.54

In [5]:
# Simulate (with new envs, each time)
n_epochs = 15

for i in range(n_epochs):
    tic = time()
    results_list = sim.simulate_period(total_data_in_df, 
                                       SYMBOL,
                                       agents[0],
                                       starting_days_ahead=STARTING_DAYS_AHEAD,
                                       possible_fractions=POSSIBLE_FRACTIONS,
                                       verbose=False)
    toc = time()
    print('Epoch: {}'.format(i))
    print('Elapsed time: {} seconds.'.format((toc-tic)))
    print('Random Actions Rate: {}'.format(agents[0].random_actions_rate))
    show_results([results_list], data_in_df)


Starting simulation for agent: Agent_0
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-5-3c55bc663feb> in <module>()
      9                                        starting_days_ahead=STARTING_DAYS_AHEAD,
     10                                        possible_fractions=POSSIBLE_FRACTIONS,
---> 11                                        verbose=False)
     12     toc = time()
     13     print('Epoch: {}'.format(i))

/home/miguel/udacity/Machine Learning Nanodegree/projects/capstone/capstone/recommender/simulator.py in simulate_period(data_df, symbol, agent, other_env, verbose, learn, starting_days_ahead, possible_fractions)
     85     recorded_cash_value = {}
     86     for i in range(N_iters):
---> 87         reward, new_state = env.get_consequences_from_fraction_index(fraction_index)
     88 
     89         if verbose:

/home/miguel/udacity/Machine Learning Nanodegree/projects/capstone/capstone/recommender/environment.py in get_consequences_from_fraction_index(self, fraction_index)
    110     def get_consequences_from_fraction_index(self, fraction_index):
    111         target_fraction = self.actions_fractions.interval_to_value(fraction_index)
--> 112         return self.act_to_target(target_fraction)
    113 
    114     def reward_final_value(self, old_pos_df, new_pos_df):

/home/miguel/udacity/Machine Learning Nanodegree/projects/capstone/capstone/recommender/environment.py in act_to_target(self, target_fraction)
    106         shares_increase = wanted_shares - previous_shares
    107         action = [Order([self.symbol, Order.BUY, shares_increase])]
--> 108         return self.get_consequences(action)
    109 
    110     def get_consequences_from_fraction_index(self, fraction_index):

/home/miguel/udacity/Machine Learning Nanodegree/projects/capstone/capstone/recommender/environment.py in get_consequences(self, action)
     86         new_positions_df = self.portfolio.get_positions()
     87         reward = self.reward_fun(old_positions_df, new_positions_df)
---> 88         new_state = self.vector_to_state(self.extract_indicators(self.data_df[:self.portfolio.current_date]))
     89         return reward, new_state
     90 

/home/miguel/udacity/Machine Learning Nanodegree/projects/capstone/capstone/recommender/environment.py in extract_indicators(self, data_df)
     67     def extract_indicators(self, data_df):
     68         """ Returns a vector state with the quantized index of all the indicators. """
---> 69         return tuple(map(lambda x: x.extract(data_df[self.symbol].unstack()), self.indicators.values()))
     70 
     71     def initialize_states(self):

/home/miguel/udacity/Machine Learning Nanodegree/projects/capstone/capstone/recommender/environment.py in <lambda>(x)
     67     def extract_indicators(self, data_df):
     68         """ Returns a vector state with the quantized index of all the indicators. """
---> 69         return tuple(map(lambda x: x.extract(data_df[self.symbol].unstack()), self.indicators.values()))
     70 
     71     def initialize_states(self):

(... lower-level pandas frames elided: Series.unstack -> reshape.unstack -> _Unstacker.get_result -> get_new_index -> DatetimeIndex.take ...)

KeyboardInterrupt: 

In [ ]:
results_list = sim.simulate_period(total_data_in_df, 
                                   SYMBOL, agents[0], 
                                   learn=False, 
                                   starting_days_ahead=STARTING_DAYS_AHEAD,
                                   possible_fractions=POSSIBLE_FRACTIONS,)
show_results([results_list], data_in_df, graph=True)

Let's run the trained agent on the test set.

First, a non-learning test. This scenario is actually harsher than necessary: the Q-learner could keep learning from past samples of the test set without compromising causality.
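
To make the causality point concrete, a walk-forward test looks roughly like the loop below: the agent always acts on the current state, and any Q-update only uses the reward and transition that have already been observed. This is a schematic sketch with hypothetical agent method names (choose_action, update), not the actual simulate_period code.

def walk_forward(env, agent, state, n_steps, learn=False):
    # At every step the agent only sees data up to the current day.
    for _ in range(n_steps):
        action = agent.choose_action(state)                        # hypothetical method
        reward, new_state = env.get_consequences_from_fraction_index(action)
        if learn:
            agent.update(state, action, reward, new_state)         # hypothetical Q-update from observed data only
        state = new_state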


In [ ]:
env, num_states, num_actions = sim.initialize_env(total_data_test_df, 
                                                  SYMBOL,
                                                  starting_days_ahead=STARTING_DAYS_AHEAD,
                                                  possible_fractions=POSSIBLE_FRACTIONS)
tic = time()
results_list = sim.simulate_period(total_data_test_df, 
                                    SYMBOL,
                                    agents[0],
                                    learn=False,
                                    starting_days_ahead=STARTING_DAYS_AHEAD,
                                    possible_fractions=POSSIBLE_FRACTIONS,
                                    verbose=False)
toc = time()
print('Elapsed time: {} seconds.'.format((toc-tic)))
print('Random Actions Rate: {}'.format(agents[0].random_actions_rate))
show_results([results_list], data_test_df, graph=True)

And now a "realistic" test, in which the learner continues to learn from past samples in the test set (it even makes some random moves, though very few).


In [ ]:
env, num_states, num_actions = sim.initialize_env(total_data_test_df, 
                                                  SYMBOL,
                                                  starting_days_ahead=STARTING_DAYS_AHEAD,
                                                  possible_fractions=POSSIBLE_FRACTIONS)
tic = time()
results_list = sim.simulate_period(total_data_test_df, 
                                    SYMBOL,
                                    agents[0],
                                    learn=True,
                                    starting_days_ahead=STARTING_DAYS_AHEAD,
                                    possible_fractions=POSSIBLE_FRACTIONS,
                                    verbose=False)
toc = time()
print('Elapsed time: {} seconds.'.format((toc-tic)))
print('Random Actions Rate: {}'.format(agents[0].random_actions_rate))
show_results([results_list], data_test_df, graph=True)

What are the metrics for simply "holding the position" (buy and hold)?


In [ ]:
print('Sharpe ratio: {}\nCum. Ret.: {}\nAVG_DRET: {}\nSTD_DRET: {}\nFinal value: {}'.format(*value_eval(pd.DataFrame(data_test_df['Close'].iloc[STARTING_DAYS_AHEAD:]))))

In [ ]:
import pickle
with open('../../data/simple_q_learner.pkl', 'wb') as best_agent:
    pickle.dump(agents[0], best_agent)
