In [1]:
# Basic imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt
import scipy.optimize as spo
import sys
from time import time
from sklearn.metrics import r2_score, median_absolute_error

%matplotlib inline

%pylab inline
pylab.rcParams['figure.figsize'] = (20.0, 10.0)

%load_ext autoreload
%autoreload 2

sys.path.append('../../')
import predictor.feature_extraction as fe
import utils.preprocessing as pp
import utils.misc as misc


Populating the interactive namespace from numpy and matplotlib

In the previous notebook some hyperparameter exploration was made for the Random Forest Regressor. Let's see which are now the best predictors for each number of "ahead days".


In [24]:
best_raw_params = pd.read_pickle('../../data/best_dataset_params_raw_df.pkl')

def keep_max_r2(record):
    return record.loc[np.argmax(record['r2']),:]

best_ini_pred_df = best_raw_params.groupby('ahead_days').apply(keep_max_r2)
best_ini_pred_df


Out[24]:
GOOD_DATA_RATIO SAMPLES_GOOD_DATA_RATIO ahead_days base_days model mre r2 step_days train_days train_val_time x_filename y_filename
ahead_days
1.0 0.99 0.9 1.0 112.0 linear 0.015856 0.986599 7.0 504.0 -1.0 x_base112_ahead1.pkl y_base112_ahead1.pkl
7.0 0.99 0.9 7.0 112.0 linear 0.042367 0.923348 7.0 756.0 -1.0 x_base112_ahead7.pkl y_base112_ahead7.pkl
14.0 0.99 0.9 14.0 112.0 linear 0.060167 0.865259 7.0 756.0 -1.0 x_base112_ahead14.pkl y_base112_ahead14.pkl
28.0 0.99 0.9 28.0 112.0 linear 0.091966 0.758046 7.0 756.0 -1.0 x_base112_ahead28.pkl y_base112_ahead28.pkl
56.0 0.99 0.9 56.0 112.0 linear 0.127913 0.590426 7.0 756.0 -1.0 x_base112_ahead56.pkl y_base112_ahead56.pkl

Those were the best predictors before the hyperparameters tuning


In [25]:
hyper1_df = pd.read_pickle('../../data/hyper_ahead1_random_forest_df.pkl')
hyper1_df


Out[25]:
n_estimators max_depth n_jobs scores r2 mre
0 50 5 -1 (0.983814083021, 0.0191439772135) 0.983814 0.019144
1 50 10 -1 (0.986125708821, 0.0168504555546) 0.986126 0.016850
2 100 5 -1 (0.984182203771, 0.0190912449575) 0.984182 0.019091
3 100 10 -1 (0.98606697491, 0.0168202507896) 0.986067 0.016820

In [26]:
from sklearn.metrics import r2_score

hyper_1_best_series = hyper1_df.iloc[np.argmax(hyper1_df['r2'])].copy()
hyper_1_best_series.name = 1
hyper_1_best_series


Out[26]:
n_estimators                                   50
max_depth                                      10
n_jobs                                         -1
scores          (0.986125708821, 0.0168504555546)
r2                                       0.986126
mre                                     0.0168505
Name: 1, dtype: object

In [27]:
ahead_days = [1, 7, 14, 28, 56]
best_hyper_df = pd.DataFrame()
best_hyper_df.index.name = 'ahead_days'

for ahead in ahead_days:
    hyper_df = pd.read_pickle('../../data/hyper_ahead{}_random_forest_df.pkl'.format(ahead))
    hyper_best_series = hyper_df.iloc[np.argmax(hyper_df['r2'])].copy()
    hyper_best_series.name = ahead
    best_hyper_df = best_hyper_df.append(hyper_best_series)
best_hyper_df


Out[27]:
max_depth mre n_estimators n_jobs r2 scores
ahead_days
1 10.0 0.016850 50.0 -1.0 0.986126 (0.986125708821, 0.0168504555546)
7 5.0 0.042002 100.0 -1.0 0.929017 (0.929016577116, 0.0420024748489)
14 5.0 0.059515 100.0 -1.0 0.868927 (0.868927025792, 0.0595149788733)
28 5.0 0.090310 50.0 -1.0 0.765112 (0.76511197092, 0.0903104492391)
56 5.0 0.126410 50.0 -1.0 0.615412 (0.615412026805, 0.126409538141)

Let's compare the new best Random Forest predictors with the old Linear Regressors.


In [28]:
def join_and_compare(df1, df2, column, labels):
    tj1 = pd.DataFrame(df1[column].copy())
    tj1.rename(columns = {column: labels[0]}, inplace=True)
    tj2 = pd.DataFrame(df2[column].copy())
    tj2.rename(columns = {column: labels[1]}, inplace=True)
    comp_df = tj1.join(tj2)
    comp_df['diff'] = comp_df[labels[1]] - comp_df[labels[0]]
    
    return comp_df

First the $r^2$ metrics


In [29]:
comp_r2_df = join_and_compare(best_ini_pred_df, best_hyper_df, 'r2', ['linear', 'random_forest'])
comp_r2_df['best'] = comp_r2_df.apply(lambda x: np.argmax(x), axis=1)
comp_r2_df


Out[29]:
linear random_forest diff best
ahead_days
1.0 0.986599 0.986126 -0.000474 linear
7.0 0.923348 0.929017 0.005669 random_forest
14.0 0.865259 0.868927 0.003668 random_forest
28.0 0.758046 0.765112 0.007066 random_forest
56.0 0.590426 0.615412 0.024986 random_forest

The values are very similar in both cases. A minor difference can be seen only in the case of 56 days ahead, in which the random forest seems to be a bit better than the linear predictor. In any case, as the linear predictor is much simpler, and faster, it's probably better to keep it as the best predictor. It can be seen that this scenario is different from the one before hyperparameter tuning, in which the linear predictor was always better.

And then the MRE metrics


In [30]:
comp_mre_df = join_and_compare(best_ini_pred_df, best_hyper_df, 'mre', ['linear', 'random_forest'])
comp_mre_df['best'] = comp_mre_df.apply(lambda x: np.argmax(x), axis=1)
comp_mre_df


Out[30]:
linear random_forest diff best
ahead_days
1.0 0.015856 0.016850 0.000994 random_forest
7.0 0.042367 0.042002 -0.000365 linear
14.0 0.060167 0.059515 -0.000652 linear
28.0 0.091966 0.090310 -0.001656 linear
56.0 0.127913 0.126410 -0.001504 linear

The values for the MRE metrics are almost the same for both predictors.

Conlcusion: The linear predictor will be chosen for all the predictions. In the case of the 56 ahead days prediction, a better $r^2$ metric could be achieved by the random forest predictor, but the linear predictor is still far simpler and faster.

Testing the chosen predictor

Let's get the test data


In [31]:
data_test_df = pd.read_pickle('../../data/data_test_df.pkl')
data_test_df.head()


Out[31]:
feature Close ... Volume
SPY MMM ABT ABBV ACN ATVI AYI ADBE AMD AAP ... XEL XRX XLNX XL XYL YHOO YUM ZBH ZION ZTS
date
2015-01-02 205.43 164.06 44.90 65.89 88.84 20.13 139.88 72.34 NaN 158.56 ... 2535289.0 3912022.0 2402443.0 NaN 606118.0 11924473.0 1641557.0 909491.0 2299118.0 1784851.0
2015-01-05 201.72 160.36 44.91 64.65 87.34 19.85 136.52 71.98 2.66 156.47 ... 3107187.0 7032861.0 2611059.0 NaN 1369903.0 14389308.0 3176619.0 2163761.0 5326879.0 3116681.0
2015-01-06 199.82 158.65 44.40 64.33 86.71 19.48 134.81 70.53 2.63 156.36 ... 4749648.0 7170289.0 3430462.0 NaN 1336249.0 16204304.0 3597727.0 1782098.0 9096223.0 3987015.0
2015-01-07 202.31 159.80 44.76 66.93 88.53 19.06 137.20 71.11 2.58 159.72 ... 2833770.0 4836408.0 2110610.0 NaN 1039030.0 11788031.0 3273992.0 1462026.0 2759850.0 2481935.0
2015-01-08 205.90 163.63 45.68 67.63 89.88 19.25 142.00 72.92 2.61 161.12 ... 2516764.0 6229982.0 2824232.0 NaN 821836.0 14704771.0 3061324.0 1408433.0 1831484.0 3121258.0

5 rows × 2495 columns

When generating the datasets, some symbols were removed from the training set (because they contained too many missing points). The same symbols should be removed from the test set.

Let's generate datasets for the test set, with the best parameters found.


In [33]:
best_ini_pred_df


Out[33]:
GOOD_DATA_RATIO SAMPLES_GOOD_DATA_RATIO ahead_days base_days model mre r2 step_days train_days train_val_time x_filename y_filename
ahead_days
1.0 0.99 0.9 1.0 112.0 linear 0.015856 0.986599 7.0 504.0 -1.0 x_base112_ahead1.pkl y_base112_ahead1.pkl
7.0 0.99 0.9 7.0 112.0 linear 0.042367 0.923348 7.0 756.0 -1.0 x_base112_ahead7.pkl y_base112_ahead7.pkl
14.0 0.99 0.9 14.0 112.0 linear 0.060167 0.865259 7.0 756.0 -1.0 x_base112_ahead14.pkl y_base112_ahead14.pkl
28.0 0.99 0.9 28.0 112.0 linear 0.091966 0.758046 7.0 756.0 -1.0 x_base112_ahead28.pkl y_base112_ahead28.pkl
56.0 0.99 0.9 56.0 112.0 linear 0.127913 0.590426 7.0 756.0 -1.0 x_base112_ahead56.pkl y_base112_ahead56.pkl

In [34]:
best_ini_pred_df.to_pickle('../../data/best_params_final_df.pkl')

Some playing with the data to remove the same symbols as in the training set


In [54]:
params = best_ini_pred_df.loc[1]

In [55]:
train_val_time = int(params['train_val_time'])
base_days = int(params['base_days'])
step_days = int(params['step_days'])
ahead_days = int(params['ahead_days'])
    
print('Generating: base{}_ahead{}'.format(base_days, ahead_days))
pid = 'base{}_ahead{}'.format(base_days, ahead_days)


Generating: base112_ahead1

In [56]:
y_train_df = pd.read_pickle('../../data/y_{}.pkl'.format(pid))
y_train_df.head()


Out[56]:
1993-01-29  AAPL    0.945761
            ABT     1.416988
            ADBE    0.914591
            ADM     0.947878
            ADP     0.944196
Name: 112, dtype: float64

In [57]:
kept_symbols = y_train_df.index.get_level_values(1).unique().tolist()
len(kept_symbols)


Out[57]:
285

In [58]:
len(data_test_df.columns.get_level_values(1).unique().tolist())


Out[58]:
499

In [59]:
filtered_data_test_df = data_test_df.loc[:, (slice(None), kept_symbols)]

In [60]:
len(filtered_data_test_df.columns.get_level_values(1).unique().tolist())


Out[60]:
285

OK, let's create a function to generate one test dataset


In [62]:
def generate_one_test_set(params, data_df):
    # print(('-'*70 + '\n {}, {} \n' + '-'*70).format(params['base_days'].values, params['ahead_days'].values))
    tic = time()
    
    train_val_time = int(params['train_val_time'])
    base_days = int(params['base_days'])
    step_days = int(params['step_days'])
    ahead_days = int(params['ahead_days'])
    
    print('Generating: base{}_ahead{}'.format(base_days, ahead_days))
    pid = 'base{}_ahead{}'.format(base_days, ahead_days)
    
    # Getting the data
    today = data_df.index[-1]  # Real date
    print(pid + ') data_df loaded')

    # Drop symbols with many missing points
    y_train_df = pd.read_pickle('../../data/y_{}.pkl'.format(pid))
    kept_symbols = y_train_df.index.get_level_values(1).unique().tolist()
    data_df = data_df.loc[:, (slice(None), kept_symbols)]
    print(pid + ') Irrelevant symbols dropped.')
    
    # Generate the intervals for the predictor
    x, y = fe.generate_train_intervals(data_df, 
                                       train_val_time, 
                                       base_days, 
                                       step_days,
                                       ahead_days, 
                                       today, 
                                       fe.feature_close_one_to_one)    
    print(pid + ') Intervals generated')
    
    # Drop "bad" samples and fill missing data
    x_y_df = pd.concat([x, y], axis=1)
    x_y_df = pp.drop_irrelevant_samples(x_y_df, params['SAMPLES_GOOD_DATA_RATIO'])
    x = x_y_df.iloc[:, :-1]
    y = x_y_df.iloc[:, -1]
    x = pp.fill_missing(x)
    print(pid + ') Irrelevant samples dropped and missing data filled.')
    
    # Pickle that
    x.to_pickle('../../data/x_{}_test.pkl'.format(pid))
    y.to_pickle('../../data/y_{}_test.pkl'.format(pid))
    
    toc = time()
    print('%s) %i intervals generated in: %i seconds.' % (pid, x.shape[0], (toc-tic)))
    
    return pid, x, y

In [64]:
for ind in range(best_ini_pred_df.shape[0]):
    pid, x, y = generate_one_set(best_ini_pred_df.iloc[ind,:], data_test_df)


Generating: base112_ahead1
base112_ahead1) data_df loaded
base112_ahead1) Irrelevant symbols dropped.
base112_ahead1) Intervals generated
base112_ahead1) Irrelevant samples dropped and missing data filled.
base112_ahead1) 15957 intervals generated in: 2 seconds.
Generating: base112_ahead7
base112_ahead7) data_df loaded
base112_ahead7) Irrelevant symbols dropped.
base112_ahead7) Intervals generated
base112_ahead7) Irrelevant samples dropped and missing data filled.
base112_ahead7) 15673 intervals generated in: 2 seconds.
Generating: base112_ahead14
base112_ahead14) data_df loaded
base112_ahead14) Irrelevant symbols dropped.
base112_ahead14) Intervals generated
base112_ahead14) Irrelevant samples dropped and missing data filled.
base112_ahead14) 15388 intervals generated in: 2 seconds.
Generating: base112_ahead28
base112_ahead28) data_df loaded
base112_ahead28) Irrelevant symbols dropped.
base112_ahead28) Intervals generated
base112_ahead28) Irrelevant samples dropped and missing data filled.
base112_ahead28) 14818 intervals generated in: 2 seconds.
Generating: base112_ahead56
base112_ahead56) data_df loaded
base112_ahead56) Irrelevant symbols dropped.
base112_ahead56) Intervals generated
base112_ahead56) Irrelevant samples dropped and missing data filled.
base112_ahead56) 13678 intervals generated in: 2 seconds.

In [67]:
x = pd.read_pickle('../../data/x_base112_ahead7_test.pkl')
x


Out[67]:
0 1 2 3 4 5 6 7 8 9 ... 102 103 104 105 106 107 108 109 110 111
2015-05-14 AES 1.0 0.996948 0.990166 0.975246 0.976263 0.975076 0.956087 0.933876 0.940827 0.924890 ... 0.802136 0.874364 0.874025 0.883011 0.870973 0.884368 0.882333 0.889115 0.881655 0.866904
2015-05-26 LH 1.0 0.996912 0.992405 0.984393 0.990486 0.984811 0.979553 0.999249 0.996077 1.005926 ... 0.979720 0.981305 0.968453 0.913954 0.934318 0.982641 1.007094 1.024954 1.029461 1.024370
2015-01-13 AET 1.0 1.014385 1.020850 1.027477 1.026830 1.036690 1.035720 1.042832 1.043640 1.038468 ... 0.871020 0.877808 0.871666 0.867141 0.868434 0.873768 0.886213 0.880394 0.876677 0.862130
2015-03-05 DHR 1.0 0.972875 0.954134 0.936873 0.950847 0.939832 0.948545 0.965806 0.954299 0.990301 ... 0.831991 0.812428 0.795002 0.801249 0.771823 0.794016 0.762946 0.787112 0.777577 0.787112
2015-11-02 TIF 1.0 0.997919 1.005277 1.003047 0.998216 0.998216 1.011075 1.002378 0.984391 0.983574 ... 1.052401 1.060428 1.070685 1.050840 1.078564 1.055820 1.057009 1.052921 1.058644 1.078118
2015-09-14 MCHP 1.0 1.015770 1.018873 1.003878 0.956050 0.942089 0.875905 0.826784 0.838935 0.830920 ... 0.616856 0.605481 0.639866 0.592296 0.608583 0.654085 0.664943 0.630817 0.650465 0.630300
2015-05-14 MAS 1.0 1.004813 1.000633 1.008360 1.003293 1.005446 0.997847 0.992527 0.998480 0.998987 ... 1.048132 1.051552 1.055731 1.045725 1.035592 1.036225 1.047118 1.046485 1.043065 1.050412
2015-03-05 FMC 1.0 0.976036 0.965069 0.936231 0.922015 0.939480 0.934200 0.963038 0.949228 0.965069 ... 0.756702 0.758733 0.738830 0.742486 0.746141 0.741673 0.736393 0.749797 0.726645 0.728676
2015-03-25 UNM 1.0 0.996394 1.003778 1.020350 1.006354 0.995363 1.005753 1.012021 1.008844 1.011850 ... 0.843294 0.822772 0.798643 0.780268 0.762494 0.786193 0.800876 0.800618 0.786622 0.770393
2015-03-05 TROW 1.0 0.984706 0.980882 0.964118 0.959412 0.974412 0.963529 0.972353 0.967647 0.987941 ... 1.023529 1.021765 1.019412 1.017059 1.016765 1.007059 1.006176 1.022941 1.019118 1.000588
2015-06-15 R 1.0 1.013201 1.018557 1.035202 1.031758 1.031184 1.035393 1.027549 1.034437 1.044959 ... 1.185575 1.173522 1.189593 1.183662 1.168357 1.142912 1.160895 1.158408 1.182323 1.175818
2016-06-14 BK 1.0 1.004254 1.004254 1.013370 1.025828 1.027347 1.025220 1.050744 0.973868 0.931936 ... 1.046187 1.040717 1.169553 1.150106 1.130963 1.176542 1.182315 1.169857 1.180188 1.181708
2015-05-26 WEC 1.0 1.007366 1.009881 1.005390 1.006827 1.009342 1.022458 1.008624 1.017068 1.008264 ... 0.944305 0.953647 0.954366 0.971254 0.983651 0.975386 0.972332 0.996227 0.989759 0.972691
2015-07-24 KMB 1.0 0.995829 1.009385 1.011470 1.016684 1.022941 1.027112 1.005214 1.018770 1.009385 ... 1.093326 1.059437 1.074296 1.097497 1.102972 1.106882 1.106621 1.111575 1.102972 1.090459
2015-04-24 CTL 1.0 0.997253 0.994217 0.990458 0.972676 0.978459 0.981061 0.974122 0.980049 0.981350 ... 0.906318 0.912245 0.898655 0.897788 0.899957 0.912823 0.898655 0.911811 0.917450 0.921209
2015-04-06 ORCL 1.0 0.998578 1.002642 1.003658 1.001422 0.996952 0.993700 0.995326 0.996545 0.981101 ... 0.913432 0.907539 0.883154 0.904085 0.908352 0.900427 0.926235 0.921357 0.920545 0.928876
2016-03-24 MCHP 1.0 1.025679 1.026045 1.031548 0.990462 0.984593 0.946809 0.930668 0.931768 0.907924 ... 1.025679 1.029714 1.041453 1.073734 1.043287 1.045488 1.012472 1.025312 1.117021 1.103081
2015-11-20 NSC 1.0 0.979500 0.969147 0.977860 0.977655 0.974375 0.971812 0.944547 0.954387 0.943624 ... 0.852399 0.846966 0.936142 0.936552 0.953772 0.955617 0.942804 0.923637 0.929889 0.913694
2016-02-24 SWKS 1.0 1.007463 1.022162 1.011986 1.037540 1.037766 1.037313 1.045228 1.043193 1.031886 ... 1.241067 1.231343 1.239484 1.235188 1.225011 1.221845 1.228177 1.224785 1.233605 1.224107
2016-04-05 TMO 1.0 1.006903 0.985634 0.996082 0.999813 1.021642 1.033582 1.034515 1.036567 1.041045 ... 1.194216 1.204291 1.206716 1.205970 1.214179 1.213619 1.209701 1.209888 1.200933 1.207276
2015-12-11 WMT 1.0 0.956508 0.980670 0.981959 0.956830 0.948131 0.951353 0.961501 0.971649 0.986147 ... 0.618073 0.591817 0.585696 0.570876 0.572970 0.591978 0.619523 0.618073 0.637242 0.651740
2015-03-05 LH 1.0 0.987184 0.982884 0.984101 0.983452 1.008112 1.003650 1.019387 1.016142 1.028634 ... 1.018981 1.032528 1.033014 1.030256 1.032933 1.018008 1.011843 1.015493 1.009815 1.005435
2015-04-06 LNT 1.0 0.992061 1.000541 1.015698 1.021112 1.036810 1.037532 1.051425 1.049441 1.036268 ... 1.100686 1.088777 1.059726 1.066402 1.072717 1.047997 1.068928 1.056117 1.050523 1.052508
2015-02-24 BLL 1.0 0.983608 0.979711 0.969395 0.964580 0.958161 0.960912 0.975928 0.955238 0.959652 ... 1.021492 1.028198 1.028485 1.076055 1.089752 1.105857 1.115028 1.106717 1.127063 1.124427
2015-03-16 CSCO 1.0 0.997821 0.989902 0.995801 0.996226 0.978953 0.977252 0.980122 0.983736 0.989476 ... 0.933298 0.921552 0.923625 0.924103 0.921020 0.924794 0.930215 0.908318 0.884773 0.863247
2015-01-23 KSS 1.0 1.006128 1.008014 1.000943 1.018032 0.997407 1.000000 1.026046 1.019564 1.074720 ... 1.009193 1.023807 1.032174 1.022157 1.026164 1.037831 1.002711 1.010371 1.015203 1.017796
2016-04-25 STI 1.0 1.013857 1.022624 1.014989 0.944853 0.949095 0.947681 0.922370 0.909926 0.913886 ... 1.075933 1.070701 1.085124 1.087387 1.057268 1.029695 1.045249 1.050198 1.035916 1.076640
2016-05-04 D 1.0 1.006203 1.005197 1.013998 1.044342 1.024141 1.028080 1.027578 1.036798 1.030511 ... 1.126739 1.117603 1.128080 1.122967 1.117100 1.106873 1.108215 1.090863 1.098994 1.078374
2015-12-22 PAYX 1.0 1.004924 1.008729 1.012086 1.019024 1.017234 1.011862 0.975604 0.988585 0.988362 ... 1.007833 1.024843 1.008057 1.029096 1.018800 1.078559 1.094673 1.092435 1.094002 1.103402
2016-02-03 CSCO 1.0 0.988012 0.952349 0.939687 0.959017 0.991159 1.005994 1.000824 1.033116 1.043905 ... 1.272571 1.285457 1.328688 1.329287 1.331161 1.337379 1.362329 1.364352 1.337679 1.339852
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2015-03-16 EQT 1.0 0.990920 0.977329 0.980431 0.970223 0.962328 0.953982 0.932946 0.933284 0.928209 ... 0.955335 0.949865 0.932269 0.922626 0.929450 0.930352 0.934243 0.930239 0.904861 0.879935
2015-12-22 CINF 1.0 1.026151 1.029562 1.033542 1.049460 1.040364 1.043775 1.038658 1.048323 1.035247 ... 1.244457 1.254690 1.249574 1.252416 1.257533 1.270608 1.279704 1.280841 1.284252 1.284821
2016-05-13 ESRX 1.0 0.996238 0.985978 0.974008 0.966142 0.973324 0.972640 0.986320 0.998632 0.990766 ... 0.844049 0.855335 0.849179 0.846785 0.846785 0.850205 0.847811 0.857387 0.865937 0.867305
2016-04-25 PCG 1.0 0.996652 0.997870 0.990262 0.992088 1.006391 1.009738 0.999087 1.000609 1.003043 ... 0.946439 0.944005 0.949178 0.951613 0.947961 0.950091 0.956482 0.959525 0.951004 0.958612
2015-12-02 PX 1.0 0.984692 1.005156 1.017241 1.024009 1.017080 0.986304 0.991460 0.990171 1.000000 ... 1.170641 1.181921 1.175959 1.188044 1.179826 1.175959 1.185466 1.185305 1.188205 1.199484
2015-08-24 KIM 1.0 0.975213 1.008527 1.018045 1.025580 1.011898 1.001785 1.009122 1.022011 1.000793 ... 0.905215 0.931787 0.954987 0.939124 0.963514 0.965893 0.951616 0.986516 0.973230 0.999009
2015-11-02 TXN 1.0 1.008464 0.999637 1.015599 0.990447 0.970012 0.968440 0.935792 0.928900 0.900484 ... 0.887304 0.892019 0.874123 0.867594 0.873640 0.856832 0.854051 0.856469 0.849335 0.868198
2016-02-03 CINF 1.0 0.994751 0.989501 0.967454 0.973753 0.971654 0.926509 0.943307 0.950131 0.962730 ... 1.259843 1.259843 1.262467 1.262467 1.262467 1.234646 1.254593 1.249869 1.244094 1.256693
2015-09-23 CMI 1.0 0.997886 0.993894 0.939878 0.952560 0.995773 1.009394 1.043917 1.087600 1.105683 ... 1.105683 1.124706 1.103335 1.108267 1.129638 1.144669 1.141616 1.175904 1.165336 1.164866
2016-03-24 KO 1.0 0.997854 1.005007 1.017883 1.018240 1.026109 1.006438 0.986409 1.001431 0.987124 ... 1.090129 1.091559 1.095494 1.108011 1.110873 1.119099 1.121245 1.129471 1.128040 1.124464
2015-10-22 AES 1.0 0.996197 1.019395 1.028713 1.055334 1.046967 1.049439 1.081765 1.101540 1.099639 ... 1.106104 1.102491 1.089371 1.090892 1.085948 1.107245 1.100970 1.108766 1.098498 1.106484
2015-07-24 UNM 1.0 0.995569 0.996576 1.000503 1.014299 1.010069 1.001108 0.997181 0.995569 0.996677 ... 0.945222 0.929816 0.942604 0.955292 0.969691 0.969691 0.967576 0.979458 0.975733 0.967375
2016-03-15 BA 1.0 0.995917 0.990474 0.981175 0.978453 0.973917 0.982309 0.999546 1.004309 1.007938 ... 1.058290 1.051486 1.046723 1.056475 1.049671 1.038784 1.050352 1.062826 1.064641 1.067135
2016-03-04 HES 1.0 0.977572 0.963278 0.982871 0.981392 0.991004 0.980776 0.956993 0.965619 0.967591 ... 1.067406 1.055453 1.084042 1.085274 1.087492 1.093530 1.096365 1.106470 1.108811 1.107579
2016-04-25 GPC 1.0 1.001443 1.007215 1.005195 1.001732 1.014141 1.036797 1.073304 1.063925 1.061616 ... 1.056999 1.055988 1.072727 1.097980 1.102453 1.108225 1.089755 1.103175 1.093939 1.078932
2016-06-14 UDR 1.0 1.011154 1.024362 1.023188 1.023775 1.032873 1.036983 1.033167 1.019959 1.022894 ... 1.017317 1.030525 1.011741 0.987085 0.998826 1.026710 0.991195 0.992956 0.978574 0.972997
2015-03-25 LUV 1.0 0.968561 0.974607 1.009674 0.984281 0.992140 0.981258 0.972189 0.973398 0.976421 ... 0.853688 0.848851 0.833736 0.814389 0.800484 0.805925 0.829504 0.831318 0.859129 0.827690
2015-10-13 VZ 1.0 0.987694 1.005274 1.004505 0.982529 0.979892 0.968025 1.025272 1.038347 1.010878 ... 1.134161 1.144160 1.139985 1.137457 1.138117 1.150313 1.158334 1.152730 1.153170 1.149324
2016-05-04 CAG 1.0 1.012031 1.009143 1.017709 1.021174 1.002502 1.009143 0.997786 1.006256 0.982772 ... 0.867950 0.861886 0.856497 0.838210 0.829836 0.829644 0.832435 0.831569 0.835707 0.831569
2015-08-13 SPGI 1.0 1.003642 1.009249 1.006326 0.998371 0.977715 0.947139 0.908416 0.897297 0.933001 ... 0.919774 0.920684 0.928113 0.904965 0.919822 0.900077 0.901275 0.889725 0.894709 0.913064
2016-01-04 JBHT 1.0 1.031456 1.022136 0.996505 0.979417 0.994951 0.990680 0.979029 0.981748 0.986408 ... 1.094369 1.104466 1.116893 1.121553 1.114951 1.120777 1.134757 1.137087 1.136699 1.133981
2016-05-13 BK 1.0 1.005993 1.000856 1.016267 1.004852 1.014840 1.017979 1.043664 1.069920 1.056792 ... 1.025114 1.038242 0.992580 0.991724 0.981735 0.972888 0.965183 0.976884 0.994292 0.986872
2015-01-02 ADM 1.0 0.942065 0.948992 0.973552 1.001259 1.014484 1.018262 0.994962 0.941436 0.942695 ... 0.782116 0.794710 0.794081 0.787154 0.782116 0.771411 0.767632 0.766373 0.761335 0.759446
2015-09-14 QCOM 1.0 1.031346 1.031592 1.031223 0.993977 0.998156 0.959926 0.943823 0.922065 0.923540 ... 0.684081 0.680025 0.644991 0.665519 0.691948 0.714567 0.712477 0.690227 0.719361 0.696128
2015-06-15 PCG 1.0 1.004349 1.010149 1.028271 1.017760 1.028634 1.031171 1.023559 1.021022 1.026459 ... 1.160203 1.140631 1.139906 1.144618 1.125045 1.113447 1.121058 1.114534 1.138094 1.138094
2015-08-13 HRS 1.0 1.005307 1.007202 1.006539 0.995451 0.972707 0.937074 0.902957 0.890163 0.927123 ... 0.932809 0.932714 0.939822 0.920205 0.935747 0.921721 0.922384 0.912055 0.915277 0.927028
2015-05-14 AVY 1.0 0.988512 1.001532 1.015573 1.010722 1.009957 1.006127 1.010722 1.018892 1.018381 ... 0.942303 0.932857 0.937197 0.924687 0.912688 0.941026 0.931580 0.925453 0.938218 0.926219
2015-03-05 VNO 1.0 0.995117 1.000158 0.977788 0.965974 0.971251 0.976686 1.006065 1.017250 1.022290 ... 1.066950 1.063327 1.079001 1.105072 1.112319 1.098299 1.084909 1.099795 1.070180 1.084121
2015-06-15 FITB 1.0 1.007701 1.006093 1.025812 1.013879 1.028182 1.034276 1.032583 1.017519 1.011341 ... 0.916892 0.914946 0.921209 0.907160 0.882109 0.868060 0.878555 0.884648 0.915115 0.901405
2016-01-13 BF.B 1.0 1.022664 0.983248 0.969453 0.965840 0.980292 1.001478 0.951059 0.971588 0.957793 ... 1.267203 1.248645 1.235507 1.229430 1.232879 1.231565 1.247003 1.255214 1.256200 1.255214

15673 rows × 112 columns


In [70]:
x.iloc[10].plot()


Out[70]:
<matplotlib.axes._subplots.AxesSubplot at 0x7faf6199d7b8>

The datasets were successfully generated

There will be two types of test. One is retraining at every step, and the other is without retraining. The first one is good enough to confirm that there was no overfitting on the hyperparameters. The second can show how "valid" is the model for periods outside the training period. If there is no time dependence in the results, the test without retraining may be the only one perfomed.


In [2]:
best_params_df = pd.read_pickle('../../data/best_params_final_df.pkl')
best_params_df


Out[2]:
GOOD_DATA_RATIO SAMPLES_GOOD_DATA_RATIO ahead_days base_days model mre r2 step_days train_days train_val_time x_filename y_filename
ahead_days
1.0 0.99 0.9 1.0 112.0 linear 0.015856 0.986599 7.0 504.0 -1.0 x_base112_ahead1.pkl y_base112_ahead1.pkl
7.0 0.99 0.9 7.0 112.0 linear 0.042367 0.923348 7.0 756.0 -1.0 x_base112_ahead7.pkl y_base112_ahead7.pkl
14.0 0.99 0.9 14.0 112.0 linear 0.060167 0.865259 7.0 756.0 -1.0 x_base112_ahead14.pkl y_base112_ahead14.pkl
28.0 0.99 0.9 28.0 112.0 linear 0.091966 0.758046 7.0 756.0 -1.0 x_base112_ahead28.pkl y_base112_ahead28.pkl
56.0 0.99 0.9 56.0 112.0 linear 0.127913 0.590426 7.0 756.0 -1.0 x_base112_ahead56.pkl y_base112_ahead56.pkl

Ahead 1

Without retraining

Warning: The dates that appear on the samples are the initial dates (there are 112 days ahead after the marked date).


In [59]:
from predictor.linear_predictor import LinearPredictor
import utils.misc as misc
import predictor.evaluation as ev

ahead_days = 1

# Get some parameters
train_days = int(best_params_df.loc[ahead_days, 'train_days'])
GOOD_DATA_RATIO, \
train_val_time, \
base_days, \
step_days, \
ahead_days, \
SAMPLES_GOOD_DATA_RATIO, \
x_filename, \
y_filename = misc.unpack_params(best_params_df.loc[ahead_days,:])

pid = 'base{}_ahead{}'.format(base_days, ahead_days)

# Get the datasets
x_train = pd.read_pickle('../../data/x_{}.pkl'.format(pid))
y_train = pd.read_pickle('../../data/y_{}.pkl'.format(pid))
x_test = pd.read_pickle('../../data/x_{}_test.pkl'.format(pid)).sort_index()
y_test = pd.DataFrame(pd.read_pickle('../../data/y_{}_test.pkl'.format(pid))).sort_index()

# Let's cut the training set to use only the required number of samples
end_date = x_train.index.levels[0][-1]
start_date = fe.add_market_days(end_date, -train_days)
x_sub_df = x_train.loc[(slice(start_date,None),slice(None)),:]
y_sub_df = pd.DataFrame(y_train.loc[(slice(start_date,None),slice(None))])

# Create the estimator and train
estimator = LinearPredictor()
estimator.fit(x_sub_df, y_sub_df)

# Get the training and test predictions
y_train_pred = estimator.predict(x_sub_df)
y_test_pred = estimator.predict(x_test)

# Get the training and test metrics for each symbol
metrics_train = ev.get_metrics_df(y_sub_df, y_train_pred)
metrics_test = ev.get_metrics_df(y_test, y_test_pred)

# Show the mean metrics
metrics_df = pd.DataFrame(columns=['train', 'test'])
metrics_df['train'] = metrics_train.mean()
metrics_df['test'] = metrics_test.mean()
print('Mean metrics: \n{}\n{}'.format(metrics_df,'-'*70))

# Plot the metrics in time
metrics_train_time = ev.get_metrics_in_time(y_sub_df, y_train_pred, base_days + ahead_days)
metrics_test_time = ev.get_metrics_in_time(y_test, y_test_pred, base_days + ahead_days)
plt.plot(metrics_train_time[2], metrics_train_time[0], label='train', marker='.')
plt.plot(metrics_test_time[2], metrics_test_time[0], label='test', marker='.')
plt.title('$r^2$ metrics')
plt.legend()
plt.figure()
plt.plot(metrics_train_time[2], metrics_train_time[1], label='train', marker='.')
plt.plot(metrics_test_time[2], metrics_test_time[1], label='test', marker='.')
plt.title('MRE metrics')
plt.legend()


Mean metrics: 
        train      test
r2   0.983486  0.976241
mre  0.008762  0.013906
----------------------------------------------------------------------
Out[59]:
<matplotlib.legend.Legend at 0x7f10dfa9a9e8>

Ahead 7


In [60]:
from predictor.linear_predictor import LinearPredictor
import utils.misc as misc
import predictor.evaluation as ev

ahead_days = 7

# Get some parameters
train_days = int(best_params_df.loc[ahead_days, 'train_days'])
GOOD_DATA_RATIO, \
train_val_time, \
base_days, \
step_days, \
ahead_days, \
SAMPLES_GOOD_DATA_RATIO, \
x_filename, \
y_filename = misc.unpack_params(best_params_df.loc[ahead_days,:])

pid = 'base{}_ahead{}'.format(base_days, ahead_days)

# Get the datasets
x_train = pd.read_pickle('../../data/x_{}.pkl'.format(pid))
y_train = pd.read_pickle('../../data/y_{}.pkl'.format(pid))
x_test = pd.read_pickle('../../data/x_{}_test.pkl'.format(pid)).sort_index()
y_test = pd.DataFrame(pd.read_pickle('../../data/y_{}_test.pkl'.format(pid))).sort_index()

# Let's cut the training set to use only the required number of samples
end_date = x_train.index.levels[0][-1]
start_date = fe.add_market_days(end_date, -train_days)
x_sub_df = x_train.loc[(slice(start_date,None),slice(None)),:]
y_sub_df = pd.DataFrame(y_train.loc[(slice(start_date,None),slice(None))])

# Create the estimator and train
estimator = LinearPredictor()
estimator.fit(x_sub_df, y_sub_df)

# Get the training and test predictions
y_train_pred = estimator.predict(x_sub_df)
y_test_pred = estimator.predict(x_test)

# Get the training and test metrics for each symbol
metrics_train = ev.get_metrics_df(y_sub_df, y_train_pred)
metrics_test = ev.get_metrics_df(y_test, y_test_pred)

# Show the mean metrics
metrics_df = pd.DataFrame(columns=['train', 'test'])
metrics_df['train'] = metrics_train.mean()
metrics_df['test'] = metrics_test.mean()
print('Mean metrics: \n{}\n{}'.format(metrics_df,'-'*70))

# Plot the metrics in time
metrics_train_time = ev.get_metrics_in_time(y_sub_df, y_train_pred, base_days + ahead_days)
metrics_test_time = ev.get_metrics_in_time(y_test, y_test_pred, base_days + ahead_days)
plt.plot(metrics_train_time[2], metrics_train_time[0], label='train', marker='.')
plt.plot(metrics_test_time[2], metrics_test_time[0], label='test', marker='.')
plt.title('$r^2$ metrics')
plt.legend()
plt.figure()
plt.plot(metrics_train_time[2], metrics_train_time[1], label='train', marker='.')
plt.plot(metrics_test_time[2], metrics_test_time[1], label='test', marker='.')
plt.title('MRE metrics')
plt.legend()


Mean metrics: 
        train      test
r2   0.906177  0.874892
mre  0.026232  0.034764
----------------------------------------------------------------------
Out[60]:
<matplotlib.legend.Legend at 0x7f10df101160>

Ahead 14


In [61]:
from predictor.linear_predictor import LinearPredictor
import utils.misc as misc
import predictor.evaluation as ev

ahead_days = 14

# Get some parameters
train_days = int(best_params_df.loc[ahead_days, 'train_days'])
GOOD_DATA_RATIO, \
train_val_time, \
base_days, \
step_days, \
ahead_days, \
SAMPLES_GOOD_DATA_RATIO, \
x_filename, \
y_filename = misc.unpack_params(best_params_df.loc[ahead_days,:])

pid = 'base{}_ahead{}'.format(base_days, ahead_days)

# Get the datasets
x_train = pd.read_pickle('../../data/x_{}.pkl'.format(pid))
y_train = pd.read_pickle('../../data/y_{}.pkl'.format(pid))
x_test = pd.read_pickle('../../data/x_{}_test.pkl'.format(pid)).sort_index()
y_test = pd.DataFrame(pd.read_pickle('../../data/y_{}_test.pkl'.format(pid))).sort_index()

# Let's cut the training set to use only the required number of samples
end_date = x_train.index.levels[0][-1]
start_date = fe.add_market_days(end_date, -train_days)
x_sub_df = x_train.loc[(slice(start_date,None),slice(None)),:]
y_sub_df = pd.DataFrame(y_train.loc[(slice(start_date,None),slice(None))])

# Create the estimator and train
estimator = LinearPredictor()
estimator.fit(x_sub_df, y_sub_df)

# Get the training and test predictions
y_train_pred = estimator.predict(x_sub_df)
y_test_pred = estimator.predict(x_test)

# Get the training and test metrics for each symbol
metrics_train = ev.get_metrics_df(y_sub_df, y_train_pred)
metrics_test = ev.get_metrics_df(y_test, y_test_pred)

# Show the mean metrics
metrics_df = pd.DataFrame(columns=['train', 'test'])
metrics_df['train'] = metrics_train.mean()
metrics_df['test'] = metrics_test.mean()
print('Mean metrics: \n{}\n{}'.format(metrics_df,'-'*70))

# Plot the metrics in time
metrics_train_time = ev.get_metrics_in_time(y_sub_df, y_train_pred, base_days + ahead_days)
metrics_test_time = ev.get_metrics_in_time(y_test, y_test_pred, base_days + ahead_days)
plt.plot(metrics_train_time[2], metrics_train_time[0], label='train', marker='.')
plt.plot(metrics_test_time[2], metrics_test_time[0], label='test', marker='.')
plt.title('$r^2$ metrics')
plt.legend()
plt.figure()
plt.plot(metrics_train_time[2], metrics_train_time[1], label='train', marker='.')
plt.plot(metrics_test_time[2], metrics_test_time[1], label='test', marker='.')
plt.title('MRE metrics')
plt.legend()


Mean metrics: 
        train      test
r2   0.826779  0.758697
mre  0.037349  0.051755
----------------------------------------------------------------------
Out[61]:
<matplotlib.legend.Legend at 0x7f10df0b1a90>

Ahead 28


In [62]:
from predictor.linear_predictor import LinearPredictor
import utils.misc as misc
import predictor.evaluation as ev

ahead_days = 28

# Get some parameters
train_days = int(best_params_df.loc[ahead_days, 'train_days'])
GOOD_DATA_RATIO, \
train_val_time, \
base_days, \
step_days, \
ahead_days, \
SAMPLES_GOOD_DATA_RATIO, \
x_filename, \
y_filename = misc.unpack_params(best_params_df.loc[ahead_days,:])

pid = 'base{}_ahead{}'.format(base_days, ahead_days)

# Get the datasets
x_train = pd.read_pickle('../../data/x_{}.pkl'.format(pid))
y_train = pd.read_pickle('../../data/y_{}.pkl'.format(pid))
x_test = pd.read_pickle('../../data/x_{}_test.pkl'.format(pid)).sort_index()
y_test = pd.DataFrame(pd.read_pickle('../../data/y_{}_test.pkl'.format(pid))).sort_index()

# Let's cut the training set to use only the required number of samples
end_date = x_train.index.levels[0][-1]
start_date = fe.add_market_days(end_date, -train_days)
x_sub_df = x_train.loc[(slice(start_date,None),slice(None)),:]
y_sub_df = pd.DataFrame(y_train.loc[(slice(start_date,None),slice(None))])

# Create the estimator and train
estimator = LinearPredictor()
estimator.fit(x_sub_df, y_sub_df)

# Get the training and test predictions
y_train_pred = estimator.predict(x_sub_df)
y_test_pred = estimator.predict(x_test)

# Get the training and test metrics for each symbol
metrics_train = ev.get_metrics_df(y_sub_df, y_train_pred)
metrics_test = ev.get_metrics_df(y_test, y_test_pred)

# Show the mean metrics
metrics_df = pd.DataFrame(columns=['train', 'test'])
metrics_df['train'] = metrics_train.mean()
metrics_df['test'] = metrics_test.mean()
print('Mean metrics: \n{}\n{}'.format(metrics_df,'-'*70))

# Plot the metrics in time
metrics_train_time = ev.get_metrics_in_time(y_sub_df, y_train_pred, base_days + ahead_days)
metrics_test_time = ev.get_metrics_in_time(y_test, y_test_pred, base_days + ahead_days)
plt.plot(metrics_train_time[2], metrics_train_time[0], label='train', marker='.')
plt.plot(metrics_test_time[2], metrics_test_time[0], label='test', marker='.')
plt.title('$r^2$ metrics')
plt.legend()
plt.figure()
plt.plot(metrics_train_time[2], metrics_train_time[1], label='train', marker='.')
plt.plot(metrics_test_time[2], metrics_test_time[1], label='test', marker='.')
plt.title('MRE metrics')
plt.legend()


Mean metrics: 
        train      test
r2   0.696077  0.515802
mre  0.052396  0.078545
----------------------------------------------------------------------
Out[62]:
<matplotlib.legend.Legend at 0x7f10df4ce390>

Ahead 56


In [63]:
from predictor.linear_predictor import LinearPredictor
import utils.misc as misc
import predictor.evaluation as ev

ahead_days = 56

# Get some parameters
train_days = int(best_params_df.loc[ahead_days, 'train_days'])
GOOD_DATA_RATIO, \
train_val_time, \
base_days, \
step_days, \
ahead_days, \
SAMPLES_GOOD_DATA_RATIO, \
x_filename, \
y_filename = misc.unpack_params(best_params_df.loc[ahead_days,:])

pid = 'base{}_ahead{}'.format(base_days, ahead_days)

# Get the datasets
x_train = pd.read_pickle('../../data/x_{}.pkl'.format(pid))
y_train = pd.read_pickle('../../data/y_{}.pkl'.format(pid))
x_test = pd.read_pickle('../../data/x_{}_test.pkl'.format(pid)).sort_index()
y_test = pd.DataFrame(pd.read_pickle('../../data/y_{}_test.pkl'.format(pid))).sort_index()

# Let's cut the training set to use only the required number of samples
end_date = x_train.index.levels[0][-1]
start_date = fe.add_market_days(end_date, -train_days)
x_sub_df = x_train.loc[(slice(start_date,None),slice(None)),:]
y_sub_df = pd.DataFrame(y_train.loc[(slice(start_date,None),slice(None))])

# Create the estimator and train
estimator = LinearPredictor()
estimator.fit(x_sub_df, y_sub_df)

# Get the training and test predictions
y_train_pred = estimator.predict(x_sub_df)
y_test_pred = estimator.predict(x_test)

# Get the training and test metrics for each symbol
metrics_train = ev.get_metrics_df(y_sub_df, y_train_pred)
metrics_test = ev.get_metrics_df(y_test, y_test_pred)

# Show the mean metrics
metrics_df = pd.DataFrame(columns=['train', 'test'])
metrics_df['train'] = metrics_train.mean()
metrics_df['test'] = metrics_test.mean()
print('Mean metrics: \n{}\n{}'.format(metrics_df,'-'*70))

# Plot the metrics in time
metrics_train_time = ev.get_metrics_in_time(y_sub_df, y_train_pred, base_days + ahead_days)
metrics_test_time = ev.get_metrics_in_time(y_test, y_test_pred, base_days + ahead_days)
plt.plot(metrics_train_time[2], metrics_train_time[0], label='train', marker='.')
plt.plot(metrics_test_time[2], metrics_test_time[0], label='test', marker='.')
plt.title('$r^2$ metrics')
plt.legend()
plt.figure()
plt.plot(metrics_train_time[2], metrics_train_time[1], label='train', marker='.')
plt.plot(metrics_test_time[2], metrics_test_time[1], label='test', marker='.')
plt.title('MRE metrics')
plt.legend()


Mean metrics: 
        train      test
r2   0.494079  0.152134
mre  0.073589  0.108190
----------------------------------------------------------------------
Out[63]:
<matplotlib.legend.Legend at 0x7f10def4ea20>

Conclusion: The metrics are a bit worse for the test set, as expected, but it seems to be reasonably close. There is no clear tendency in time, so a "time validity" for the model couldn't be found within the studied periods.

NOTE: A gap can be seen in the dates between the training and test sets. That is due to using only test samples that have test features (x) and labels (y). It would be acceptable to use features in the training set to predict labels in the test set. In this case, it was considered that, since the studied case is even more restrictive than the other one, no training features with labels in the test set were used.


In [ ]: