In [1]:
# Basic imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt
import scipy.optimize as spo
import sys
from time import time
from sklearn.metrics import r2_score, median_absolute_error
%matplotlib inline
%pylab inline
pylab.rcParams['figure.figsize'] = (20.0, 10.0)
%load_ext autoreload
%autoreload 2
sys.path.append('../../')
import predictor.feature_extraction as fe
import utils.preprocessing as pp
import utils.misc as misc
In [24]:
# Load the pickled table of dataset parameters and scores obtained before
# hyperparameter tuning; it is grouped by 'ahead_days' and ranked by 'r2' below.
best_raw_params = pd.read_pickle('../../data/best_dataset_params_raw_df.pkl')
def keep_max_r2(record):
    """Return the row of ``record`` that has the highest ``'r2'`` value.

    Parameters
    ----------
    record : pandas.DataFrame
        Frame with an ``'r2'`` column (for example one ``groupby`` group).

    Returns
    -------
    pandas.Series
        The row whose ``'r2'`` is maximal. If the index contains duplicated
        labels, ``.loc`` returns every row that shares the winning label.
    """
    # idxmax() yields the index *label* of the maximum, which is what .loc
    # expects. np.argmax on a Series yields a *position* in current pandas, so
    # feeding it to .loc picks the wrong row (or raises KeyError) whenever the
    # index is not 0..n-1 -- which is the case for groupby sub-frames.
    return record.loc[record['r2'].idxmax(), :]
# For every 'ahead_days' group keep only the row chosen by keep_max_r2,
# giving one row of parameters per forecast horizon.
best_ini_pred_df = best_raw_params.groupby('ahead_days').apply(keep_max_r2)
# Display the resulting table.
best_ini_pred_df
Out[24]:
These were the best predictors before hyperparameter tuning.
In [25]:
# Load the pickled hyperparameter results for the random forest at 1 day ahead
# (as indicated by the file name) and display them.
hyper1_df = pd.read_pickle('../../data/hyper_ahead1_random_forest_df.pkl')
hyper1_df
Out[25]:
In [26]:
# Select the hyperparameter combination with the highest r2 for 1 day ahead.
# idxmax() + .loc is an unambiguous label-based lookup; np.argmax on a Series
# returns a label or a position depending on the pandas version, so pairing it
# with .iloc is fragile. (r2_score is already imported in the first cell, so
# the redundant import that used to live here is gone.)
hyper_1_best_series = hyper1_df.loc[hyper1_df['r2'].idxmax()].copy()
# Name the row after its horizon so it can be used as an index label later.
hyper_1_best_series.name = 1
hyper_1_best_series
Out[26]:
In [27]:
# Forecast horizons (in days) for which hyperparameter results were pickled.
ahead_days = [1, 7, 14, 28, 56]

# Collect the best row of every horizon first and build the frame once at the
# end: DataFrame.append was removed in pandas 2.0 and re-copied the whole frame
# on every iteration.
best_rows = []
for ahead in ahead_days:
    hyper_df = pd.read_pickle('../../data/hyper_ahead{}_random_forest_df.pkl'.format(ahead))
    # Label-based selection of the row with the highest r2 (idxmax returns a label).
    hyper_best_series = hyper_df.loc[hyper_df['r2'].idxmax()].copy()
    # The Series name becomes the row label of the final frame.
    hyper_best_series.name = ahead
    best_rows.append(hyper_best_series)

# One row per horizon, indexed by the horizon itself.
best_hyper_df = pd.DataFrame(best_rows)
best_hyper_df.index.name = 'ahead_days'
best_hyper_df
Out[27]:
In [28]:
def join_and_compare(df1, df2, column, labels):
    """Put the same column of two frames side by side and add their difference.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames that both contain ``column``; they are joined on their index,
        keeping the index of ``df1``.
    column : str
        Name of the column to compare.
    labels : sequence of two str
        Column names given to the values of ``df1`` and ``df2`` respectively.

    Returns
    -------
    pandas.DataFrame
        Columns ``labels[0]``, ``labels[1]`` and ``'diff'``, where ``'diff'``
        is ``labels[1] - labels[0]``.
    """
    first_label, second_label = labels[0], labels[1]
    left = df1[column].to_frame(name=first_label)
    right = df2[column].to_frame(name=second_label)
    comp_df = left.join(right)
    comp_df['diff'] = comp_df[second_label] - comp_df[first_label]
    return comp_df
In [29]:
# Compare the r2 of the linear predictor against the tuned random forest.
comp_r2_df = join_and_compare(best_ini_pred_df, best_hyper_df, 'r2', ['linear', 'random_forest'])
# Higher r2 is better. Only the two predictor columns take part in the
# comparison ('diff' is excluded), and idxmax returns the winning column *name*
# regardless of the pandas version (a row-wise np.argmax returns a position in
# current pandas and would also consider 'diff').
comp_r2_df['best'] = comp_r2_df[['linear', 'random_forest']].idxmax(axis=1)
comp_r2_df
Out[29]:
The values are very similar in both cases. A minor difference can be seen only in the case of 56 days ahead, in which the random forest seems to be a bit better than the linear predictor. In any case, as the linear predictor is much simpler, and faster, it's probably better to keep it as the best predictor. It can be seen that this scenario is different from the one before hyperparameter tuning, in which the linear predictor was always better.
In [30]:
# Compare the MRE of the linear predictor against the tuned random forest.
comp_mre_df = join_and_compare(best_ini_pred_df, best_hyper_df, 'mre', ['linear', 'random_forest'])
# MRE is an error metric, so the best predictor is the one with the *lowest*
# value: use idxmin (argmax would flag the worst predictor as "best"). Only the
# two predictor columns are compared; 'diff' is excluded.
comp_mre_df['best'] = comp_mre_df[['linear', 'random_forest']].idxmin(axis=1)
comp_mre_df
Out[30]:
The values for the MRE metrics are almost the same for both predictors.
In [31]:
# Load the pickled test data and preview its first rows.
data_test_df = pd.read_pickle('../../data/data_test_df.pkl')
data_test_df.head()
Out[31]:
When generating the datasets, some symbols were removed from the training set (because they contained too many missing points). The same symbols should be removed from the test set.
In [33]:
# Display the selected parameters once more; the next cell pickles this table.
best_ini_pred_df
Out[33]:
In [34]:
# Persist the selected parameters; this file is read back further down as best_params_df.
best_ini_pred_df.to_pickle('../../data/best_params_final_df.pkl')
Some exploration of the data, to work out how to keep in the test set only the symbols that were kept in the training set.
In [54]:
# Parameter row for index label 1 (the 1-day-ahead horizon), used in the exploratory cells below.
params = best_ini_pred_df.loc[1]
In [55]:
# Cast the numeric parameters of the selected row to int.
train_val_time = int(params['train_val_time'])
base_days = int(params['base_days'])
step_days = int(params['step_days'])
# NOTE: this rebinds the notebook-level name ahead_days, which was a list of
# horizons in an earlier cell, to a single int.
ahead_days = int(params['ahead_days'])
print('Generating: base{}_ahead{}'.format(base_days, ahead_days))
# Dataset identifier used in the pickle file names (x_<pid>.pkl / y_<pid>.pkl).
pid = 'base{}_ahead{}'.format(base_days, ahead_days)
In [56]:
# Load the training targets of this dataset id and preview the first rows.
y_train_df = pd.read_pickle('../../data/y_{}.pkl'.format(pid))
y_train_df.head()
Out[56]:
In [57]:
# Unique values of index level 1 of the training targets; per the notebook
# narrative these are the symbols that were kept in the training set.
kept_symbols = y_train_df.index.get_level_values(1).unique().tolist()
# How many symbols were kept.
len(kept_symbols)
Out[57]:
In [58]:
# Number of distinct values on column level 1 of the unfiltered test data, for comparison.
len(data_test_df.columns.get_level_values(1).unique().tolist())
Out[58]:
In [59]:
# Keep every value of column level 0 but only the kept symbols on column level 1.
filtered_data_test_df = data_test_df.loc[:, (slice(None), kept_symbols)]
In [60]:
# Check: number of distinct values on column level 1 after filtering.
len(filtered_data_test_df.columns.get_level_values(1).unique().tolist())
Out[60]:
In [62]:
def generate_one_test_set(params, data_df):
    """Build and pickle the test samples for one set of dataset parameters.

    Parameters
    ----------
    params : mapping (e.g. a pandas.Series row)
        Must provide 'train_val_time', 'base_days', 'step_days', 'ahead_days'
        and 'SAMPLES_GOOD_DATA_RATIO'.
    data_df : pandas.DataFrame
        Test data; its columns are filtered on level 1 by symbol.

    Returns
    -------
    tuple
        ``(pid, x, y)``: the dataset identifier, the feature frame and the
        target column (the last column of the concatenated samples).

    Side effects
    ------------
    Reads '../../data/y_<pid>.pkl' to learn which symbols the training set
    kept, and writes '../../data/x_<pid>_test.pkl' and
    '../../data/y_<pid>_test.pkl'. Progress is printed along the way.
    """
    start_time = time()

    # The numeric parameters are cast to int before use.
    train_val_time = int(params['train_val_time'])
    base_days = int(params['base_days'])
    step_days = int(params['step_days'])
    ahead_days = int(params['ahead_days'])
    print('Generating: base{}_ahead{}'.format(base_days, ahead_days))
    pid = 'base{}_ahead{}'.format(base_days, ahead_days)

    # The last index value of the incoming data is taken as "today".
    today = data_df.index[-1]
    print(pid + ') data_df loaded')

    # Keep only the symbols that are present in the training targets.
    train_targets = pd.read_pickle('../../data/y_{}.pkl'.format(pid))
    train_symbols = train_targets.index.get_level_values(1).unique().tolist()
    symbol_data_df = data_df.loc[:, (slice(None), train_symbols)]
    print(pid + ') Irrelevant symbols dropped.')

    # Build the samples (features and targets) with the project's interval generator.
    features, targets = fe.generate_train_intervals(
        symbol_data_df,
        train_val_time,
        base_days,
        step_days,
        ahead_days,
        today,
        fe.feature_close_one_to_one,
    )
    print(pid + ') Intervals generated')

    # Filter features and targets together so they stay aligned, then split
    # them again: the target is the last column.
    samples_df = pd.concat([features, targets], axis=1)
    samples_df = pp.drop_irrelevant_samples(samples_df, params['SAMPLES_GOOD_DATA_RATIO'])
    features = pp.fill_missing(samples_df.iloc[:, :-1])
    targets = samples_df.iloc[:, -1]
    print(pid + ') Irrelevant samples dropped and missing data filled.')

    # Persist the test set next to the training files.
    features.to_pickle('../../data/x_{}_test.pkl'.format(pid))
    targets.to_pickle('../../data/y_{}_test.pkl'.format(pid))

    elapsed = time() - start_time
    print('%s) %i intervals generated in: %i seconds.' % (pid, features.shape[0], elapsed))
    return pid, features, targets
In [64]:
# Generate and pickle the test set for the best parameter row of every horizon.
# The helper defined in this notebook is generate_one_test_set; the previously
# called name generate_one_set is not defined here and raises NameError on a
# fresh kernel.
for ind in range(best_ini_pred_df.shape[0]):
    pid, x, y = generate_one_test_set(best_ini_pred_df.iloc[ind, :], data_test_df)
In [67]:
# Spot-check one generated file (base 112 / ahead 7, per the file name).
# NOTE: this rebinds x and displays the whole frame.
x = pd.read_pickle('../../data/x_base112_ahead7_test.pkl')
x
Out[67]:
In [70]:
# Plot the sample at row position 10 as a quick visual check.
x.iloc[10].plot()
Out[70]:
The datasets were successfully generated
In [2]:
# Reload the parameters pickled earlier in this notebook and display them.
best_params_df = pd.read_pickle('../../data/best_params_final_df.pkl')
best_params_df
Out[2]:
Warning: the date shown on each sample is the initial date of that sample's interval, not its final date (the sample's data extends 112 days beyond the marked date).
In [59]:
# Train the linear predictor and compare train vs. test metrics for ahead_days = 1.
from predictor.linear_predictor import LinearPredictor
import utils.misc as misc
import predictor.evaluation as ev
# Horizon evaluated in this cell; used below as the row label in best_params_df.
ahead_days = 1
# Get some parameters
# 'train_days' is read directly from the row of this horizon.
train_days = int(best_params_df.loc[ahead_days, 'train_days'])
# Unpack the remaining parameters from the same row. Note that this rebinds
# ahead_days to whatever misc.unpack_params returns in that position.
GOOD_DATA_RATIO, \
train_val_time, \
base_days, \
step_days, \
ahead_days, \
SAMPLES_GOOD_DATA_RATIO, \
x_filename, \
y_filename = misc.unpack_params(best_params_df.loc[ahead_days,:])
# Dataset identifier used in the pickle file names.
pid = 'base{}_ahead{}'.format(base_days, ahead_days)
# Get the datasets
x_train = pd.read_pickle('../../data/x_{}.pkl'.format(pid))
y_train = pd.read_pickle('../../data/y_{}.pkl'.format(pid))
# The test sets are sorted by index; the test targets are wrapped in a DataFrame.
x_test = pd.read_pickle('../../data/x_{}_test.pkl'.format(pid)).sort_index()
y_test = pd.DataFrame(pd.read_pickle('../../data/y_{}_test.pkl'.format(pid))).sort_index()
# Let's cut the training set to use only the required number of samples
# end_date is the last value of the first index level of x_train.
end_date = x_train.index.levels[0][-1]
# Presumably steps back train_days market days from end_date -- confirm in fe.add_market_days.
start_date = fe.add_market_days(end_date, -train_days)
# Keep the rows whose first index level is at or after start_date (any value on the second level).
x_sub_df = x_train.loc[(slice(start_date,None),slice(None)),:]
y_sub_df = pd.DataFrame(y_train.loc[(slice(start_date,None),slice(None))])
# Create the estimator and train
estimator = LinearPredictor()
estimator.fit(x_sub_df, y_sub_df)
# Get the training and test predictions
y_train_pred = estimator.predict(x_sub_df)
y_test_pred = estimator.predict(x_test)
# Get the training and test metrics for each symbol
metrics_train = ev.get_metrics_df(y_sub_df, y_train_pred)
metrics_test = ev.get_metrics_df(y_test, y_test_pred)
# Show the mean metrics
# Column-wise mean of each metrics frame, side by side for train and test.
metrics_df = pd.DataFrame(columns=['train', 'test'])
metrics_df['train'] = metrics_train.mean()
metrics_df['test'] = metrics_test.mean()
print('Mean metrics: \n{}\n{}'.format(metrics_df,'-'*70))
# Plot the metrics in time
# NOTE(review): the third argument (base_days + ahead_days) looks like the span
# of one sample in days -- confirm in ev.get_metrics_in_time.
metrics_train_time = ev.get_metrics_in_time(y_sub_df, y_train_pred, base_days + ahead_days)
metrics_test_time = ev.get_metrics_in_time(y_test, y_test_pred, base_days + ahead_days)
# First figure: element [0] of each result plotted against element [2].
plt.plot(metrics_train_time[2], metrics_train_time[0], label='train', marker='.')
plt.plot(metrics_test_time[2], metrics_test_time[0], label='test', marker='.')
plt.title('$r^2$ metrics')
plt.legend()
# Second figure: element [1] of each result plotted against element [2].
plt.figure()
plt.plot(metrics_train_time[2], metrics_train_time[1], label='train', marker='.')
plt.plot(metrics_test_time[2], metrics_test_time[1], label='test', marker='.')
plt.title('MRE metrics')
plt.legend()
Out[59]:
In [60]:
# Train the linear predictor and compare train vs. test metrics for ahead_days = 7.
from predictor.linear_predictor import LinearPredictor
import utils.misc as misc
import predictor.evaluation as ev
# Horizon evaluated in this cell; used below as the row label in best_params_df.
ahead_days = 7
# Get some parameters
# 'train_days' is read directly from the row of this horizon.
train_days = int(best_params_df.loc[ahead_days, 'train_days'])
# Unpack the remaining parameters from the same row. Note that this rebinds
# ahead_days to whatever misc.unpack_params returns in that position.
GOOD_DATA_RATIO, \
train_val_time, \
base_days, \
step_days, \
ahead_days, \
SAMPLES_GOOD_DATA_RATIO, \
x_filename, \
y_filename = misc.unpack_params(best_params_df.loc[ahead_days,:])
# Dataset identifier used in the pickle file names.
pid = 'base{}_ahead{}'.format(base_days, ahead_days)
# Get the datasets
x_train = pd.read_pickle('../../data/x_{}.pkl'.format(pid))
y_train = pd.read_pickle('../../data/y_{}.pkl'.format(pid))
# The test sets are sorted by index; the test targets are wrapped in a DataFrame.
x_test = pd.read_pickle('../../data/x_{}_test.pkl'.format(pid)).sort_index()
y_test = pd.DataFrame(pd.read_pickle('../../data/y_{}_test.pkl'.format(pid))).sort_index()
# Let's cut the training set to use only the required number of samples
# end_date is the last value of the first index level of x_train.
end_date = x_train.index.levels[0][-1]
# Presumably steps back train_days market days from end_date -- confirm in fe.add_market_days.
start_date = fe.add_market_days(end_date, -train_days)
# Keep the rows whose first index level is at or after start_date (any value on the second level).
x_sub_df = x_train.loc[(slice(start_date,None),slice(None)),:]
y_sub_df = pd.DataFrame(y_train.loc[(slice(start_date,None),slice(None))])
# Create the estimator and train
estimator = LinearPredictor()
estimator.fit(x_sub_df, y_sub_df)
# Get the training and test predictions
y_train_pred = estimator.predict(x_sub_df)
y_test_pred = estimator.predict(x_test)
# Get the training and test metrics for each symbol
metrics_train = ev.get_metrics_df(y_sub_df, y_train_pred)
metrics_test = ev.get_metrics_df(y_test, y_test_pred)
# Show the mean metrics
# Column-wise mean of each metrics frame, side by side for train and test.
metrics_df = pd.DataFrame(columns=['train', 'test'])
metrics_df['train'] = metrics_train.mean()
metrics_df['test'] = metrics_test.mean()
print('Mean metrics: \n{}\n{}'.format(metrics_df,'-'*70))
# Plot the metrics in time
# NOTE(review): the third argument (base_days + ahead_days) looks like the span
# of one sample in days -- confirm in ev.get_metrics_in_time.
metrics_train_time = ev.get_metrics_in_time(y_sub_df, y_train_pred, base_days + ahead_days)
metrics_test_time = ev.get_metrics_in_time(y_test, y_test_pred, base_days + ahead_days)
# First figure: element [0] of each result plotted against element [2].
plt.plot(metrics_train_time[2], metrics_train_time[0], label='train', marker='.')
plt.plot(metrics_test_time[2], metrics_test_time[0], label='test', marker='.')
plt.title('$r^2$ metrics')
plt.legend()
# Second figure: element [1] of each result plotted against element [2].
plt.figure()
plt.plot(metrics_train_time[2], metrics_train_time[1], label='train', marker='.')
plt.plot(metrics_test_time[2], metrics_test_time[1], label='test', marker='.')
plt.title('MRE metrics')
plt.legend()
Out[60]:
In [61]:
# Train the linear predictor and compare train vs. test metrics for ahead_days = 14.
from predictor.linear_predictor import LinearPredictor
import utils.misc as misc
import predictor.evaluation as ev
# Horizon evaluated in this cell; used below as the row label in best_params_df.
ahead_days = 14
# Get some parameters
# 'train_days' is read directly from the row of this horizon.
train_days = int(best_params_df.loc[ahead_days, 'train_days'])
# Unpack the remaining parameters from the same row. Note that this rebinds
# ahead_days to whatever misc.unpack_params returns in that position.
GOOD_DATA_RATIO, \
train_val_time, \
base_days, \
step_days, \
ahead_days, \
SAMPLES_GOOD_DATA_RATIO, \
x_filename, \
y_filename = misc.unpack_params(best_params_df.loc[ahead_days,:])
# Dataset identifier used in the pickle file names.
pid = 'base{}_ahead{}'.format(base_days, ahead_days)
# Get the datasets
x_train = pd.read_pickle('../../data/x_{}.pkl'.format(pid))
y_train = pd.read_pickle('../../data/y_{}.pkl'.format(pid))
# The test sets are sorted by index; the test targets are wrapped in a DataFrame.
x_test = pd.read_pickle('../../data/x_{}_test.pkl'.format(pid)).sort_index()
y_test = pd.DataFrame(pd.read_pickle('../../data/y_{}_test.pkl'.format(pid))).sort_index()
# Let's cut the training set to use only the required number of samples
# end_date is the last value of the first index level of x_train.
end_date = x_train.index.levels[0][-1]
# Presumably steps back train_days market days from end_date -- confirm in fe.add_market_days.
start_date = fe.add_market_days(end_date, -train_days)
# Keep the rows whose first index level is at or after start_date (any value on the second level).
x_sub_df = x_train.loc[(slice(start_date,None),slice(None)),:]
y_sub_df = pd.DataFrame(y_train.loc[(slice(start_date,None),slice(None))])
# Create the estimator and train
estimator = LinearPredictor()
estimator.fit(x_sub_df, y_sub_df)
# Get the training and test predictions
y_train_pred = estimator.predict(x_sub_df)
y_test_pred = estimator.predict(x_test)
# Get the training and test metrics for each symbol
metrics_train = ev.get_metrics_df(y_sub_df, y_train_pred)
metrics_test = ev.get_metrics_df(y_test, y_test_pred)
# Show the mean metrics
# Column-wise mean of each metrics frame, side by side for train and test.
metrics_df = pd.DataFrame(columns=['train', 'test'])
metrics_df['train'] = metrics_train.mean()
metrics_df['test'] = metrics_test.mean()
print('Mean metrics: \n{}\n{}'.format(metrics_df,'-'*70))
# Plot the metrics in time
# NOTE(review): the third argument (base_days + ahead_days) looks like the span
# of one sample in days -- confirm in ev.get_metrics_in_time.
metrics_train_time = ev.get_metrics_in_time(y_sub_df, y_train_pred, base_days + ahead_days)
metrics_test_time = ev.get_metrics_in_time(y_test, y_test_pred, base_days + ahead_days)
# First figure: element [0] of each result plotted against element [2].
plt.plot(metrics_train_time[2], metrics_train_time[0], label='train', marker='.')
plt.plot(metrics_test_time[2], metrics_test_time[0], label='test', marker='.')
plt.title('$r^2$ metrics')
plt.legend()
# Second figure: element [1] of each result plotted against element [2].
plt.figure()
plt.plot(metrics_train_time[2], metrics_train_time[1], label='train', marker='.')
plt.plot(metrics_test_time[2], metrics_test_time[1], label='test', marker='.')
plt.title('MRE metrics')
plt.legend()
Out[61]:
In [62]:
# Train the linear predictor and compare train vs. test metrics for ahead_days = 28.
from predictor.linear_predictor import LinearPredictor
import utils.misc as misc
import predictor.evaluation as ev
# Horizon evaluated in this cell; used below as the row label in best_params_df.
ahead_days = 28
# Get some parameters
# 'train_days' is read directly from the row of this horizon.
train_days = int(best_params_df.loc[ahead_days, 'train_days'])
# Unpack the remaining parameters from the same row. Note that this rebinds
# ahead_days to whatever misc.unpack_params returns in that position.
GOOD_DATA_RATIO, \
train_val_time, \
base_days, \
step_days, \
ahead_days, \
SAMPLES_GOOD_DATA_RATIO, \
x_filename, \
y_filename = misc.unpack_params(best_params_df.loc[ahead_days,:])
# Dataset identifier used in the pickle file names.
pid = 'base{}_ahead{}'.format(base_days, ahead_days)
# Get the datasets
x_train = pd.read_pickle('../../data/x_{}.pkl'.format(pid))
y_train = pd.read_pickle('../../data/y_{}.pkl'.format(pid))
# The test sets are sorted by index; the test targets are wrapped in a DataFrame.
x_test = pd.read_pickle('../../data/x_{}_test.pkl'.format(pid)).sort_index()
y_test = pd.DataFrame(pd.read_pickle('../../data/y_{}_test.pkl'.format(pid))).sort_index()
# Let's cut the training set to use only the required number of samples
# end_date is the last value of the first index level of x_train.
end_date = x_train.index.levels[0][-1]
# Presumably steps back train_days market days from end_date -- confirm in fe.add_market_days.
start_date = fe.add_market_days(end_date, -train_days)
# Keep the rows whose first index level is at or after start_date (any value on the second level).
x_sub_df = x_train.loc[(slice(start_date,None),slice(None)),:]
y_sub_df = pd.DataFrame(y_train.loc[(slice(start_date,None),slice(None))])
# Create the estimator and train
estimator = LinearPredictor()
estimator.fit(x_sub_df, y_sub_df)
# Get the training and test predictions
y_train_pred = estimator.predict(x_sub_df)
y_test_pred = estimator.predict(x_test)
# Get the training and test metrics for each symbol
metrics_train = ev.get_metrics_df(y_sub_df, y_train_pred)
metrics_test = ev.get_metrics_df(y_test, y_test_pred)
# Show the mean metrics
# Column-wise mean of each metrics frame, side by side for train and test.
metrics_df = pd.DataFrame(columns=['train', 'test'])
metrics_df['train'] = metrics_train.mean()
metrics_df['test'] = metrics_test.mean()
print('Mean metrics: \n{}\n{}'.format(metrics_df,'-'*70))
# Plot the metrics in time
# NOTE(review): the third argument (base_days + ahead_days) looks like the span
# of one sample in days -- confirm in ev.get_metrics_in_time.
metrics_train_time = ev.get_metrics_in_time(y_sub_df, y_train_pred, base_days + ahead_days)
metrics_test_time = ev.get_metrics_in_time(y_test, y_test_pred, base_days + ahead_days)
# First figure: element [0] of each result plotted against element [2].
plt.plot(metrics_train_time[2], metrics_train_time[0], label='train', marker='.')
plt.plot(metrics_test_time[2], metrics_test_time[0], label='test', marker='.')
plt.title('$r^2$ metrics')
plt.legend()
# Second figure: element [1] of each result plotted against element [2].
plt.figure()
plt.plot(metrics_train_time[2], metrics_train_time[1], label='train', marker='.')
plt.plot(metrics_test_time[2], metrics_test_time[1], label='test', marker='.')
plt.title('MRE metrics')
plt.legend()
Out[62]:
In [63]:
# Train the linear predictor and compare train vs. test metrics for ahead_days = 56.
from predictor.linear_predictor import LinearPredictor
import utils.misc as misc
import predictor.evaluation as ev
# Horizon evaluated in this cell; used below as the row label in best_params_df.
ahead_days = 56
# Get some parameters
# 'train_days' is read directly from the row of this horizon.
train_days = int(best_params_df.loc[ahead_days, 'train_days'])
# Unpack the remaining parameters from the same row. Note that this rebinds
# ahead_days to whatever misc.unpack_params returns in that position.
GOOD_DATA_RATIO, \
train_val_time, \
base_days, \
step_days, \
ahead_days, \
SAMPLES_GOOD_DATA_RATIO, \
x_filename, \
y_filename = misc.unpack_params(best_params_df.loc[ahead_days,:])
# Dataset identifier used in the pickle file names.
pid = 'base{}_ahead{}'.format(base_days, ahead_days)
# Get the datasets
x_train = pd.read_pickle('../../data/x_{}.pkl'.format(pid))
y_train = pd.read_pickle('../../data/y_{}.pkl'.format(pid))
# The test sets are sorted by index; the test targets are wrapped in a DataFrame.
x_test = pd.read_pickle('../../data/x_{}_test.pkl'.format(pid)).sort_index()
y_test = pd.DataFrame(pd.read_pickle('../../data/y_{}_test.pkl'.format(pid))).sort_index()
# Let's cut the training set to use only the required number of samples
# end_date is the last value of the first index level of x_train.
end_date = x_train.index.levels[0][-1]
# Presumably steps back train_days market days from end_date -- confirm in fe.add_market_days.
start_date = fe.add_market_days(end_date, -train_days)
# Keep the rows whose first index level is at or after start_date (any value on the second level).
x_sub_df = x_train.loc[(slice(start_date,None),slice(None)),:]
y_sub_df = pd.DataFrame(y_train.loc[(slice(start_date,None),slice(None))])
# Create the estimator and train
estimator = LinearPredictor()
estimator.fit(x_sub_df, y_sub_df)
# Get the training and test predictions
y_train_pred = estimator.predict(x_sub_df)
y_test_pred = estimator.predict(x_test)
# Get the training and test metrics for each symbol
metrics_train = ev.get_metrics_df(y_sub_df, y_train_pred)
metrics_test = ev.get_metrics_df(y_test, y_test_pred)
# Show the mean metrics
# Column-wise mean of each metrics frame, side by side for train and test.
metrics_df = pd.DataFrame(columns=['train', 'test'])
metrics_df['train'] = metrics_train.mean()
metrics_df['test'] = metrics_test.mean()
print('Mean metrics: \n{}\n{}'.format(metrics_df,'-'*70))
# Plot the metrics in time
# NOTE(review): the third argument (base_days + ahead_days) looks like the span
# of one sample in days -- confirm in ev.get_metrics_in_time.
metrics_train_time = ev.get_metrics_in_time(y_sub_df, y_train_pred, base_days + ahead_days)
metrics_test_time = ev.get_metrics_in_time(y_test, y_test_pred, base_days + ahead_days)
# First figure: element [0] of each result plotted against element [2].
plt.plot(metrics_train_time[2], metrics_train_time[0], label='train', marker='.')
plt.plot(metrics_test_time[2], metrics_test_time[0], label='test', marker='.')
plt.title('$r^2$ metrics')
plt.legend()
# Second figure: element [1] of each result plotted against element [2].
plt.figure()
plt.plot(metrics_train_time[2], metrics_train_time[1], label='train', marker='.')
plt.plot(metrics_test_time[2], metrics_test_time[1], label='test', marker='.')
plt.title('MRE metrics')
plt.legend()
Out[63]:
In [ ]: