In [1]:
# Basic imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt
import scipy.optimize as spo
import sys
from time import time
from sklearn.metrics import r2_score, median_absolute_error
%matplotlib inline
%pylab inline
pylab.rcParams['figure.figsize'] = (20.0, 10.0)
%load_ext autoreload
%autoreload 2
sys.path.append('../../')
from sklearn.externals import joblib
In [2]:
best_params_df = pd.read_pickle('../../data/best_params_final_df.pkl')
best_params_df
Out[2]:
In [3]:
import predictor.feature_extraction as fe
from predictor.linear_predictor import LinearPredictor
import utils.misc as misc
import predictor.evaluation as ev
ahead_days = 1
# Get some parameters
train_days = int(best_params_df.loc[ahead_days, 'train_days'])
GOOD_DATA_RATIO, \
train_val_time, \
base_days, \
step_days, \
ahead_days, \
SAMPLES_GOOD_DATA_RATIO, \
x_filename, \
y_filename = misc.unpack_params(best_params_df.loc[ahead_days,:])
pid = 'base{}_ahead{}'.format(base_days, ahead_days)
# Get the datasets
x_train = pd.read_pickle('../../data/x_{}.pkl'.format(pid))
y_train = pd.read_pickle('../../data/y_{}.pkl'.format(pid))
x_test = pd.read_pickle('../../data/x_{}_test.pkl'.format(pid)).sort_index()
y_test = pd.DataFrame(pd.read_pickle('../../data/y_{}_test.pkl'.format(pid))).sort_index()
# Let's cut the training set to use only the required number of samples
end_date = x_train.index.levels[0][-1]
start_date = fe.add_market_days(end_date, -train_days)
x_sub_df = x_train.loc[(slice(start_date,None),slice(None)),:]
y_sub_df = pd.DataFrame(y_train.loc[(slice(start_date,None),slice(None))])
# Create the estimator and train
estimator = LinearPredictor()
estimator.fit(x_sub_df, y_sub_df)
# Get the training and test predictions
y_train_pred = estimator.predict(x_sub_df)
y_test_pred = estimator.predict(x_test)
# Get the training and test metrics for each symbol
metrics_train = ev.get_metrics_df(y_sub_df, y_train_pred)
metrics_test = ev.get_metrics_df(y_test, y_test_pred)
# Show the mean metrics
metrics_df = pd.DataFrame(columns=['train', 'test'])
metrics_df['train'] = metrics_train.mean()
metrics_df['test'] = metrics_test.mean()
print('Mean metrics: \n{}\n{}'.format(metrics_df,'-'*70))
# Plot the metrics in time
metrics_train_time = ev.get_metrics_in_time(y_sub_df, y_train_pred, base_days + ahead_days)
metrics_test_time = ev.get_metrics_in_time(y_test, y_test_pred, base_days + ahead_days)
plt.plot(metrics_train_time[2], metrics_train_time[0], label='train', marker='.')
plt.plot(metrics_test_time[2], metrics_test_time[0], label='test', marker='.')
plt.title('$r^2$ metrics')
plt.legend()
plt.figure()
plt.plot(metrics_train_time[2], metrics_train_time[1], label='train', marker='.')
plt.plot(metrics_test_time[2], metrics_test_time[1], label='test', marker='.')
plt.title('MRE metrics')
plt.legend()
Out[3]:
In [4]:
print('The first training day for the predictor is: {}.'.format(start_date))
In [5]:
print('The last training day for the predictor is: {}.'.format(fe.add_market_days(end_date, base_days)))
In [6]:
print('The testing data for the recommender')
total_data_test_df = pd.read_pickle('../../data/data_test_df.pkl').stack(level='feature')
total_data_test_df.head()
Out[6]:
In [7]:
print('The first TEST day for the recommender is: {}'.format(total_data_test_df.index[-0]))
In [8]:
joblib.dump(estimator, '../../data/best_predictor.pkl')
Out[8]:
Let's test the saved predictor... just in case.
In [9]:
estimator_reloaded = joblib.load('../../data/best_predictor.pkl')
# Get the training and test predictions
y_train_pred = estimator_reloaded.predict(x_sub_df)
y_test_pred = estimator_reloaded.predict(x_test)
# Get the training and test metrics for each symbol
metrics_train = ev.get_metrics_df(y_sub_df, y_train_pred)
metrics_test = ev.get_metrics_df(y_test, y_test_pred)
# Show the mean metrics
metrics_df = pd.DataFrame(columns=['train', 'test'])
metrics_df['train'] = metrics_train.mean()
metrics_df['test'] = metrics_test.mean()
print('Mean metrics: \n{}\n{}'.format(metrics_df,'-'*70))
# Plot the metrics in time
metrics_train_time = ev.get_metrics_in_time(y_sub_df, y_train_pred, base_days + ahead_days)
metrics_test_time = ev.get_metrics_in_time(y_test, y_test_pred, base_days + ahead_days)
plt.plot(metrics_train_time[2], metrics_train_time[0], label='train', marker='.')
plt.plot(metrics_test_time[2], metrics_test_time[0], label='test', marker='.')
plt.title('$r^2$ metrics')
plt.legend()
plt.figure()
plt.plot(metrics_train_time[2], metrics_train_time[1], label='train', marker='.')
plt.plot(metrics_test_time[2], metrics_test_time[1], label='test', marker='.')
plt.title('MRE metrics')
plt.legend()
Out[9]:
Looks good to me.
In [17]:
# Get the data
SYMBOL = 'SPY'
total_data_train_df = pd.read_pickle('../../data/data_train_val_df.pkl').stack(level='feature')
data_train_df = total_data_train_df[SYMBOL].unstack()[['Close', 'Volume']]
In [18]:
data_train_df.head()
Out[18]:
In [41]:
def generate_samples(data_df):
start_date = data_df.index[0]
close_sample = pd.DataFrame(data_df['Close'].values, columns=[start_date]).T
close_sample = close_sample / close_sample.iloc[0,0]
volume_sample = pd.DataFrame(data_df['Volume'].values, columns=[start_date]).T
volume_sample = volume_sample / volume_sample.iloc[0,0]
return close_sample, volume_sample
In [42]:
data_df = data_train_df[:112]
In [43]:
start_date = data_df.index[0]
close_sample = pd.DataFrame(data_df['Close'].values, columns=[start_date]).T
close_sample = close_sample / close_sample.iloc[0,0]
volume_sample = pd.DataFrame(data_df['Volume'].values, columns=[start_date]).T
volume_sample = volume_sample / volume_sample.iloc[0,0]
In [44]:
close_sample
Out[44]:
In [45]:
close_sample, volume_sample = generate_samples(data_df)
In [46]:
close_sample
Out[46]:
In [47]:
volume_sample
Out[47]:
In [155]:
history_df = data_train_df[:112]
In [156]:
estimator_close = joblib.load('../../data/best_predictor.pkl')
estimator_volume = joblib.load('../../data/best_volume_predictor.pkl')
In [157]:
h_history_df = history_df.copy()
In [167]:
def predict_one_step(h_history_df, keep=False):
close_sample, volume_sample = generate_samples(h_history_df)
estimated_close = estimator_close.predict(close_sample).iloc[0,0] * h_history_df['Close'].iloc[0]
estimated_volume = estimator_volume.predict(volume_sample).iloc[0,0] * h_history_df['Volume'].iloc[0]
predicted_date = fe.add_market_days(h_history_df.index[-1], 1)
h_history_df = h_history_df.drop(h_history_df.index[0])
h_history_df.loc[predicted_date,:] = {'Close': estimated_close,'Volume': estimated_volume}
return h_history_df
In [159]:
close_sample, volume_sample = generate_samples(h_history_df)
estimated_close = estimator_close.predict(close_sample).iloc[0,0] * h_history_df['Close'].iloc[0]
estimated_volume = estimator_volume.predict(volume_sample).iloc[0,0] * h_history_df['Volume'].iloc[0]
In [160]:
estimator_close.predict(close_sample).iloc[0,0]
Out[160]:
In [161]:
predicted_date = fe.add_market_days(h_history_df.index[-1], 1)
predicted_date
Out[161]:
In [162]:
history_df
Out[162]:
In [163]:
h_history_df = h_history_df.drop(h_history_df.index[0])
h_history_df.loc[predicted_date,:] = {'Close': estimated_close,'Volume': estimated_volume}
h_history_df
Out[163]:
In [164]:
h_history_df = history_df.copy()
for i in range(20):
h_history_df = predict_one_step(h_history_df.copy())
In [177]:
h_history_df = history_df.copy()
predicted_df = pd.DataFrame()
for i in range(112):
h_history_df = predict_one_step(h_history_df.copy())
predicted_df = predicted_df.append(h_history_df.iloc[-1])
In [178]:
predicted_df
Out[178]:
In [180]:
real_df = history_df.append(data_train_df[112:224])
plt.plot(real_df.index, real_df['Close'], 'b', label='real')
plt.plot(predicted_df.index, predicted_df['Close'], 'r', label='predicted')
plt.legend()
plt.show()
In [ ]:
In [ ]:
In [ ]: