In [90]:
# Basic imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt
import scipy.optimize as spo
import sys
from time import time
from sklearn.metrics import r2_score, median_absolute_error
# Notebook setup: render figures inline and enlarge the default figure size.
%matplotlib inline
# NOTE(review): `pylab` is never imported explicitly in this notebook; the name
# used on the next line is presumably injected by `%pylab inline` — confirm
# before removing the magic.
%pylab inline
pylab.rcParams['figure.figsize'] = (20.0, 10.0)
# Reload imported modules automatically so edits to project code are picked up.
%load_ext autoreload
%autoreload 2
# Add the directory two levels up to the import path so `utils` can be imported.
sys.path.append('../../')
from utils import preprocessing as pp
In [91]:
# Show the kernel's working directory; the relative '../../' paths used in this
# notebook resolve against it.
%pwd
Out[91]:
In [92]:
# Load the pickled train/validation data set.
data_df = pd.read_pickle('../../data/data_train_val_df.pkl')

# Make the project root importable and pull in the feature extraction module.
sys.path.append('../../')
import predictor.feature_extraction as fe

# Parameters handed to fe.generate_train_intervals.
train_time = -1  # In real time days
base_days = 7  # In market days
step_days = 7  # market days
ahead_days = 1  # market days
today = data_df.index[-1]  # Real date (last index entry of the data set)

# Build the samples (x) and targets (y), timing the call.
tic = time()
x, y = fe.generate_train_intervals(
    data_df, train_time, base_days, step_days, ahead_days, today,
    fe.feature_close_one_to_one)
toc = time()

print('Elapsed time: %i seconds.' % (toc-tic))
print(data_df.shape)
In [93]:
from predictor import evaluation as ev
from predictor.dummy_mean_predictor import DummyPredictor
In [94]:
# Instantiate the predictor evaluated in the rest of the notebook
# (DummyPredictor from predictor.dummy_mean_predictor).
predictor = DummyPredictor()
In [95]:
# Run a single evaluation of `predictor` on (x, y); the four returned objects
# are unpacked as true/predicted targets for the training and validation sets.
y_train_true_df, y_train_pred_df, y_val_true_df, y_val_pred_df = ev.run_single_val(x, y, ahead_days, predictor)
In [96]:
# Report the shape of each evaluation result, one per line.
print(y_train_true_df.shape,
      y_train_pred_df.shape,
      y_val_true_df.shape,
      y_val_pred_df.shape,
      sep='\n')
In [97]:
# Preview the first rows of the true training targets.
y_train_true_df.head()
Out[97]:
In [98]:
# Preview the first rows of the predicted training targets.
y_train_pred_df.head()
Out[98]:
In [99]:
# Preview the first rows of the true validation targets.
y_val_true_df.head()
Out[99]:
In [11]:
# Preview the first rows of the predicted validation targets.
y_val_pred_df.head()
Out[11]:
In [12]:
# Reshape the true training targets with ev.reshape_by_symbol, then report the
# resulting shape and preview the first rows.
y_train_true_rs = ev.reshape_by_symbol(y_train_true_df)
print(y_train_true_rs.shape)
y_train_true_rs.head()
Out[12]:
In [13]:
# Reshape the predicted training targets with ev.reshape_by_symbol, then report
# the resulting shape and preview the first rows.
y_train_pred_rs = ev.reshape_by_symbol(y_train_pred_df)
print(y_train_pred_rs.shape)
y_train_pred_rs.head()
Out[13]:
In [14]:
# Reshape the true validation targets with ev.reshape_by_symbol, then report the
# resulting shape and preview the first rows.
y_val_true_rs = ev.reshape_by_symbol(y_val_true_df)
print(y_val_true_rs.shape)
y_val_true_rs.head()
Out[14]:
In [15]:
# Take the first entry of the first index level of x and print it.
u = x.index.levels[0][0]
print(u)
In [16]:
# Unique index values of fe.SPY_DF after sorting by index.
fe.SPY_DF.sort_index().index.unique()
Out[16]:
In [17]:
# Unique index values of fe.SPY_DF (no sorting applied here).
# NOTE(review): presumably the set of market days — confirm against fe.
md = fe.SPY_DF.index.unique()
In [18]:
# Check whether `u` is one of the values in `md`.
u in md
Out[18]:
In [19]:
# Call fe.add_market_days with `u` and 6.
# NOTE(review): presumably returns the date 6 market days after `u` — verify in fe.
fe.add_market_days(u,6)
Out[19]:
In [101]:
# Threshold passed to pp.drop_irrelevant_symbols.
GOOD_DATA_RATIO = 0.99

# Load the pickled train/validation data set.
data_df = pd.read_pickle('../../data/data_train_val_df.pkl')

# Make the project root importable and pull in the project modules.
sys.path.append('../../')
import predictor.feature_extraction as fe
import utils.preprocessing as pp

# Filter the symbols before generating the samples.
data_df = pp.drop_irrelevant_symbols(data_df, GOOD_DATA_RATIO)

# Parameters handed to fe.generate_train_intervals.
train_time = -1  # In real time days
base_days = 7  # In market days
step_days = 7  # market days
ahead_days = 1  # market days
today = data_df.index[-1]  # Real date (last index entry of the data set)

# Build the samples (x) and targets (y), timing the call.
tic = time()
x, y = fe.generate_train_intervals(
    data_df, train_time, base_days, step_days, ahead_days, today,
    fe.feature_close_one_to_one)
toc = time()

print('Elapsed time: %i seconds.' % (toc-tic))
In [102]:
# Report the shape of the filtered data set and preview its first rows.
print(data_df.shape)
data_df.head()
Out[102]:
In [103]:
# Threshold passed to pp.drop_irrelevant_samples.
SAMPLES_GOOD_DATA_RATIO = 0.9

# Join features and target column-wise so pp.drop_irrelevant_samples filters
# them as a single frame.
x_y_df = pp.drop_irrelevant_samples(pd.concat([x, y], axis=1),
                                    SAMPLES_GOOD_DATA_RATIO)

# The last column is the target; every column before it is a feature.
# Only the features go through pp.fill_missing.
y = x_y_df.iloc[:, -1]
x = pp.fill_missing(x_y_df.iloc[:, :-1])
In [104]:
# Null count per column of the joined frame (taken before pp.fill_missing was
# applied to the features).
x_y_df.isnull().sum()
Out[104]:
In [105]:
# Total number of nulls left in the features after pp.fill_missing.
x.isnull().sum().sum()
Out[105]:
In [106]:
# Number of nulls in the target column.
y.isnull().sum()
Out[106]:
In [107]:
# Reshape the features with ev.reshape_by_symbol and preview the first rows.
x_reshaped = ev.reshape_by_symbol(x)
x_reshaped.head()
Out[107]:
In [108]:
# Largest per-column null count in the reshaped features.
x_reshaped.isnull().sum().max()
Out[108]:
In [109]:
# Shape of the features before reshaping.
x.shape
Out[109]:
In [110]:
# Shape of the features after ev.reshape_by_symbol.
x_reshaped.shape
Out[110]:
In [111]:
# Index the reshaped features with their own not-null mask.
# NOTE(review): if x_reshaped is a DataFrame, a frame-shaped boolean mask keeps
# the original shape (masked cells become NaN), so this does not drop anything —
# confirm this is the intended check.
x_reshaped[x_reshaped.notnull()]
Out[111]:
In [112]:
# Re-run the single evaluation on the filtered/filled (x, y); this rebinds the
# four result names assigned by the earlier ev.run_single_val call.
y_train_true_df, y_train_pred_df, y_val_true_df, y_val_pred_df = ev.run_single_val(x, y, ahead_days, predictor)
In [113]:
from sklearn.metrics import r2_score
# R^2 of the training predictions over all rows at once; 'raw_values' asks for
# one score per output column instead of a single averaged score.
r2_score(y_train_true_df, y_train_pred_df, multioutput='raw_values')
Out[113]:
In [117]:
# Values of the second index level of the training targets (the level the
# cells below select tickers from).
# NOTE(review): `.levels` can still list labels that no longer have rows after
# filtering — confirm every entry is actually present in the index.
tickers = y_train_true_df.index.levels[1]
tickers
Out[117]:
In [127]:
# Select every row whose second index level is 'AAPL', keeping all columns.
y_train_true_df.loc[(slice(None), 'AAPL'),:]
Out[127]:
In [128]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

# Per-ticker training metrics: one R^2 score and one mean absolute error per
# entry of `tickers`. The lists are rebuilt from scratch so the cell can be
# re-run safely.
r2_train_score = []
mae_train = []
for ticker in tickers:
    # Select the rows of the *current* ticker (second index level). The
    # previous version hard-coded 'AAPL' here, so every iteration scored the
    # same symbol and the averages below were just AAPL's metrics.
    y_true = y_train_true_df.loc[(slice(None), ticker), :]
    y_pred = y_train_pred_df.loc[(slice(None), ticker), :]
    r2_train_score.append(r2_score(y_true, y_pred))
    mae_train.append(mean_absolute_error(y_true, y_pred))
In [132]:
# Average of the R^2 scores collected in r2_train_score.
np.mean(r2_train_score)
Out[132]:
In [133]:
# Average of the mean absolute errors collected in mae_train.
np.mean(mae_train)
Out[133]:
In [ ]:
In [72]:
# Window parameters handed to ev.roll_evaluate.
train_days = 252
step_eval_days = 252

# Rolling evaluation of `predictor`. Note that this rebinds y_val_true_df and
# y_val_pred_df, replacing the frames produced by ev.run_single_val.
r2_train_means, r2_train_stds, y_val_true_df, y_val_pred_df = ev.roll_evaluate(
    x, y, train_days, step_eval_days, ahead_days, predictor, verbose=True)
In [ ]:
# Size summary of the rolling-evaluation results: number of R^2 means and
# standard deviations, then the shapes of the true and predicted validation
# frames.
print(len(r2_train_means))
print(len(r2_train_stds))
print(y_val_true_df.shape)
# Print the shape, not the frame itself: the previous version dumped the whole
# y_val_pred_df, unlike the three size checks above.
print(y_val_pred_df.shape)
In [27]:
# Plot the sequence of training R^2 means returned by ev.roll_evaluate.
plt.plot(r2_train_means)
Out[27]:
In [28]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error of the validation predictions from the rolling evaluation.
mae = mean_absolute_error(y_val_true_df, y_val_pred_df)
mae
In [ ]: