In [38]:
# Basic imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt
import scipy.optimize as spo
import sys
from time import time
from sklearn.metrics import r2_score, median_absolute_error
%matplotlib inline
%pylab inline
pylab.rcParams['figure.figsize'] = (20.0, 10.0)
%load_ext autoreload
%autoreload 2
sys.path.append('../../')
from sklearn.externals import joblib
import utils.preprocessing as pp
import predictor.feature_extraction as fe
In [39]:
def generate_one_set(params):
# print(('-'*70 + '\n {}, {} \n' + '-'*70).format(params['base_days'].values, params['ahead_days'].values))
tic = time()
train_val_time = int(params['train_val_time'])
base_days = int(params['base_days'])
step_days = int(params['step_days'])
ahead_days = int(params['ahead_days'])
print('Generating: base{}_ahead{}'.format(base_days, ahead_days))
pid = 'base{}_ahead{}'.format(base_days, ahead_days)
# Getting the data
data_df = pd.read_pickle('../../data/data_train_val_df.pkl')
today = data_df.index[-1] # Real date
print(pid + ') data_df loaded')
# Drop symbols with many missing points
data_df = pp.drop_irrelevant_symbols(data_df, params['GOOD_DATA_RATIO'])
print(pid + ') Irrelevant symbols dropped.')
# Generate the intervals for the predictor
x, y = fe.generate_train_intervals(data_df,
train_val_time,
base_days,
step_days,
ahead_days,
today,
fe.feature_volume_one_to_one,
target_feature=fe.VOLUME_FEATURE)
print(pid + ') Intervals generated')
# Drop "bad" samples and fill missing data
x_y_df = pd.concat([x, y], axis=1)
x_y_df = pp.drop_irrelevant_samples(x_y_df, params['SAMPLES_GOOD_DATA_RATIO'])
x = x_y_df.iloc[:, :-1]
y = x_y_df.iloc[:, -1]
x = pp.fill_missing(x)
print(pid + ') Irrelevant samples dropped and missing data filled.')
# Pickle that
x.to_pickle('../../data/x_volume_{}.pkl'.format(pid))
y.to_pickle('../../data/y_volume_{}.pkl'.format(pid))
toc = time()
print('%s) %i intervals generated in: %i seconds.' % (pid, x.shape[0], (toc-tic)))
return pid, x, y
In [40]:
best_params_df = pd.read_pickle('../../data/best_params_final_df.pkl').loc[1,:]
to_drop = [
'model',
'mre',
'r2',
'x_filename',
'y_filename',
'train_days'
]
best_params_df.drop(to_drop, inplace=True)
best_params_df
Out[40]:
In [41]:
generate_one_set(best_params_df)
Out[41]:
In [42]:
x_volume = pd.read_pickle('../../data/x_volume_base112_ahead1.pkl')
print(x_volume.shape)
x_volume.head()
Out[42]:
In [43]:
y_volume = pd.read_pickle('../../data/y_volume_base112_ahead1.pkl')
print(y_volume.shape)
y_volume.head()
Out[43]:
In [45]:
def generate_one_test_set(params, data_df):
# print(('-'*70 + '\n {}, {} \n' + '-'*70).format(params['base_days'].values, params['ahead_days'].values))
tic = time()
train_val_time = int(params['train_val_time'])
base_days = int(params['base_days'])
step_days = int(params['step_days'])
ahead_days = int(params['ahead_days'])
print('Generating: base{}_ahead{}'.format(base_days, ahead_days))
pid = 'base{}_ahead{}'.format(base_days, ahead_days)
# Getting the data
today = data_df.index[-1] # Real date
print(pid + ') data_df loaded')
# Drop symbols with many missing points
y_train_df = pd.read_pickle('../../data/y_volume_{}.pkl'.format(pid))
kept_symbols = y_train_df.index.get_level_values(1).unique().tolist()
data_df = data_df.loc[:, (slice(None), kept_symbols)]
print(pid + ') Irrelevant symbols dropped.')
# Generate the intervals for the predictor
x, y = fe.generate_train_intervals(data_df,
train_val_time,
base_days,
step_days,
ahead_days,
today,
fe.feature_volume_one_to_one,
target_feature=fe.VOLUME_FEATURE)
print(pid + ') Intervals generated')
# Drop "bad" samples and fill missing data
x_y_df = pd.concat([x, y], axis=1)
x_y_df = pp.drop_irrelevant_samples(x_y_df, params['SAMPLES_GOOD_DATA_RATIO'])
x = x_y_df.iloc[:, :-1]
y = x_y_df.iloc[:, -1]
x = pp.fill_missing(x)
print(pid + ') Irrelevant samples dropped and missing data filled.')
# Pickle that
x.to_pickle('../../data/x_volume_{}_test.pkl'.format(pid))
y.to_pickle('../../data/y_volume_{}_test.pkl'.format(pid))
toc = time()
print('%s) %i intervals generated in: %i seconds.' % (pid, x.shape[0], (toc-tic)))
return pid, x,
In [46]:
data_test_df = pd.read_pickle('../../data/data_test_df.pkl')
generate_one_test_set(best_params_df, data_test_df)
Out[46]:
In [47]:
x_volume_test = pd.read_pickle('../../data/x_volume_base112_ahead1_test.pkl')
print(x_volume_test.shape)
x_volume_test.head()
Out[47]:
In [48]:
y_volume_test = pd.read_pickle('../../data/y_volume_base112_ahead1_test.pkl')
print(y_volume_test.shape)
y_volume_test.head()
Out[48]:
In [49]:
best_params_df = pd.read_pickle('../../data/best_params_final_df.pkl')
In [50]:
import predictor.feature_extraction as fe
from predictor.linear_predictor import LinearPredictor
import utils.misc as misc
import predictor.evaluation as ev
ahead_days = 1
# Get some parameters
train_days = int(best_params_df.loc[ahead_days, 'train_days'])
GOOD_DATA_RATIO, \
train_val_time, \
base_days, \
step_days, \
ahead_days, \
SAMPLES_GOOD_DATA_RATIO, \
x_filename, \
y_filename = misc.unpack_params(best_params_df.loc[ahead_days,:])
pid = 'base{}_ahead{}'.format(base_days, ahead_days)
# Get the datasets
x_train = pd.read_pickle('../../data/x_volume_{}.pkl'.format(pid))
y_train = pd.read_pickle('../../data/y_volume_{}.pkl'.format(pid))
x_test = pd.read_pickle('../../data/x_volume_{}_test.pkl'.format(pid)).sort_index()
y_test = pd.DataFrame(pd.read_pickle('../../data/y_volume_{}_test.pkl'.format(pid))).sort_index()
# Let's cut the training set to use only the required number of samples
end_date = x_train.index.levels[0][-1]
start_date = fe.add_market_days(end_date, -train_days)
x_sub_df = x_train.loc[(slice(start_date,None),slice(None)),:]
y_sub_df = pd.DataFrame(y_train.loc[(slice(start_date,None),slice(None))])
# Create the estimator and train
estimator = LinearPredictor()
estimator.fit(x_sub_df, y_sub_df)
# Get the training and test predictions
y_train_pred = estimator.predict(x_sub_df)
y_test_pred = estimator.predict(x_test)
# Get the training and test metrics for each symbol
metrics_train = ev.get_metrics_df(y_sub_df, y_train_pred)
metrics_test = ev.get_metrics_df(y_test, y_test_pred)
# Show the mean metrics
metrics_df = pd.DataFrame(columns=['train', 'test'])
metrics_df['train'] = metrics_train.mean()
metrics_df['test'] = metrics_test.mean()
print('Mean metrics: \n{}\n{}'.format(metrics_df,'-'*70))
# Plot the metrics in time
metrics_train_time = ev.get_metrics_in_time(y_sub_df, y_train_pred, base_days + ahead_days)
metrics_test_time = ev.get_metrics_in_time(y_test, y_test_pred, base_days + ahead_days)
plt.plot(metrics_train_time[2], metrics_train_time[0], label='train', marker='.')
plt.plot(metrics_test_time[2], metrics_test_time[0], label='test', marker='.')
plt.title('$r^2$ metrics')
plt.legend()
plt.figure()
plt.plot(metrics_train_time[2], metrics_train_time[1], label='train', marker='.')
plt.plot(metrics_test_time[2], metrics_test_time[1], label='test', marker='.')
plt.title('MRE metrics')
plt.legend()
Out[50]:
In [51]:
joblib.dump(estimator, '../../data/best_volume_predictor.pkl')
Out[51]:
In [ ]: